### Define Data Quality KPIs

**Task 1**: Identify Relevant KPIs

**Objective**: Develop KPIs that align with organizational goals.

**Steps**:
1. Choose a dataset from a domain of your interest (e.g., sales data, healthcare records, or transaction logs).
2. Identify three KPIs that would be crucial for assessing the data quality in your chosen dataset. Consider accuracy, completeness, and timeliness.
3. Document why each KPI is important for maintaining high-quality data in your given context.

In [1]:
# Write your code from here

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dash import Dash, dcc, html
import plotly.graph_objs as go

# ----------- Simulate Sales Dataset -------------

np.random.seed(42)

def generate_sample_sales_data(num_records=100):
    base_date = datetime.today()
    data = {
        "OrderID": np.arange(1, num_records + 1),
        "CustomerID": np.random.randint(1000, 1020, size=num_records),
        "OrderDate": [base_date - timedelta(days=np.random.randint(0, 30)) for _ in range(num_records)],
        "DeliveryDate": [],
        "Amount": np.round(np.random.uniform(10, 500, num_records), 2),
        "IsDelivered": np.random.choice([True, False], size=num_records, p=[0.85, 0.15])
    }

    # Add DeliveryDate or None depending on IsDelivered
    for i in range(num_records):
        if data["IsDelivered"][i]:
            delivery_delay = np.random.randint(1, 7)  # days delay
            data["DeliveryDate"].append(data["OrderDate"][i] + timedelta(days=delivery_delay))
        else:
            data["DeliveryDate"].append(None)

    # Introduce some missing Amount values for completeness KPI
    missing_indices = np.random.choice(num_records, size=5, replace=False)
    for idx in missing_indices:
        data["Amount"][idx] = None

    # Introduce some inaccurate amounts (negative values) for accuracy KPI
    inaccurate_indices = np.random.choice(num_records, size=3, replace=False)
    for idx in inaccurate_indices:
        data["Amount"][idx] = -abs(data["Amount"][idx]) if data["Amount"][idx] is not None else None

    return pd.DataFrame(data)

df = generate_sample_sales_data()

# -------------- Define KPIs ---------------------

def completeness(df, column):
    total = len(df)
    non_null = df[column].notnull().sum()
    return round(non_null / total * 100, 2)

def accuracy_amount(df):
    # Accuracy: % of non-negative amounts
    valid_amounts = df["Amount"].dropna()
    accurate = valid_amounts[valid_amounts >= 0].count()
    return round(accurate / len(valid_amounts) * 100, 2) if len(valid_amounts) > 0 else 0

def timeliness(df):
    # Timeliness: % of delivered orders delivered within 5 days of order date
    delivered = df[df["IsDelivered"] & df["DeliveryDate"].notnull()]
    if len(delivered) == 0:
        return 0
    on_time = delivered[(delivered["DeliveryDate"] - delivered["OrderDate"]).dt.days <= 5]
    return round(len(on_time) / len(delivered) * 100, 2)

# Calculate KPI values
kpi_values = {
    "Completeness (Amount)": completeness(df, "Amount"),
    "Accuracy (Amount)": accuracy_amount(df),
    "Timeliness (Delivery)": timeliness(df),
}

print("KPI Values:")
for k, v in kpi_values.items():
    print(f"{k}: {v}%")

# ----------- Build Dash KPI Dashboard -------------

app = Dash(__name__)

app.layout = html.Div([
    html.H2("Sales Data Quality KPIs Dashboard"),
    html.Div([
        html.Div([
            html.H4("Completeness (Amount)"),
            html.P(f"{kpi_values['Completeness (Amount)']}%", style={"fontSize": "24px", "fontWeight": "bold"})
        ], style={"width": "30%", "display": "inline-block", "padding": "10px", "border": "1px solid #ccc", "margin": "10px"}),

        html.Div([
            html.H4("Accuracy (Amount)"),
            html.P(f"{kpi_values['Accuracy (Amount)']}%", style={"fontSize": "24px", "fontWeight": "bold"})
        ], style={"width": "30%", "display": "inline-block", "padding": "10px", "border": "1px solid #ccc", "margin": "10px"}),

        html.Div([
            html.H4("Timeliness (Delivery)"),
            html.P(f"{kpi_values['Timeliness (Delivery)']}%", style={"fontSize": "24px", "fontWeight": "bold"})
        ], style={"width": "30%", "display": "inline-block", "padding": "10px", "border": "1px solid #ccc", "margin": "10px"}),
    ]),
    dcc.Graph(
        id='kpi-bar-chart',
        figure={
            "data": [
                go.Bar(
                    x=list(kpi_values.keys()),
                    y=list(kpi_values.values()),
                    text=[f"{v}%" for v in kpi_values.values()],
                    textposition='auto',
                    marker_color=['#1f77b4', '#ff7f0e', '#2ca02c']
                )
            ],
            "layout": go.Layout(
                title="KPI Scores",
                yaxis=dict(range=[0, 110]),
                margin=dict(l=40, r=40, t=40, b=40)
            )
        }
    )
])

if __name__ == '__main__':
    app.run_server(debug=True)

KPI Values:
Completeness (Amount): 95.0%
Accuracy (Amount): 96.84%
Timeliness (Delivery): 93.9%


ObsoleteAttributeException: app.run_server has been replaced by app.run

**Task 2**: Develop a KPI Dashboard

**Objective**: Visualize your KPIs for better monitoring.

**Steps**:
1. Use a tool like Excel or a BI tool (e.g., Tableau, Power BI) to create a simple dashboard.
2. Input sample data and visualize your chosen KPIs, showing how they would be monitored.
3. Share your dashboard with peers and gather feedback on KPI relevance and clarity.

In [None]:
# Write your code from here

pip install pandas plotly dash
