## Defining Data Quality SLAs
### Data Completeness
**Description**: Set an SLA that ensures that at least 95% of the data fields in your dataset are filled (non-null values). Practice by checking a dataset of your choice and calculating its completeness.

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import unittest

# -----------------------------
# SLA Check Functions
# -----------------------------

def check_data_completeness(df: pd.DataFrame, threshold: float = 0.95) -> bool:
    """Check whether the share of non-null cells in ``df`` meets ``threshold``.

    Args:
        df: DataFrame to inspect; every cell is weighted equally.
        threshold: Minimum acceptable non-null ratio (0.95 means 95%).

    Returns:
        True when the non-null ratio is at least ``threshold``. A DataFrame
        with no cells has no measurable ratio and is reported as failing.
    """
    total = df.size
    if total == 0:
        # Guard against 0/0: without cells there is nothing to measure.
        print("[Completeness] Non-null: n/a (empty DataFrame)")
        return False
    non_null = int(df.count().sum())
    completeness = non_null / total
    print(f"[Completeness] Non-null: {completeness:.2%}")
    # Plain bool so the return value matches the annotation.
    return bool(completeness >= threshold)

def check_data_timeliness(df: pd.DataFrame, column: str, max_age_hours: int = 24) -> bool:
    """Check whether every timestamp in ``column`` is at most ``max_age_hours`` old.

    Args:
        df: DataFrame holding the timestamp column.
        column: Name of the timestamp column to inspect.
        max_age_hours: Maximum allowed age of a row, in hours.

    Returns:
        True only when all rows are within the allowed age. Any error while
        evaluating the check (for example a missing column) is printed and
        reported as a failed check.
    """
    try:
        timestamps = df[column]
        # A tz-aware column cannot be compared with a naive datetime, so take
        # "now" in the column's own timezone (None keeps the naive behaviour).
        tz = getattr(timestamps.dtype, "tz", None)
        cutoff = datetime.now(tz) - timedelta(hours=max_age_hours)
        on_time_ratio = (timestamps >= cutoff).mean()
        print(f"[Timeliness] On-time data: {on_time_ratio:.2%}")
        return bool(on_time_ratio == 1.0)
    except Exception as e:
        # Best-effort check: report the problem and treat it as an SLA failure.
        print(f"Error in timeliness check: {e}")
        return False

def check_data_consistency(main_df: pd.DataFrame, ref_df: pd.DataFrame, key: str, threshold: float = 0.99) -> bool:
    """Check that enough ``key`` values of ``main_df`` also appear in ``ref_df``.

    Args:
        main_df: DataFrame whose key values are being validated.
        ref_df: Reference DataFrame providing the set of known key values.
        key: Column name present in both DataFrames.
        threshold: Minimum share of matching rows (0.99 means 99%).

    Returns:
        Whether the share of matching rows is at least ``threshold``. Any
        error raised during the check is printed and reported as False.
    """
    try:
        candidate_keys = main_df[key]
        known_keys = ref_df[key]
        # Boolean mask per row of main_df; its mean is the matching share.
        found_mask = candidate_keys.isin(known_keys)
        share_found = found_mask.mean()
        print(f"[Consistency] Matching keys: {share_found:.2%}")
        return share_found >= threshold
    except Exception as exc:
        print(f"Error in consistency check: {exc}")
        return False

# -----------------------------
# Data for Demo and Testing
# -----------------------------
def generate_test_data():
    """Build a small main/reference DataFrame pair with one flaw per check.

    The main frame has one null ``value``, one timestamp 25 hours old, and an
    ``id`` (5) that is absent from the reference frame.

    Returns:
        A ``(main_data, ref_data)`` tuple of DataFrames.
    """
    now = datetime.now()
    # Row ages in hours, all measured from the same instant; 25 is the stale row.
    ages_in_hours = (1, 2, 3, 25, 0)
    timestamps = [now - timedelta(hours=age) for age in ages_in_hours]

    main_data = pd.DataFrame({
        'id': [1, 2, 3, 4, 5],
        'value': [10, 20, None, 30, 40],
        'timestamp': timestamps,
    })
    # id 5 is deliberately left out of the reference set.
    ref_data = pd.DataFrame({'id': [1, 2, 3, 4]})
    return main_data, ref_data

# -----------------------------
# Unit Tests
# -----------------------------
class TestSLAFunctions(unittest.TestCase):
    """Pass and fail cases for each SLA check, driven by the generated demo data."""

    def setUp(self):
        # Regenerate the frames for every test so cases stay independent.
        main, ref = generate_test_data()
        self.main_df = main
        self.ref_df = ref

    def test_completeness_pass(self):
        # Filling the single null makes every cell non-null.
        filled = self.main_df.fillna(0)
        outcome = check_data_completeness(filled)
        self.assertTrue(outcome)

    def test_completeness_fail(self):
        # The untouched demo data contains one null cell.
        outcome = check_data_completeness(self.main_df)
        self.assertFalse(outcome)

    def test_timeliness_pass(self):
        # Stamp every row with the current time so nothing is stale.
        fresh = self.main_df.assign(timestamp=datetime.now())
        outcome = check_data_timeliness(fresh, 'timestamp')
        self.assertTrue(outcome)

    def test_timeliness_fail(self):
        # The demo data includes a row that is 25 hours old.
        outcome = check_data_timeliness(self.main_df, 'timestamp')
        self.assertFalse(outcome)

    def test_consistency_pass(self):
        # Drop the one id that the reference frame does not contain.
        known_only = self.main_df.loc[self.main_df['id'].ne(5)]
        outcome = check_data_consistency(known_only, self.ref_df, 'id')
        self.assertTrue(outcome)

    def test_consistency_fail(self):
        # id 5 has no match in the reference frame.
        outcome = check_data_consistency(self.main_df, self.ref_df, 'id')
        self.assertFalse(outcome)

# -----------------------------
# Run Demo and Tests
# -----------------------------
if __name__ == "__main__":
    # Part 1: run each SLA check once on the demo data; each check prints
    # its own measurement line.
    print("=== SLA Demo ===")
    main_df, ref_df = generate_test_data()

    check_data_completeness(main_df)
    check_data_timeliness(main_df, column='timestamp')
    check_data_consistency(main_df, ref_df, key='id')

    # Part 2: run the unit tests in-process. argv is overridden so unittest
    # does not parse the host's command-line arguments, and exit=False
    # stops it from calling sys.exit() when the run finishes.
    print("\n=== Running Unit Tests ===")
    unittest.main(argv=[''], exit=False)

......
----------------------------------------------------------------------
Ran 6 tests in 0.011s

OK


=== SLA Demo ===
[Completeness] Non-null: 93.33%
[Timeliness] On-time data: 80.00%
[Consistency] Matching keys: 80.00%

=== Running Unit Tests ===
[Completeness] Non-null: 93.33%
[Completeness] Non-null: 100.00%
[Consistency] Matching keys: 80.00%
[Consistency] Matching keys: 100.00%
[Timeliness] On-time data: 80.00%
[Timeliness] On-time data: 100.00%


In [4]:
# write your code from here

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# -----------------------------
# Generate example datasets
# -----------------------------

# Dataset with some missing values.
# All timestamps are offsets from one reference instant, so the row ages are
# exact instead of drifting by the time elapsed between separate now() calls.
_reference_time = datetime.now()
data_main = pd.DataFrame({
    'id': [1, 2, 3, 4, 5],
    'value': [10, 15, np.nan, 30, 25],
    'timestamp': [
        _reference_time - timedelta(hours=2),
        _reference_time - timedelta(hours=3),
        _reference_time - timedelta(hours=25),  # SLA violation
        _reference_time - timedelta(hours=1),
        _reference_time,
    ],
})

# Related dataset to check consistency (foreign key reference)
data_ref = pd.DataFrame({
    'id': [1, 2, 3, 4],  # missing ID 5
})


# -----------------------------
# SLA 1: Data Completeness
# -----------------------------
def check_completeness(df, threshold=0.95):
    """
    SLA: at least `threshold` (as a fraction, e.g. 0.95 for 95%) of the values
    in the dataset must be non-null.

    Returns True when the SLA is met. A dataset with no cells has no
    measurable ratio and is reported as not meeting the SLA.
    """
    total_values = df.size
    if total_values == 0:
        # Guard against 0/0: without cells there is nothing to measure.
        print("[Completeness Check] Non-null ratio: n/a (empty DataFrame)")
        return False
    non_null_values = int(df.count().sum())
    completeness_ratio = non_null_values / total_values
    print(f"[Completeness Check] Non-null ratio: {completeness_ratio:.2%}")
    # Plain bool rather than a NumPy scalar.
    return bool(completeness_ratio >= threshold)


# -----------------------------
# SLA 2: Data Timeliness
# -----------------------------
def check_timeliness(df, timestamp_column='timestamp', max_age_hours=24):
    """
    SLA: all rows must have timestamps within the last `max_age_hours`.

    Returns True only when every row is recent enough. Any error raised while
    evaluating the check (for example a missing column) is printed and
    reported as a failed SLA.
    """
    try:
        timestamps = df[timestamp_column]
        # A tz-aware column cannot be compared with a naive datetime, so take
        # "now" in the column's own timezone (None keeps the naive behaviour).
        tz = getattr(timestamps.dtype, "tz", None)
        now = datetime.now(tz)
        max_allowed_time = now - timedelta(hours=max_age_hours)
        recent_data_ratio = (timestamps >= max_allowed_time).mean()
        print(f"[Timeliness Check] % data within {max_age_hours} hours: {recent_data_ratio:.2%}")
        return bool(recent_data_ratio == 1.0)  # SLA: all must be timely
    except Exception as e:
        # Best-effort check: report the problem and treat it as an SLA failure.
        print(f"Timeliness check error: {e}")
        return False


# -----------------------------
# SLA 3: Data Consistency
# -----------------------------
def check_consistency(df_main, df_ref, key='id', threshold=0.99):
    """
    SLA: the share of `key` values in `df_main` that also exist in `df_ref`
    must be at least `threshold` (a fraction, e.g. 0.99 for 99%).

    Any error raised during the check is printed and reported as False.
    """
    try:
        foreign_keys = df_main[key]
        reference_keys = df_ref[key]
        # Boolean mask per row of df_main; its mean is the matching share.
        is_known = foreign_keys.isin(reference_keys)
        share_matched = is_known.mean()
        print(f"[Consistency Check] Key match ratio: {share_matched:.2%}")
        return share_matched >= threshold
    except Exception as exc:
        print(f"Consistency check error: {exc}")
        return False


# -----------------------------
# Run All SLA Checks
# -----------------------------
print("=== Data Quality SLA Checks ===")

# Each check prints its own measurement; the verdicts are kept so the
# summary below can report them together.
sla_completeness = check_completeness(df=data_main)
sla_timeliness = check_timeliness(data_main, timestamp_column='timestamp')
sla_consistency = check_consistency(data_main, data_ref, key='id')

# Summary: one Yes/No line per SLA.
print("\n=== SLA Results ===")
print(f"✅ Completeness SLA Passed? {'Yes' if sla_completeness else 'No'}")
print(f"✅ Timeliness SLA Passed? {'Yes' if sla_timeliness else 'No'}")
print(f"✅ Consistency SLA Passed? {'Yes' if sla_consistency else 'No'}")

=== Data Quality SLA Checks ===
[Completeness Check] Non-null ratio: 93.33%
[Timeliness Check] % data within 24 hours: 80.00%
[Consistency Check] Key match ratio: 80.00%

=== SLA Results ===
✅ Completeness SLA Passed? No
✅ Timeliness SLA Passed? No
✅ Consistency SLA Passed? No


### Data Timeliness:
**Description**: Establish an SLA that specifies that data should be integrated and processed within 24 hours of acquisition. Monitor the data pipeline for timeliness.

In [5]:
# write your code from here

### Data Consistency:
**Description**: Define an SLA for maintaining consistency across various related datasets. Implement a check to ensure that 99% of data entries are consistent.

In [6]:
# write your code from here