In [None]:
# Notebook: automated_data_quality_checks_realistic.ipynb
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Build a synthetic claims dataset. All values are random draws, so the
# exact contents differ from run to run.
num_claims = 25000
claim_ids = ["C" + str(n) for n in range(200000, 200000 + num_claims)]
patient_ids = ["P" + str(n) for n in range(100000, 110000)]  # pool of 10,000 patients

# Every claim gets a patient and a procedure sampled from the pools above.
claim_patient_ids = np.random.choice(patient_ids, size=num_claims)
procedure_codes = np.random.choice(
    ['PROC001', 'PROC002', 'PROC003', 'PROC004', 'PROC005'],
    size=num_claims,
)

# The lower bound of -50 deliberately lets a few negative amounts through so
# the quality checks have an anomaly to find.
claim_amounts = np.random.uniform(-50, 20000, num_claims).round(2)

# Claim dates fall between today and 1825 days (about five years) ago.
claim_dates = [
    datetime.today() - timedelta(days=age_in_days)
    for age_in_days in (random.randint(0, 1825) for _ in range(num_claims))
]
provider_ids = ["PR" + str(random.randint(100, 399)) for _ in range(num_claims)]

# Keyword order fixes the column order of the resulting frame.
claims_df = pd.DataFrame(dict(
    claim_id=claim_ids,
    patient_id=claim_patient_ids,
    procedure_code=procedure_codes,
    claim_amount=claim_amounts,
    provider_id=provider_ids,
    claim_date=claim_dates,
))

# Automated Quality Check Function
def run_quality_checks(df, *, high_amount_threshold=15000):
    """Summarise basic data-quality indicators for a claims DataFrame.

    Args:
        df: DataFrame containing a ``claim_amount`` column that can be
            compared against numbers.
        high_amount_threshold: Claims with an amount strictly greater than
            this value are counted as unusually high. Defaults to 15000,
            which yields the report key ``claims_above_15000``.

    Returns:
        A dict with these entries (all counts are plain Python ints, so the
        report can be serialised with ``json`` as-is):

        * ``missing_values``: mapping of column name to number of nulls.
        * ``duplicate_rows``: number of rows that fully repeat an earlier row.
        * ``negative_claim_amounts``: number of rows with ``claim_amount < 0``.
        * ``claims_above_<high_amount_threshold>``: number of rows with
          ``claim_amount`` above the threshold.

    Raises:
        KeyError: If ``df`` has no ``claim_amount`` column.
    """
    amounts = df['claim_amount']
    # Summing the boolean masks counts matching rows without materialising a
    # filtered copy of the frame; int() strips the numpy scalar wrapper.
    return {
        'missing_values': {
            column: int(null_count)
            for column, null_count in df.isnull().sum().items()
        },
        'duplicate_rows': int(df.duplicated().sum()),
        'negative_claim_amounts': int((amounts < 0).sum()),
        f'claims_above_{high_amount_threshold}': int(
            (amounts > high_amount_threshold).sum()
        ),
    }

# Run the checks on the generated claims and print one "name: value" line
# per report entry under a banner.
quality_report = run_quality_checks(claims_df)
print("--- Automated Claims Data Quality Report ---")
for check_name, check_result in quality_report.items():
    print(f"{check_name}: {check_result}")


--- Automated Claims Data Quality Report ---
missing_values: {'claim_id': 0, 'patient_id': 0, 'procedure_code': 0, 'claim_amount': 0, 'provider_id': 0, 'claim_date': 0}
duplicate_rows: 0
negative_claim_amounts: 47
claims_above_15000: 6182
