## Real-World Case Studies

### Healthcare - Medical Prediction Errors:
**Description**: Implement validation rules using a healthcare dataset to reduce errors in
predictive models by automating data quality checks.

In [1]:

# write your code from here

import pandas as pd
import numpy as np
import unittest

# ----------------------------
# Simulate Healthcare Dataset
# ----------------------------
def generate_healthcare_data(n=500, include_errors=False):
    np.random.seed(42)
    df = pd.DataFrame({
        'patient_id': range(1, n + 1),
        'age': np.random.randint(0, 100, size=n),
        'systolic_bp': np.random.randint(90, 180, size=n),  # mmHg
        'diastolic_bp': np.random.randint(60, 120, size=n),
        'bmi': np.round(np.random.normal(25, 4, size=n), 1),
        'diagnosis': np.random.choice(['Diabetes', 'Hypertension', 'None'], size=n),
        'readmitted': np.random.choice([0, 1], size=n, p=[0.85, 0.15])
    })

    if include_errors:
        # Add invalid ages
        df.loc[0:5, 'age'] = [-5, 130, 999, -1, 200, None]
        # Add invalid blood pressures
        df.loc[6:8, 'systolic_bp'] = [300, None, 40]
        # Add invalid BMI
        df.loc[9:10, 'bmi'] = [-10, 80]
        # Add missing diagnosis
        df.loc[11:13, 'diagnosis'] = None
        # Add invalid readmission labels
        df.loc[14:15, 'readmitted'] = [2, -1]
    return df

# ----------------------------
# Data Validation Rules
# ----------------------------

def validate_age(df):
    invalid = df[(df['age'].isnull()) | (df['age'] < 0) | (df['age'] > 120)]
    print(f"[Validation] Invalid Ages: {len(invalid)}")
    return len(invalid) == 0

def validate_blood_pressure(df):
    systolic_invalid = df[(df['systolic_bp'].isnull()) | (df['systolic_bp'] < 70) | (df['systolic_bp'] > 250)]
    diastolic_invalid = df[(df['diastolic_bp'].isnull()) | (df['diastolic_bp'] < 40) | (df['diastolic_bp'] > 150)]
    print(f"[Validation] Invalid Systolic BP: {len(systolic_invalid)}")
    print(f"[Validation] Invalid Diastolic BP: {len(diastolic_invalid)}")
    return len(systolic_invalid) == 0 and len(diastolic_invalid) == 0

def validate_bmi(df):
    invalid = df[(df['bmi'].isnull()) | (df['bmi'] < 10) | (df['bmi'] > 60)]
    print(f"[Validation] Invalid BMI: {len(invalid)}")
    return len(invalid) == 0

def validate_diagnosis(df):
    valid_values = {'Diabetes', 'Hypertension', 'None'}
    invalid = df[df['diagnosis'].isnull() | ~df['diagnosis'].isin(valid_values)]
    print(f"[Validation] Invalid Diagnosis Entries: {len(invalid)}")
    return len(invalid) == 0

def validate_readmitted_label(df):
    invalid = df[~df['readmitted'].isin([0, 1])]
    print(f"[Validation] Invalid Readmission Labels: {len(invalid)}")
    return len(invalid) == 0

# ----------------------------
# Master Validator
# ----------------------------

def validate_healthcare_data(df):
    results = {
        'age': validate_age(df),
        'blood_pressure': validate_blood_pressure(df),
        'bmi': validate_bmi(df),
        'diagnosis': validate_diagnosis(df),
        'readmitted': validate_readmitted_label(df),
    }

    all_passed = all(results.values())
    print("\n[Summary] All Validations Passed?" , "✅ Yes" if all_passed else "❌ No")
    return results, all_passed

# ----------------------------
# Run the Validation Pipeline
# ----------------------------

if __name__ == "__main__":
    print("=== Running Healthcare Data Validation ===\n")
    df = generate_healthcare_data(include_errors=True)
    results, all_ok = validate_healthcare_data(df)

# ----------------------------
# Unit Tests for Reliability
# ----------------------------

class TestHealthcareValidation(unittest.TestCase):

    def test_all_valid(self):
        df = generate_healthcare_data(include_errors=False)
        results, all_ok = validate_healthcare_data(df)
        self.assertTrue(all_ok)

    def test_invalid_ages(self):
        df = generate_healthcare_data(include_errors=True)
        self.assertFalse(validate_age(df))

    def test_invalid_bmi(self):
        df = generate_healthcare_data(include_errors=True)
        self.assertFalse(validate_bmi(df))

    def test_invalid_readmitted(self):
        df = generate_healthcare_data(include_errors=True)
        self.assertFalse(validate_readmitted_label(df))

if __name__ == '__main__':
    print("\n=== Running Unit Tests ===")
    unittest.main(argv=[''], exit=False)

  df.loc[0:5, 'age'] = [-5, 130, 999, -1, 200, None]
  df.loc[6:8, 'systolic_bp'] = [300, None, 40]
  df.loc[0:5, 'age'] = [-5, 130, 999, -1, 200, None]
  df.loc[6:8, 'systolic_bp'] = [300, None, 40]
  df.loc[0:5, 'age'] = [-5, 130, 999, -1, 200, None]
  df.loc[6:8, 'systolic_bp'] = [300, None, 40]
  df.loc[0:5, 'age'] = [-5, 130, 999, -1, 200, None]
  df.loc[6:8, 'systolic_bp'] = [300, None, 40]
.
----------------------------------------------------------------------
Ran 4 tests in 0.016s

OK


=== Running Healthcare Data Validation ===

[Validation] Invalid Ages: 6
[Validation] Invalid Systolic BP: 3
[Validation] Invalid Diastolic BP: 0
[Validation] Invalid BMI: 2
[Validation] Invalid Diagnosis Entries: 3
[Validation] Invalid Readmission Labels: 2

[Summary] All Validations Passed? ❌ No

=== Running Unit Tests ===
[Validation] Invalid Ages: 0
[Validation] Invalid Systolic BP: 0
[Validation] Invalid Diastolic BP: 0
[Validation] Invalid BMI: 0
[Validation] Invalid Diagnosis Entries: 0
[Validation] Invalid Readmission Labels: 0

[Summary] All Validations Passed? ✅ Yes
[Validation] Invalid Ages: 6
[Validation] Invalid BMI: 2
[Validation] Invalid Readmission Labels: 2
