In [21]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import List, Dict, Any
import pytest

In [22]:
# generating sample insurance claims data w/ intentional quality issues
np.random.seed(42)

n_records = 100

In [23]:
# lets generate dates
start_date = datetime(2023, 1, 1)
claim_dates = [start_date + timedelta(days=int(x)) for x in np.random.randint(0, 365, n_records)]

In [24]:
# creating sample data w/ some intentional issues for testing

data = {
    'claim_id': [f'CLM{str(i).zfill(6)}' for i in range(1, n_records + 1)],
    'policy_id': [f'POL{str(i).zfill(5)}' for i in np.random.randint(1000, 9999, n_records)],
    'claim_date': claim_dates,
    'claim_amount': np.random.uniform(100, 50000, n_records),
    'claim_type': np.random.choice(['Life', 'Disability', 'Accidental Death'], n_records),
    'policy_holder_age': np.random.randint(18, 85, n_records),
    'claim_status': np.random.choice(['Pending', 'Approved', 'Denied', 'Under Review'], n_records)
}

df = pd.DataFrame(data)

In [None]:
# introduce some data quality issues for testing to catch just to make it fun:

# a. missing values
df.loc[5, 'policy_id'] = None
df.loc[12, 'claim_amount'] = None

# b. invalid amounts (negative or extremely high)
df.loc[8, 'claim_amount'] = -500
df.loc[15, 'claim_amount'] = 2000000  # suspiciously high here

# c. future dates
df.loc[20, 'claim_date'] = datetime.now() + timedelta(days=30)

# d. invalid age
df.loc[25, 'policy_holder_age'] = 150

# e. duplicate claim IDs
df.loc[30, 'claim_id'] = df.loc[10, 'claim_id']

In [26]:
# save to CSV
df.to_csv('sample_claims_data.csv', index=False)
print("Sample claims data generated: sample_claims_data.csv")
print(f"Total records: {len(df)}")
print("\nIntentional data quality issues introduced:")
print("- 2 missing critical values")
print("- 1 negative claim amount")
print("- 1 suspiciously high claim amount")
print("- 1 future claim date")
print("- 1 invalid age")
print("- 1 duplicate claim ID")

Sample claims data generated: sample_claims_data.csv
Total records: 100

Intentional data quality issues introduced:
- 2 missing critical values
- 1 negative claim amount
- 1 suspiciously high claim amount
- 1 future claim date
- 1 invalid age
- 1 duplicate claim ID


# now we can test our framework

In [None]:
class ClaimsDataValidator:
    """
    Validates insurance claims data for quality, consistency, and business rules.
    """
    
    def __init__(self, filepath: str):
        """Initialize validator with claims data"""
        self.df = pd.read_csv(filepath, parse_dates=['claim_date'])
        self.validation_results = []
    
    def run_all_validations(self) -> Dict[str, Any]:
        """Execute all validation tests and return summary"""
        tests = [
            self.test_no_missing_critical_fields,
            self.test_no_duplicate_claim_ids,
            self.test_claim_amount_validity,
            self.test_date_validity,
            self.test_age_validity,
            self.test_claim_type_validity,
            self.test_referential_integrity
        ]
        
        results = {
            'total_records': len(self.df),
            'tests_run': len(tests),
            'tests_passed': 0,
            'tests_failed': 0,
            'issues_found': []
        }
        
        for test in tests:
            try:
                test()
                results['tests_passed'] += 1
            except AssertionError as e:
                results['tests_failed'] += 1
                results['issues_found'].append({
                    'test': test.__name__,
                    'error': str(e)
                })
        
        return results
    
    def test_no_missing_critical_fields(self):
        """Critical fields must never be null"""
        critical_fields = ['claim_id', 'policy_id', 'claim_amount', 'claim_date']
        
        for field in critical_fields:
            missing_count = self.df[field].isna().sum()
            assert missing_count == 0, \
                f"Found {missing_count} missing values in critical field '{field}'"
    
    def test_no_duplicate_claim_ids(self):
        """Claim IDs must be unique"""
        duplicates = self.df[self.df.duplicated(subset=['claim_id'], keep=False)]
        assert len(duplicates) == 0, \
            f"Found {len(duplicates)} duplicate claim IDs: {duplicates['claim_id'].unique().tolist()}"
    
    def test_claim_amount_validity(self):
        """Claim amounts must be positive and within reasonable range"""
        # check for negative amounts
        negative = self.df[self.df['claim_amount'] < 0]
        assert len(negative) == 0, \
            f"Found {len(negative)} negative claim amounts"
        
        # check for suspiciously high amounts (over $1M for life insurance)
        # this threshold would be configurable based on business 
        high_threshold = 1_000_000
        suspicious = self.df[self.df['claim_amount'] > high_threshold]
        assert len(suspicious) == 0, \
            f"Found {len(suspicious)} claims exceeding ${high_threshold:,} - requires review"
    
    def test_date_validity(self):
        """Claim dates must be valid and not in the future"""
        today = pd.Timestamp(datetime.now().date())
        
        # check future dates
        future_dates = self.df[self.df['claim_date'] > today]
        assert len(future_dates) == 0, \
            f"Found {len(future_dates)} claims with future dates"
        
        # check unreasonably old claims (e.g., >10 years)
        ten_years_ago = today - pd.Timedelta(days=3650)
        old_claims = self.df[self.df['claim_date'] < ten_years_ago]
        if len(old_claims) > 0:
            print(f"Warning: {len(old_claims)} claims older than 10 years")
    
    def test_age_validity(self):
        """Policy holder age must be realistic"""
        invalid_ages = self.df[
            (self.df['policy_holder_age'] < 0) | 
            (self.df['policy_holder_age'] > 120)
        ]
        assert len(invalid_ages) == 0, \
            f"Found {len(invalid_ages)} records with invalid ages"
    
    def test_claim_type_validity(self):
        """Claim type must be from approved list"""
        valid_types = ['Life', 'Disability', 'Accidental Death']
        invalid = self.df[~self.df['claim_type'].isin(valid_types)]
        assert len(invalid) == 0, \
            f"Found {len(invalid)} claims with invalid claim types"
    
    def test_referential_integrity(self):
        """Policy IDs should follow expected format"""
        # in real scenario, would validate against policy database
        # here we check format: POL + 5 digits
        invalid_format = self.df[~self.df['policy_id'].astype(str).str.match(r'^POL\d{5}$')]
        assert len(invalid_format) == 0, \
            f"Found {len(invalid_format)} policy IDs with invalid format"
    
    def generate_report(self) -> str:
        """Generate human-readable validation report"""
        results = self.run_all_validations()
        
        report = f"""

     INSURANCE CLAIMS DATA VALIDATION REPORT                

Dataset: {results['total_records']} records processed
Tests Run: {results['tests_run']}
✓ Passed: {results['tests_passed']}
✗ Failed: {results['tests_failed']}

"""
        if results['issues_found']:
            report += "ISSUES DETECTED:\n"
            report += "─" * 60 + "\n"
            for idx, issue in enumerate(results['issues_found'], 1):
                report += f"{idx}. {issue['test']}\n"
                report += f"   Error: {issue['error']}\n\n"
        else:
            report += "✓ All validation tests passed!\n"
        
        report += "═" * 60 + "\n"
        return report


# pytest test cases 
@pytest.fixture
def validator():
    """Fixture to create validator instance"""
    return ClaimsDataValidator('sample_claims_data.csv')


def test_critical_fields_complete(validator):
    """Test that critical fields have no missing values"""
    validator.test_no_missing_critical_fields()


def test_unique_claim_ids(validator):
    """Test that all claim IDs are unique"""
    validator.test_no_duplicate_claim_ids()


def test_valid_claim_amounts(validator):
    """Test that claim amounts are valid"""
    validator.test_claim_amount_validity()


def test_valid_dates(validator):
    """Test that dates are valid and not in future"""
    validator.test_date_validity()


def test_valid_ages(validator):
    """Test that ages are within realistic range"""
    validator.test_age_validity()


def test_valid_claim_types(validator):
    """Test that claim types are from approved list"""
    validator.test_claim_type_validity()


def test_policy_id_format(validator):
    """Test that policy IDs follow expected format"""
    validator.test_referential_integrity()


# main execution here
if __name__ == "__main__":
    print("Running Insurance Claims Data Validation...\n")
    
    validator = ClaimsDataValidator('sample_claims_data.csv')
    report = validator.generate_report()
    print(report)
    
    print("\nTo run with pytest framework:")
    print("  pytest test_claims_validator.py -v")
    

Running Insurance Claims Data Validation...



     INSURANCE CLAIMS DATA VALIDATION REPORT                

Dataset: 100 records processed
Tests Run: 7
✓ Passed: 1
✗ Failed: 6

ISSUES DETECTED:
────────────────────────────────────────────────────────────
1. test_no_missing_critical_fields
   Error: Found 1 missing values in critical field 'policy_id'

2. test_no_duplicate_claim_ids
   Error: Found 2 duplicate claim IDs: ['CLM000011']

3. test_claim_amount_validity
   Error: Found 1 negative claim amounts

4. test_date_validity
   Error: Found 1 claims with future dates

5. test_age_validity
   Error: Found 1 records with invalid ages

6. test_referential_integrity
   Error: Found 1 policy IDs with invalid format

════════════════════════════════════════════════════════════


To run with pytest framework:
  pytest test_claims_validator.py -v
