In [None]:
# Notebook: schema_change_impact_analysis_realistic.ipynb
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Build a synthetic EMR-like dataset (randomly generated; no real patient data).
# All random draws come from one seeded generator so that re-running the
# notebook reproduces exactly the same rows.
num_patients = 10000
rng = np.random.default_rng(42)

patient_ids = [f"P{100000+i}" for i in range(num_patients)]
ages = rng.integers(0, 90, size=num_patients)  # upper bound is exclusive: ages 0..89
diagnosis_codes = rng.choice(['D01','D02','D03','D04','D05'], size=num_patients)
medications = rng.choice(['MedA','MedB','MedC','MedD','MedE'], size=num_patients)

# Read the clock once so every visit is offset from the same instant instead
# of a slightly different "now" per row.
reference_time = datetime.today()
# Offsets of 0..1825 days inclusive (about five years back); .tolist()
# converts the NumPy integers to plain Python ints for timedelta.
visit_offsets = rng.integers(0, 1825, size=num_patients, endpoint=True).tolist()
visit_dates = [reference_time - timedelta(days=offset) for offset in visit_offsets]

emr_df = pd.DataFrame({
    'patient_id': patient_ids,
    'age': ages,
    'diagnosis_code': diagnosis_codes,
    'medication': medications,
    'visit_date': visit_dates
})

print("Original EMR Columns:", emr_df.columns.tolist())

# Simulate schema changes. Each variant is a separate DataFrame so emr_df
# itself keeps the original schema for comparison.
# 1. Rename column
emr_renamed = emr_df.rename(columns={'age':'patient_age'})

# 2. Remove a column
emr_missing = emr_df.drop(columns=['medication'])

# 3. Add a new column (on a copy, leaving emr_df untouched)
emr_added = emr_df.copy()
emr_added['insurance_status'] = rng.choice(['Active','Inactive'], size=num_patients)

# Schema impact function
def schema_impact(original, modified):
    """Print the column-level differences between two schemas.

    Both arguments only need a ``columns`` attribute that can be iterated
    (as the DataFrames in this notebook provide). Two lines are printed:
    the column names found in ``original`` but not in ``modified``, then
    the names found in ``modified`` but not in ``original``. Nothing is
    returned.

    A renamed column is reported as one missing name plus one extra name,
    since only the names are compared.
    """
    before = set(original.columns)
    after = set(modified.columns)

    dropped = before.difference(after)
    introduced = after.difference(before)

    print("Columns missing after change:", dropped)
    print("Extra columns after change:", introduced)

# Report the schema impact of each simulated change, always comparing
# against the original emr_df. Each entry pairs the heading to print with
# the modified DataFrame for that scenario.
impact_scenarios = [
    ("\nImpact after renaming 'age':", emr_renamed),
    ("\nImpact after removing 'medication':", emr_missing),
    ("\nImpact after adding 'insurance_status':", emr_added),
]

for heading, changed_df in impact_scenarios:
    print(heading)
    schema_impact(emr_df, changed_df)


Original EMR Columns: ['patient_id', 'age', 'diagnosis_code', 'medication', 'visit_date']

Impact after renaming 'age':
Columns missing after change: {'age'}
Extra columns after change: {'patient_age'}

Impact after removing 'medication':
Columns missing after change: {'medication'}
Extra columns after change: set()

Impact after adding 'insurance_status':
Columns missing after change: set()
Extra columns after change: {'insurance_status'}
