<a href="https://colab.research.google.com/github/nmansour67/skills-introduction-to-github/blob/main/First_Causal_Inference_Model_step4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# ==========================================================
# THE MASTER RWE SCRIPT: FROM BIAS TO TRUTH
# ==========================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# --- PART 1: GENERATE THE "CONFOUNDED" DATASET ---
# Simulate 1,000 patients whose chance of receiving the AI tool rises with the
# same covariates (age, severity) that raise their mortality risk.
np.random.seed(42)  # fixed seed: every run produces the identical cohort
n_patients = 1000


def _sigmoid(log_odds):
    """Convert a linear predictor (log-odds) into a probability."""
    return 1 / (1 + np.exp(-log_odds))


# Patient features.
# The order of the random draws below (age, severity, treatment, outcome)
# fixes which values come out of the seeded stream; do not rearrange them.
age = np.random.normal(65, 10, n_patients)
severity = np.random.normal(5, 2, n_patients)  # Sickness score

# Selection bias: the log-odds of treatment increase with age and severity.
treatment_log_odds = 0.1 * age + 0.5 * severity - 10
prob_treatment = _sigmoid(treatment_log_odds)
treatment = np.random.binomial(1, prob_treatment)

# Outcome model: treatment lowers the log-odds of death by 0.8, while age and
# severity raise them.
mortality_log_odds = 0.05 * age + 0.3 * severity - 0.8 * treatment - 5
mortality_risk = _sigmoid(mortality_log_odds)
outcome = np.random.binomial(1, mortality_risk)

# Assemble one row per patient.
cohort_columns = {
    'Age': age,
    'Severity': severity,
    'Treated_with_AI': treatment,
    'Mortality': outcome,
}
df = pd.DataFrame(cohort_columns)

print(f"Data Generated: {n_patients} patients.")
print("-" * 30)

# --- PART 2: THE NAIVE ANALYSIS (THE TRAP) ---
# Crude comparison: average mortality in each arm, with no adjustment for the
# covariates that influenced who received the treatment.
received_ai = df['Treated_with_AI'] == 1
no_ai = df['Treated_with_AI'] == 0

mortality_treated = df.loc[received_ai, 'Mortality'].mean()
mortality_untreated = df.loc[no_ai, 'Mortality'].mean()

# Express the raw rates as percentages for display.
naive_treated_pct = mortality_treated * 100
naive_untreated_pct = mortality_untreated * 100

print(f"Naive Mortality (Treated):   {naive_treated_pct:.1f}%")
print(f"Naive Mortality (Untreated): {naive_untreated_pct:.1f}%")
print("(CONCLUSION: It looks like the AI hurts patients!)")
print("-" * 30)

# --- PART 3: PROPENSITY SCORE MATCHING (THE FIX) ---
# 1. Calculate Propensity Scores
#    Fit a logistic regression of treatment on Age and Severity (scikit-learn
#    default settings) and keep each patient's predicted probability for the
#    second class column, i.e. Treated_with_AI == 1.
covariates = df[['Age', 'Severity']]
ps_model = LogisticRegression()
ps_model.fit(covariates, df['Treated_with_AI'])
treated_probability = ps_model.predict_proba(covariates)[:, 1]
df['Propensity_Score'] = treated_probability

# 2. Perform Nearest Neighbor Matching
#    Split the cohort by arm only after the score column has been added, so
#    both subsets carry it.
treated_group = df.loc[df['Treated_with_AI'] == 1]
control_group = df.loc[df['Treated_with_AI'] == 0]

# Index the controls by propensity score, then look up the single closest
# control for every treated patient. Each lookup is independent, so the same
# control may be chosen for several treated patients (matching with
# replacement), and no maximum distance (caliper) is enforced.
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(control_group[['Propensity_Score']])
distances, indices = nbrs.kneighbors(treated_group[['Propensity_Score']])

# `indices` holds row positions within control_group (one column per
# neighbour); flatten to 1-D and select those rows positionally.
matched_positions = indices.ravel()
matched_control = control_group.iloc[matched_positions]

# Stack treated patients on top of their matched controls.
matched_cohort = pd.concat([treated_group, matched_control], axis=0)

# --- PART 4: THE TRUE ANALYSIS (THE REVELATION) ---
# Compare mortality between the treated patients and their matched controls.
mortality_treated_adj = matched_cohort[matched_cohort['Treated_with_AI']==1]['Mortality'].mean()
mortality_control_adj = matched_cohort[matched_cohort['Treated_with_AI']==0]['Mortality'].mean()

print(f"Matched Mortality (Treated): {mortality_treated_adj*100:.1f}%")
print(f"Matched Mortality (Control): {mortality_control_adj*100:.1f}%")

# Calculate Relative Risk Reduction
# RRR = (control risk - treated risk) / control risk.
# The ratio is only meaningful when both rates exist (a mean over an empty
# group is NaN) and the control risk is strictly positive; otherwise the
# division would yield inf/NaN with nothing more than a RuntimeWarning.
rrr_is_defined = bool(
    np.isfinite(mortality_treated_adj)
    and np.isfinite(mortality_control_adj)
    and mortality_control_adj > 0
)
if rrr_is_defined:
    reduction = (mortality_control_adj - mortality_treated_adj) / mortality_control_adj
else:
    reduction = np.nan

print("-" * 30)
if rrr_is_defined:
    print(f"TRUE AI BENEFIT: {reduction*100:.1f}% Relative Risk Reduction")
else:
    print("TRUE AI BENEFIT: undefined (no deaths among matched controls, or an empty group)")
print("-" * 30)

# Only announce a benefit when the matched comparison actually shows one.
if rrr_is_defined and reduction > 0:
    print("LESSON: The naive data lied. The matched data proves the AI works.")
else:
    print("LESSON: The matched data does not show a mortality benefit for the AI.")

Data Generated: 1000 patients.
------------------------------
Naive Mortality (Treated):   42.2%
Naive Mortality (Untreated): 38.6%
(CONCLUSION: It looks like the AI hurts patients!)
------------------------------
Matched Mortality (Treated): 42.2%
Matched Mortality (Control): 62.0%
------------------------------
TRUE AI BENEFIT: 32.0% Relative Risk Reduction
------------------------------
LESSON: The naive data lied. The matched data proves the AI works.
