In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# --- 1. Simulation of Post-TB Lung Disease Data ---
# Builds a synthetic cohort. Every column is an independent draw from the
# seeded global NumPy RNG, so the target column is generated without reference
# to any of the predictor columns.

np.random.seed(42)
n_samples = 200

# Columns are drawn one statement at a time, in the order they appear in the
# frame. Each call advances the shared RNG state, so this order must not be
# changed if the generated values are to stay reproducible.
data = {}
data['age'] = np.random.randint(18, 80, n_samples)  # upper bound is exclusive
data['bmi'] = np.random.normal(22, 3, n_samples)
data['distance_to_clinic_km'] = np.random.exponential(5, n_samples)
data['previous_tb_episodes'] = np.random.randint(1, 4, n_samples)  # 1, 2 or 3
data['smoking_status'] = np.random.choice(['Never', 'Former', 'Current'], n_samples)
# Target: Quality of Life Score (nominally 0-100; the draws are not clipped)
data['qol_score'] = np.random.normal(70, 10, n_samples)

df = pd.DataFrame(data)

# --- 2. The Preprocessing Pipeline ---
# Column-wise preprocessing used by every candidate model: the numeric columns
# are standardized and the categorical column is one-hot encoded.

numeric_features = ['age', 'bmi', 'distance_to_clinic_km', 'previous_tb_episodes']
categorical_features = ['smoking_status']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore' encodes a category that was absent when the
        # encoder was fitted as an all-zero row. The default ('error') would
        # instead raise whenever a held-out fold or new data contains a level
        # the training fold never saw.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# --- 3. Defining the Models ---
# Each pipeline gets its own unfitted copy of the preprocessor. Pipeline fits
# the step objects it is handed in place, so sharing a single instance would
# let fitting one model silently overwrite the scaler/encoder state used by
# the others.

# The "Traditional" Baseline
linear_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('regressor', LinearRegression())
])

# The "Ensemble" Candidate (Random Forest)
rf_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# The "Ensemble" Candidate (Gradient Boosting)
gb_model = Pipeline([
    ('preprocessor', clone(preprocessor)),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

# --- 4. Benchmarking Logic (Cross-Validation) ---
# Shuffled 5-fold cross-validation with a fixed seed, so every model is scored
# on the same splits.
# NOTE: KFold does NOT stratify. The target is continuous, so stratified
# sampling would require binning it first (e.g. into quantiles) and splitting
# on the bins; this script supports a "cross-validation" claim only.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Split predictors and target once; they are identical for every model.
X = df.drop('qol_score', axis=1)
y = df['qol_score']

models = {'Linear Regression': linear_model, 'Random Forest': rf_model, 'Gradient Boosting': gb_model}
results = {}  # model name -> mean RMSE across folds

print("--- Benchmarking Results (RMSE) ---")
for name, model in models.items():
    # The scorer returns negated MSE (higher is better), so flip the sign and
    # take the square root to get a per-fold RMSE.
    scores = cross_val_score(model, X, y, cv=cv, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(-scores)
    results[name] = rmse_scores.mean()
    print(f"{name}: {rmse_scores.mean():.4f} (+/- {rmse_scores.std():.4f})")

# --- 5. Selection Logic ---
# Picks the lowest mean RMSE. The fold-to-fold spread printed above is not
# taken into account, so a small gap between models may not be meaningful.
best_model = min(results, key=results.get)
print(f"\nConclusion: The {best_model} performed best on this synthetic cohort.")

--- Benchmarking Results (RMSE) ---
Linear Regression: 10.0208 (+/- 0.6784)
Random Forest: 10.3985 (+/- 0.9859)
Gradient Boosting: 11.0950 (+/- 0.6195)

Conclusion: The Linear Regression performed best on this synthetic cohort.
