In [9]:
# %% [markdown]
# # 03 — Modeling (Fast Version — No CV)
# Random Forest with sensible defaults (balanced class weights), ROC/PR, threshold tuning,
# and BI-ready scored outputs. No hyper-parameter search to avoid long runs.

# %%
import os, json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    roc_curve, precision_recall_curve, ConfusionMatrixDisplay
)
from sklearn.ensemble import RandomForestClassifier
import joblib

# ---- Set your project root
# The root can be overridden without editing the notebook by setting the
# CELL2CELL_ROOT environment variable; otherwise the original location is used.
PROJECT_ROOT = Path(os.environ.get('CELL2CELL_ROOT', r"C:\Users\dell\Desktop\Cell2Cell_Project"))
os.chdir(PROJECT_ROOT)

# All paths below are relative to the project root set above.
DATA_RAW = Path('data/raw')
CLEANED = Path('data/cleaned')
ART = Path('artifacts')
IMG = Path('report/images')
for out_dir in (CLEANED, ART, IMG):
    out_dir.mkdir(parents=True, exist_ok=True)

# ---- Load data
train = pd.read_csv(DATA_RAW / 'cell2celltrain.csv')
# Binary target: 1 where Churn == 'Yes', 0 otherwise.
y = (train['Churn'] == 'Yes').astype(int)
X = train.drop(columns=['Churn'])

# Load columns + preprocessor from 02_cleaning
# Use a context manager so the file handle is closed deterministically.
with open(ART / 'columns.json') as columns_file:
    meta = json.load(columns_file)
num_cols, cat_cols = meta['num_cols'], meta['cat_cols']
# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# that were produced by a trusted run of the cleaning notebook.
pre = joblib.load(ART / 'preprocessor.joblib')
# Rare-category bucketing (same as 02) to keep categories consistent
def bucket_rare_categories(df, cols, min_frac=0.01, reference=None):
    """Replace infrequent levels of the given columns with the label 'Other'.

    A level is considered rare when its count is strictly below
    ``len(source) * min_frac``, where ``source`` is ``reference`` if given and
    ``df`` otherwise. Counts are taken with ``dropna=False``, so missing values
    are counted as a level of their own.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is not modified; a copy is returned.
    cols : iterable of str
        Columns to bucket.
    min_frac : float, default 0.01
        Minimum share of rows a level needs in order to be kept.
    reference : pandas.DataFrame, optional
        Frame from which level frequencies are computed. Pass the training
        frame here when transforming new data so that the same levels are
        bucketed in both. Defaults to ``df`` itself (original behaviour).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with rare levels in ``cols`` replaced by 'Other'.
    """
    source = df if reference is None else reference
    result = df.copy()
    min_count = len(source) * min_frac
    for c in cols:
        level_counts = source[c].value_counts(dropna=False)
        rare_levels = level_counts[level_counts < min_count].index
        # Keep values that are not rare; everything else becomes 'Other'.
        result[c] = result[c].where(~result[c].isin(rare_levels), 'Other')
    return result

# Collapse infrequent categorical levels into 'Other' before modeling.
# NOTE(review): level frequencies are computed on the full frame, i.e. they
# include rows that later land in the validation split — confirm this is acceptable.
X = bucket_rare_categories(X, cat_cols, min_frac=0.01)

# ---- Split (80/20, stratified on the target, fixed seed)
split_kwargs = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_kwargs)

# ---- Fast, sensible Random Forest (no CV)
rf_params = {
    'n_estimators': 300,
    'max_depth': 18,                            # cap depth to reduce overfit + speed up
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'class_weight': 'balanced_subsample',       # help class imbalance
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestClassifier(**rf_params)

# Preprocessor loaded from the artifacts folder feeds the classifier.
pipe = Pipeline(steps=[('pre', pre), ('clf', rf)])

# ---- Train
pipe.fit(X_train, y_train)

# ---- Evaluate (default 0.50 threshold)
pred = pipe.predict(X_valid)
proba = pipe.predict_proba(X_valid)[:, 1]   # probability of the positive class

# Label-based scores are computed from hard predictions; ROC AUC uses probabilities.
label_scorers = {
    'accuracy': accuracy_score,
    'precision': precision_score,
    'recall': recall_score,
    'f1': f1_score,
}
metrics = {'model': 'RandomForest_fast'}
for metric_name, scorer in label_scorers.items():
    metrics[metric_name] = scorer(y_valid, pred)
metrics['roc_auc'] = roc_auc_score(y_valid, proba)

metrics_df = pd.DataFrame([metrics])
print("Validation metrics @0.50:\n", metrics_df)

# Save metrics + model
metrics_df.to_csv(ART / 'validation_metrics.csv', index=False)
joblib.dump(pipe, ART / 'best_model.joblib')

# Confusion matrix (0.50)
cm_default = ConfusionMatrixDisplay.from_predictions(y_valid, pred)
cm_default.ax_.set_title('Confusion Matrix (thr=0.50)')
cm_default.figure_.tight_layout()
cm_default.figure_.savefig(IMG / 'confusion_matrix_default.png', dpi=150)
plt.close(cm_default.figure_)

# ROC curve (with the diagonal as a dashed reference line)
fpr, tpr, _ = roc_curve(y_valid, proba)
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr)
roc_ax.plot([0, 1], [0, 1], '--')
roc_ax.set_xlabel('FPR')
roc_ax.set_ylabel('TPR')
roc_ax.set_title('ROC Curve')
roc_fig.tight_layout()
roc_fig.savefig(IMG / 'roc_curve.png', dpi=150)
plt.close(roc_fig)

# PR curve — prec/rec/pr_thr are reused by the threshold tuning that follows
prec, rec, pr_thr = precision_recall_curve(y_valid, proba)
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(rec, prec)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_fig.tight_layout()
pr_fig.savefig(IMG / 'pr_curve.png', dpi=150)
plt.close(pr_fig)
print("Saved: validation_metrics.csv, best_model.joblib, confusion_matrix_default.png, roc_curve.png, pr_curve.png")

# ---- Threshold tuning
# (A) F1-optimal threshold: F1 at every point of the PR curve; the small epsilon
# in the denominator avoids division by zero where precision + recall == 0.
pr_product = prec * rec
f1_scores = 2 * pr_product / (prec + rec + 1e-12)
best_idx = np.nanargmax(f1_scores)
# prec/rec may be indexed past the end of pr_thr; fall back to 0.5 in that case.
if best_idx < len(pr_thr):
    best_thr_f1 = pr_thr[best_idx]
else:
    best_thr_f1 = 0.5
pred_f1 = (proba >= best_thr_f1).astype(int)

# (B) Business threshold: top 20% highest risk (80th percentile of the scores)
thr_top20 = float(np.quantile(proba, 0.80))
pred_top20 = (proba >= thr_top20).astype(int)

# One confusion-matrix image per tuned threshold.
cm_jobs = [
    (pred_f1, f'Confusion Matrix (F1-optimal thr={best_thr_f1:.2f})', 'confusion_matrix_f1.png'),
    (pred_top20, f'Confusion Matrix (top-20% thr≈{thr_top20:.2f})', 'confusion_matrix_top20.png'),
]
for cm_pred, cm_title, cm_file in cm_jobs:
    cm_disp = ConfusionMatrixDisplay.from_predictions(y_valid, cm_pred)
    cm_disp.ax_.set_title(cm_title)
    cm_disp.figure_.tight_layout()
    cm_disp.figure_.savefig(IMG / cm_file, dpi=150)
    plt.close(cm_disp.figure_)

# Persist both thresholds as plain floats.
thresholds = {'f1_optimal': float(best_thr_f1), 'top20': float(thr_top20)}
with open(ART / 'thresholds.json', 'w') as thr_file:
    json.dump(thresholds, thr_file)

print(f"Saved thresholds: f1={best_thr_f1:.3f}, top20≈{thr_top20:.3f}")

# ---- BI-friendly scored outputs
# Validation scores with a 0-based decile label (higher label = higher score).
valid_scores = pd.DataFrame({'proba': proba, 'actual': y_valid.reset_index(drop=True)})
valid_scores['decile'] = pd.qcut(valid_scores['proba'], 10, labels=False, duplicates='drop')
valid_scores.to_csv(CLEANED / 'validation_scored_with_deciles.csv', index=False)

# Score HOLDOUT with same preprocessing + rare-bucket
holdout = pd.read_csv(DATA_RAW / 'cell2cellholdout.csv')
# Apply the bucketing rule derived from the TRAINING data (same 1% cut-off as
# used for X). Re-deriving rare levels from the holdout's own frequencies can
# keep or bucket different levels than the model was trained on.
n_train_rows = len(train)
for cat_col in cat_cols:
    train_counts = train[cat_col].value_counts(dropna=False)
    train_rare_levels = train_counts[train_counts < n_train_rows * 0.01].index
    holdout[cat_col] = holdout[cat_col].where(~holdout[cat_col].isin(train_rare_levels), 'Other')

holdout_proba = pipe.predict_proba(holdout)[:, 1]
out = holdout.copy()
out['churn_probability'] = holdout_proba
out['decile'] = pd.qcut(out['churn_probability'], 10, labels=False, duplicates='drop')
out.to_csv(CLEANED / 'holdout_scored.csv', index=False)

print("Saved: data/cleaned/validation_scored_with_deciles.csv, data/cleaned/holdout_scored.csv, artifacts/thresholds.json")


Validation metrics @0.50:
                model  accuracy  precision    recall       f1   roc_auc
0  RandomForest_fast  0.688149   0.441827  0.312373  0.36599  0.658235
Saved: validation_metrics.csv, best_model.joblib, confusion_matrix_default.png, roc_curve.png, pr_curve.png
Saved thresholds: f1=0.379, top20≈0.501
Saved: data/cleaned/validation_scored_with_deciles.csv, data/cleaned/holdout_scored.csv, artifacts/thresholds.json
