# 03 — Modeling

**Objective:** Train and compare multiple models with cross-validation and hyperparameter tuning.

## 1. Load Data and Train/Test Split

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Load the processed feature table. If it has columns but no "Churn" target,
# rebuild a numeric feature matrix from the raw BigML churn file instead.
df = pd.read_csv("../data/processed/churn_features.csv")
built_from_raw = "Churn" not in df.columns and df.shape[1] > 0

if built_from_raw:
    df = pd.read_csv("../data/raw/churn-bigml-20_raw.csv")
    # Binary plan flags: "Yes" -> 1, anything else -> 0.
    df["International plan"] = (df["International plan"] == "Yes").astype(int)
    df["Voice mail plan"] = (df["Voice mail plan"] == "Yes").astype(int)
    # dtype=int is required: pandas >= 2.0 emits bool dummies by default, and
    # the numeric-only select_dtypes below would silently drop every State_* column.
    df = pd.get_dummies(df, columns=["State"], drop_first=True, dtype=int)
    df["Churn"] = df["Churn"].astype(int)
    X = df.drop(columns=["Churn"]).select_dtypes(include=[np.number])
    y = df["Churn"]
else:
    # NOTE(review): assumes the processed file is already numeric/scaled — confirm upstream.
    X = df.drop(columns=["Churn"])
    y = df["Churn"]

# Stratified 80/20 split so both partitions keep the same churn rate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

if built_from_raw:
    # Fit the scaler on the training split only and reuse it for the test split;
    # fitting on the full data before splitting leaks test-set statistics into training.
    # X itself is left unscaled; only the split frames are standardized.
    scaler = StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

print("Train:", X_train.shape, "Test:", X_test.shape)

## 2. Model Training (Multiple Models)

In [None]:
# Candidate classifiers keyed by display name; every estimator is seeded with 42.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("XGBoost", XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)),
])

# For each model: keep the instance fitted on the whole training split, plus the
# mean and standard deviation of its 5-fold ROC AUC on that same split.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    cv_score = cross_val_score(
        model,
        X_train,
        y_train,
        cv=5,
        scoring="roc_auc",
    )
    results[name] = dict(
        model=model,
        cv_auc_mean=cv_score.mean(),
        cv_auc_std=cv_score.std(),
    )
    print(f"{name}: CV AUC = {cv_score.mean():.3f} (+/- {cv_score.std():.3f})")

## 3. Cross-Validation Summary

In [None]:
# One row per model: display name plus mean / std of its cross-validated ROC AUC.
cv_summary = pd.DataFrame(
    [
        {
            "Model": model_name,
            "CV AUC Mean": stats["cv_auc_mean"],
            "CV AUC Std": stats["cv_auc_std"],
        }
        for model_name, stats in results.items()
    ]
)
cv_summary

## 4. Parameter Tuning (Optional)

In [None]:
from sklearn.model_selection import GridSearchCV

# Example: tune Random Forest
# Exhaustive search over 3 x 3 = 9 parameter combinations, each scored by
# 5-fold ROC AUC on the training split.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, None],
}
rf_cv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc",
)
rf_cv.fit(X_train, y_train)
print("Best params:", rf_cv.best_params_)
print("Best CV AUC:", rf_cv.best_score_)

## 5. Save Models for Evaluation

In [None]:
import joblib
import os

# Create the output directory if needed; an existing directory is not an error.
os.makedirs("../models", exist_ok=True)

# Persist each fitted model under a snake_case file name derived from its
# display name, e.g. "Random Forest" -> random_forest.pkl.
for name in results:
    fname = name.lower().replace(" ", "_") + ".pkl"
    joblib.dump(results[name]["model"], f"../models/{fname}")
print("Models saved to models/")

## 6. Summary

- **Fair comparison:** Same train/test split, same features
- **Overfitting control:** Performance is estimated with 5-fold cross-validation on the training split, and hyperparameters are tuned on the training split only
- **Models:** Logistic Regression, Random Forest, XGBoost