# Model Building

In [4]:
import os, json, re
from datetime import datetime
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

# Optional MLflow: experiment tracking is used only when the package imports
# cleanly. MLFLOW_AVAILABLE is the flag the rest of the script checks before
# making any mlflow.* call, so training still runs without MLflow installed.
try:
    import mlflow; import mlflow.sklearn
    MLFLOW_AVAILABLE = True
except Exception:
    # Any failure while importing (not only ImportError) disables tracking.
    MLFLOW_AVAILABLE = False

# Input CSV read by main() via pd.read_csv.
# NOTE(review): this points at the filesystem root while all outputs go to
# /mnt/data — confirm the location is intended.
DATA_PATH = r"/cleaned_data.csv"
# Destination of the joblib-serialized best model.
MODEL_PATH = r"/mnt/data/best_model.pkl"
# Destination of the plain-text performance report.
REPORT_PATH = r"/mnt/data/performance_report.txt"
# Destination of the JSON dump of per-model metrics and the best model name/F1.
METRICS_PATH = r"/mnt/data/metrics.json"
# Passed to mlflow.set_experiment (when MLflow is available) and printed in the
# report header.
EXPERIMENT_NAME = "bank_churn_lightweight"

def find_target_column(cols):
    """Return the name of the target column found in *cols*.

    An exact ``'Exited'`` entry always wins. Failing that, every name whose
    lowercased form ends with ``'exited'`` (e.g. ``'num__Exited'``) is a
    candidate, and the shortest candidate is returned; on a length tie the
    one appearing first in *cols* is chosen.

    Raises:
        ValueError: if neither an exact nor a suffix match exists.
    """
    if "Exited" not in cols:
        suffix_matches = [name for name in cols if name.lower().endswith("exited")]
        if suffix_matches:
            # min() yields the first shortest element, matching a stable sort by length.
            return min(suffix_matches, key=len)
        raise ValueError("Target column not found. Expecting 'Exited' or a column ending with 'Exited' (e.g., 'num__Exited').")
    return "Exited"

def main():
    """Train the candidate classifiers, keep the best by F1, and persist results.

    Workflow:
      1. Load ``DATA_PATH`` and locate the target column with
         ``find_target_column``.
      2. Binarize the target (values ``> 0`` become 1, everything else 0) and
         use all remaining columns as features, with no further preprocessing.
      3. Fit a logistic regression and a random forest on a stratified 80/20
         split and score each on the held-out 20%.
      4. Save the model with the highest F1 to ``MODEL_PATH`` (joblib), a text
         report to ``REPORT_PATH`` and the metrics as JSON to ``METRICS_PATH``.
      5. When MLflow imported successfully, log one run per model
         (model type + metrics) and a final run holding the three artifacts.

    Raises:
        ValueError: propagated from ``find_target_column`` when the data has
            no recognizable target column.
    """
    # Local import: the file header only imports `datetime` from this module.
    from datetime import timezone

    df = pd.read_csv(DATA_PATH)
    tgt = find_target_column(df.columns.tolist())

    y = (df[tgt] > 0).astype(int)  # handle standardized targets like num__Exited
    X = df.drop(columns=[tgt])

    # Models without extra preprocessing (data already cleaned)
    models = {
        "logistic_regression": LogisticRegression(max_iter=1000),
        "random_forest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
    }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    results = {}
    # F1 is never negative, so -1.0 guarantees the first model is accepted.
    best_name, best_f1, best_model = None, -1.0, None

    if MLFLOW_AVAILABLE:
        mlflow.set_experiment(EXPERIMENT_NAME)

    for name, model in models.items():
        run = mlflow.start_run(run_name=name) if MLFLOW_AVAILABLE else None
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Coerce to built-in floats so json.dump below never depends on
            # the concrete numeric type returned by the scorers.
            metrics = {
                "accuracy": float(accuracy_score(y_test, y_pred)),
                "precision": float(precision_score(y_test, y_pred, zero_division=0)),
                "recall": float(recall_score(y_test, y_pred, zero_division=0)),
                "f1": float(f1_score(y_test, y_pred, zero_division=0)),
            }
            cls_rep = classification_report(y_test, y_pred, digits=4)
            results[name] = dict(**metrics, classification_report=cls_rep)

            if MLFLOW_AVAILABLE:
                mlflow.log_param("model_type", name)
                for k, v in metrics.items():
                    mlflow.log_metric(k, v)

            # Strict '>' keeps the earlier model when two F1 scores tie.
            if metrics["f1"] > best_f1:
                best_f1, best_name, best_model = metrics["f1"], name, model
        finally:
            # Close the run even if fitting or scoring raised.
            if run is not None:
                mlflow.end_run()

    # Make sure the parent directory of every output file exists. An empty
    # dirname (bare filename) means the current directory; os.makedirs("")
    # would raise, so it is skipped.
    for out_path in (MODEL_PATH, REPORT_PATH, METRICS_PATH):
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Save best model
    joblib.dump(best_model, MODEL_PATH)

    # Report
    lines = [
        f"Experiment: {EXPERIMENT_NAME}",
        # Timezone-aware replacement for the deprecated datetime.utcnow().
        f"Timestamp (UTC): {datetime.now(timezone.utc).isoformat()}",
        f"Detected target column: {tgt}",
        "",
    ]
    for name, m in results.items():
        lines.append(f"=== {name} ===")
        lines.append(f"Accuracy:  {m['accuracy']:.4f}")
        lines.append(f"Precision: {m['precision']:.4f}")
        lines.append(f"Recall:    {m['recall']:.4f}")
        lines.append(f"F1:        {m['f1']:.4f}")
        lines.append("Classification Report:")
        lines.append(m["classification_report"])
        lines.append("")
    lines.append(f"Best model: {best_name} (by F1 = {best_f1:.4f})")

    with open(REPORT_PATH, "w", encoding="utf-8") as f:
        # Join with a real newline; the previous "\\n" wrote a literal
        # backslash-n and collapsed the whole report onto one line.
        f.write("\n".join(lines))

    with open(METRICS_PATH, "w", encoding="utf-8") as f:
        json.dump({"results": results, "best_model": best_name, "best_f1": best_f1}, f, indent=2)

    if MLFLOW_AVAILABLE:
        # A separate run holds the files produced above; the context manager
        # ends the run even if an upload fails.
        with mlflow.start_run(run_name="artifacts_and_best_model"):
            mlflow.log_artifact(REPORT_PATH)
            mlflow.log_artifact(METRICS_PATH)
            mlflow.log_artifact(MODEL_PATH)

    print("Training complete.")
    print("Best model:", best_name, "F1=", round(best_f1, 4))
    print("Saved model =>", MODEL_PATH)
    print("Report =>", REPORT_PATH)
    print("Metrics JSON =>", METRICS_PATH)

if __name__ == "__main__":
    main()

Training complete.
Best model: random_forest F1= 0.4708
Saved model => /mnt/data/best_model.pkl
Report => /mnt/data/performance_report.txt
Metrics JSON => /mnt/data/metrics.json


  lines.append(f"Timestamp (UTC): {datetime.utcnow().isoformat()}")


In [None]:
# Mount Google Drive at /content/drive. The google.colab package is only
# importable inside a Colab runtime, so this cell fails elsewhere.
from google.colab import drive
drive.mount('/content/drive')