# Train Titanic models (converted from train_titanic.py)

This notebook is a cell-by-cell conversion of `train_titanic.py`. It removes the CLI (`argparse`) bits and exposes parameters as variables so you can run, iterate, and inspect intermediate results interactively.


In [None]:
# Imports and seed
import os
import re
import time
import csv
from datetime import datetime
import random

import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

SEED = 42


def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)


In [None]:
# Data loading and preprocessing (copied from script)
def load_data(path):
    df = pd.read_csv(path)
    return df


def extract_title(name):
    m = re.search(r",\s*([^\.]+)\.", name)
    if m:
        return m.group(1).strip()
    return "Unknown"


def preprocess(df, fit_objects=None):
    # work on a copy
    df = df.copy()

    # Drop columns with too many missing values or not useful
    for c in ["Cabin", "Ticket"]:
        if c in df.columns:
            df.drop(columns=[c], inplace=True)

    # Extract Title from Name
    if "Name" in df.columns:
        df["Title"] = df["Name"].apply(extract_title)
        # simplify titles
        df["Title"] = df["Title"].replace(["Mlle", "Ms"], "Miss")
        df["Title"] = df["Title"].replace(["Mme"], "Mrs")
        rare_titles = [
            "Lady",
            "Countess",
            "Capt",
            "Col",
            "Don",
            "Dr",
            "Major",
            "Rev",
            "Sir",
            "Jonkheer",
            "Dona",
        ]
        df["Title"] = df["Title"].apply(lambda x: "Rare" if x in rare_titles else x)
        df.drop(columns=["Name"], inplace=True)

    # Fill Embarked
    if "Embarked" in df.columns:
        if fit_objects and "embarked_mode" in fit_objects:
            mode = fit_objects["embarked_mode"]
        else:
            mode = df["Embarked"].mode().iloc[0]
        df["Embarked"] = df["Embarked"].fillna(mode)

    # Fill Fare
    if "Fare" in df.columns:
        if fit_objects and "fare_median" in fit_objects:
            fare_med = fit_objects["fare_median"]
        else:
            fare_med = df["Fare"].median()
        df["Fare"] = df["Fare"].fillna(fare_med)

    # Fill Age using median by Title when possible
    if "Age" in df.columns:
        if fit_objects and "age_medians" in fit_objects:
            age_map = fit_objects["age_medians"]
            df["Age"] = df.apply(
                lambda r: (
                    age_map.get(r["Title"], age_map.get("overall"))
                    if pd.isnull(r["Age"])
                    else r["Age"]
                ),
                axis=1,
            )
        else:
            age_medians = df.groupby("Title")["Age"].median().to_dict()
            overall = df["Age"].median()
            age_medians["overall"] = overall
            df["Age"] = df.apply(
                lambda r: (
                    age_medians.get(r["Title"], overall)
                    if pd.isnull(r["Age"])
                    else r["Age"]
                ),
                axis=1,
            )

    # Sex encoding
    if "Sex" in df.columns:
        df["Sex"] = df["Sex"].map({"male": 0, "female": 1}).astype(int)

    # One-hot Embarked and Title
    cats = []
    if "Embarked" in df.columns:
        cats += ["Embarked"]
    if "Title" in df.columns:
        cats += ["Title"]

    df = pd.get_dummies(df, columns=cats, drop_first=True)

    # Drop PassengerId when present but keep index for submissions
    if "PassengerId" in df.columns:
        pid = df["PassengerId"]
        df.drop(columns=["PassengerId"], inplace=True)
    else:
        pid = None

    # Select features
    feature_cols = [c for c in df.columns if c != "Survived"]

    # Scale numeric features
    num_cols = [c for c in ["Age", "SibSp", "Parch", "Fare"] if c in df.columns]
    scaler = None
    if fit_objects and "scaler" in fit_objects:
        scaler = fit_objects["scaler"]
        df[num_cols] = scaler.transform(df[num_cols])
    else:
        scaler = StandardScaler()
        if num_cols:
            df[num_cols] = scaler.fit_transform(df[num_cols])

    fit_objects_out = {"scaler": scaler, "feature_columns": feature_cols}
    return df, pid, fit_objects_out


In [None]:
# Training and logging functions
def write_log(path, row):
    header = [
        "timestamp",
        "seed",
        "model",
        "accuracy",
        "f1",
        "roc_auc",
        "runtime_s",
        "artifact",
    ]
    exists = os.path.exists(path)
    with open(path, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=header)
        if not exists:
            writer.writeheader()
        writer.writerow(row)


def train_and_log(
    train_csv_path, models_dir="models", log_path="training_log.csv", quick=False
):
    set_seed()
    os.makedirs(models_dir, exist_ok=True)

    df = load_data(train_csv_path)
    if "Survived" not in df.columns:
        raise ValueError("train data must contain Survived column")

    # initial preprocess to compute fill values
    df_proc, pid, fit_objs = preprocess(df)

    X = df_proc.drop(columns=["Survived"])
    y = df_proc["Survived"]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=SEED
    )

    # Models to train
    models = {
        "logreg": LogisticRegression(max_iter=500, random_state=SEED),
        "rf": RandomForestClassifier(
            n_estimators=100 if not quick else 10, random_state=SEED
        ),
        "gb": GradientBoostingClassifier(random_state=SEED),
        "svc": SVC(probability=True, random_state=SEED),
    }

    # optional xgboost
    try:
        from xgboost import XGBClassifier

        models["xgb"] = XGBClassifier(
            use_label_encoder=False, eval_metric="logloss", random_state=SEED
        )
    except Exception:
        pass

    results = []
    for name, model in models.items():
        t0 = time.time()
        model.fit(X_train, y_train)
        dt = time.time() - t0
        preds = model.predict(X_val)
        proba = (
            model.predict_proba(X_val)[:, 1]
            if hasattr(model, "predict_proba")
            else None
        )

        acc = accuracy_score(y_val, preds)
        f1 = f1_score(y_val, preds)
        roc = roc_auc_score(y_val, proba) if proba is not None else float("nan")

        model_artifact = os.path.join(models_dir, f"{name}.joblib")
        # save model and preprocessing objects
        joblib.dump(
            {
                "model": model,
                "scaler": fit_objs["scaler"],
                "features": fit_objs["feature_columns"],
            },
            model_artifact,
        )

        # Append to CSV log
        row = {
            "timestamp": datetime.utcnow().isoformat(),
            "seed": SEED,
            "model": name,
            "accuracy": acc,
            "f1": f1,
            "roc_auc": roc,
            "runtime_s": round(dt, 3),
            "artifact": model_artifact,
        }
        results.append(row)

        write_log(log_path, row)
        print(
            f"Trained {name}: acc={acc:.4f}, f1={f1:.4f}, roc_auc={roc:.4f}, time={dt:.2f}s"
        )

    return results


In [None]:
# Parameters and dry-run example
DATA_PATH = os.path.join("EDA", "train.csv")
MODELS_DIR = "models"
LOG_PATH = "training_log.csv"
QUICK = True  # set to False for full training

print("Using seed:", SEED)
# Dry run: load and preprocess only
df = load_data(DATA_PATH)
proc, pid, fit = preprocess(df)
print("Loaded", DATA_PATH)
print("After preprocess, shape:", proc.shape)
print("Feature columns (sample):", fit["feature_columns"][:10])


Using seed: 42
Loaded EDA\train.csv
After preprocess, shape: (891, 14)
Feature columns (sample): ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_Q', 'Embarked_S', 'Title_Miss', 'Title_Mr']


In [None]:
# Run training (this will train several scikit-learn baselines)
results = train_and_log(
    DATA_PATH, models_dir=MODELS_DIR, log_path=LOG_PATH, quick=QUICK
)
import pandas as pd

pd.DataFrame(results)


Trained logreg: acc=0.8156, f1=0.7519, roc_auc=0.8746, time=0.01s
Trained rf: acc=0.7989, f1=0.7273, roc_auc=0.8170, time=0.01s
Trained gb: acc=0.8101, f1=0.7258, roc_auc=0.8528, time=0.07s
Trained svc: acc=0.8380, f1=0.7820, roc_auc=0.8466, time=0.04s
Trained svc: acc=0.8380, f1=0.7820, roc_auc=0.8466, time=0.04s


  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),
  "timestamp": datetime.utcnow().isoformat(),


Unnamed: 0,timestamp,seed,model,accuracy,f1,roc_auc,runtime_s,artifact
0,2025-10-27T09:41:45.336004,42,logreg,0.815642,0.75188,0.874572,0.009,models\logreg.joblib
1,2025-10-27T09:41:45.354798,42,rf,0.798883,0.727273,0.816996,0.01,models\rf.joblib
2,2025-10-27T09:41:45.434350,42,gb,0.810056,0.725806,0.852767,0.072,models\gb.joblib
3,2025-10-27T09:41:45.484308,42,svc,0.837989,0.781955,0.84664,0.04,models\svc.joblib


In [7]:
# Inspect saved model artifacts and load one model to test
import glob

# ensure models dir exists and is a directory
if not os.path.exists(MODELS_DIR) or not os.path.isdir(MODELS_DIR):
    print(f"Models directory '{MODELS_DIR}' does not exist or is not a directory.")
    artifacts = []
else:
    # search for joblib files inside models dir
    artifacts = glob.glob(os.path.join(MODELS_DIR, "*.joblib"))

artifacts

if artifacts:
    sample = joblib.load(artifacts[0])
    print("Loaded artifact keys:", list(sample.keys()))
    # example: run inference on first 5 rows of validation-like data
    df_small = proc.drop(columns=["Survived"]).head(5)
    # ensure numeric columns scaled using saved scaler
    num_cols = [c for c in ["Age", "SibSp", "Parch", "Fare"] if c in df_small.columns]
    if num_cols:
        df_small[num_cols] = sample["scaler"].transform(df_small[num_cols])
    X_small = df_small[sample["features"]]
    preds = sample["model"].predict(X_small)
    print("Sample predictions:", preds)


Loaded artifact keys: ['model', 'scaler', 'features']
Sample predictions: [0 1 0 0 0]


Notes:

- The notebook keeps the original preprocessing and training logic.
- Interactive iteration tips: set `QUICK=False` for real training; increase `n_estimators` in the RandomForest model if desired.
- If you want to re-create the exact CLI behavior, we can add a cell that parses `sys.argv` or use `argparse`-like helpers for notebooks.
