
# Titanic Survival Predictor (Notebook)

This notebook trains a **Titanic survival** classifier using a resilient data-loading pattern:
- Try multiple public sources (with/without headers)
- Normalize column names to a standard schema
- Save a local cached CSV under `data/titanic.csv` for reproducibility

It then performs cleaning, feature engineering, modeling (Logistic Regression & Random Forest), evaluation, feature importance, and model export.


In [None]:

# Core imports
import io
from pathlib import Path
from urllib.request import urlopen

import numpy as np
import pandas as pd

# Modeling & pipeline
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, RocCurveDisplay

# Saving model
import joblib

# Plotting
import matplotlib.pyplot as plt

# Report the core library versions up front so results can be tied to an environment.
print("Versions →", dict(numpy=np.__version__, pandas=pd.__version__))


## Data loading with multiple sources and header handling

In [None]:

# Local cache location. The parent directory is created eagerly so that a
# later write of the downloaded CSV cannot fail on a missing folder.
data_path = Path('data/titanic.csv')
data_path.parent.mkdir(parents=True, exist_ok=True)

# Standard schema: every loaded frame is reduced to exactly these columns,
# in this order.
COLUMNS = [
    "Survived",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

# Openly accessible mirrors, tried in order until one succeeds.
# (Kaggle originals require auth; these mirrors are open.)
SOURCES = [
    # Data Science Dojo mirror
    "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv",
    # Plotly datasets mirror (column naming may differ from the first source)
    "https://raw.githubusercontent.com/plotly/datasets/master/titanic.csv",
]

# Helper: unify schema across variants
def normalize_titanic_columns(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Return a copy of ``df`` reduced to the standard Titanic schema.

    Header names are matched case-insensitively and with surrounding
    whitespace ignored, then mapped onto the canonical spelling
    (``Survived``, ``Pclass``, ``Sex``, ``Age``, ``SibSp``, ``Parch``,
    ``Fare``, ``Embarked``). Columns outside the requested schema are
    discarded; requested columns that the input lacks are added filled
    with NaN so they can be imputed later.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame as read from one of the sources. It is not modified.
    columns : sequence of str, optional
        Output columns, in order. Defaults to the module-level ``COLUMNS``.

    Returns
    -------
    pd.DataFrame
        A new frame containing exactly ``columns``, in that order.
    """
    wanted = list(COLUMNS if columns is None else columns)

    # Keys are lower-case because headers are lower-cased before lookup.
    aliases = {
        'siblings/spouses aboard': 'SibSp',
        'parents/children aboard': 'Parch',
        'pclass': 'Pclass',
        'sex': 'Sex',
        'age': 'Age',
        'sibsp': 'SibSp',
        'parch': 'Parch',
        'fare': 'Fare',
        'embarked': 'Embarked',
        'survived': 'Survived',
    }

    # str() guards against non-string labels; strip() tolerates padded headers.
    lowered = [str(label).strip().lower() for label in df.columns]

    out = df.copy()
    out.columns = [aliases.get(label, label) for label in lowered]

    # Two source columns can map to the same canonical name (e.g. 'Age' and
    # 'age'); keep the first so the final selection yields one column each.
    out = out.loc[:, ~out.columns.duplicated()].copy()

    # Ensure every requested column exists; missing ones become NaN.
    for col in wanted:
        if col not in out.columns:
            out[col] = np.nan

    # Selecting by list drops everything else (PassengerId, Name, Ticket,
    # Cabin, ...) and fixes the column order.
    return out[wanted]

def try_download():
    """Fetch the dataset from the first usable entry in ``SOURCES``.

    Each URL is downloaded, parsed as CSV, normalized with
    ``normalize_titanic_columns`` and, if usable, written to ``data_path``.
    Any failure for one source is reported on stdout and the next source
    is tried.

    Returns
    -------
    pd.DataFrame or None
        The normalized frame from the first source that succeeds, or
        ``None`` when every source fails.
    """
    for url in SOURCES:
        try:
            print(f'Trying {url} ...')
            with urlopen(url, timeout=20) as resp:
                # 'utf-8-sig' decodes plain UTF-8 and also strips a leading
                # BOM, which would otherwise become part of the first header.
                text = resp.read().decode('utf-8-sig')

            # read_csv takes the first row as the header by default, so the
            # column labels here are always strings.
            df_raw = pd.read_csv(io.StringIO(text))
            df = normalize_titanic_columns(df_raw)

            # Unrecognised headers normalize to all-NaN columns. Reject such
            # a frame instead of caching it, otherwise every later run would
            # load the unusable cache file.
            if not df["Survived"].notna().any():
                raise ValueError("no usable 'Survived' values after normalization")

            # Save local copy
            df.to_csv(data_path, index=False)
            print('Downloaded and saved to', data_path.resolve())
            return df
        except Exception as e:
            # Best effort: report and fall through to the next source.
            print('  Failed:', e)
    return None

# Prefer the cached CSV; only hit the network when no local copy exists.
if not data_path.exists():
    df = try_download()
    if df is None:
        raise RuntimeError(
            "Could not download dataset. Please place a file 'data/titanic.csv' locally "
            "with columns: " + ", ".join(COLUMNS)
        )
else:
    df = pd.read_csv(data_path)
    print('Loaded local data from', data_path.resolve())

df.head()


## Quick EDA

In [None]:

# Summary statistics (one row per column), then the null count of each column.
display(df.describe(include='all').transpose())
print("\nMissing values per column:\n", df.isnull().sum())


## Train/test split and preprocessing

In [None]:

# Target and features
target = "Survived"

# Rows without a label cannot be used for supervised training, and a NaN in
# the target would make the integer cast below raise.
labeled = df.dropna(subset=[target])
X = labeled.drop(columns=[target])
y = labeled[target].astype(int)  # ensure binary ints 0/1

# Stratified hold-out split so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

numeric_features = ["Age", "Fare", "Pclass", "SibSp", "Parch"]
categorical_features = ["Sex", "Embarked"]

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the mode, then one-hot encode.
# handle_unknown="ignore" keeps inference from failing on unseen categories.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Unfitted template; each model pipeline below receives its own clone.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Two candidate models. Each gets an independent copy of the preprocessing
# step: sharing one transformer instance would mean fitting one pipeline
# silently refits the preprocessing used by the other.
log_reg = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("clf", LogisticRegression(max_iter=1000))
])

rf = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("clf", RandomForestClassifier(
        n_estimators=300, random_state=42, class_weight=None
    ))
])

# Cross-validate both on the training split only.
for name, model in [("LogisticRegression", log_reg), ("RandomForest", rf)]:
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="roc_auc")
    print(f"{name} ROC-AUC (CV): {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")


## Train final model and evaluate

In [None]:

# Fit each candidate pipeline on the training split.
for _candidate in (log_reg, rf):
    _candidate.fit(X_train, y_train)

def eval_model(pipe, label):
    """Print test-set metrics for ``pipe`` and draw its diagnostic plots.

    Uses the module-level ``X_test`` / ``y_test``. Prints a classification
    report, and, when the estimator exposes ``predict_proba``, the ROC-AUC
    plus a ROC curve. A confusion-matrix plot is always shown.

    Parameters
    ----------
    pipe : fitted estimator with ``predict``
    label : str
        Name used in the printed header and the plot titles.
    """
    predicted = pipe.predict(X_test)

    # Second column of predict_proba is used as the score for ROC purposes.
    scores = None
    if hasattr(pipe, "predict_proba"):
        scores = pipe.predict_proba(X_test)[:, 1]

    print(f"\n=== {label} ===")
    print(classification_report(y_test, predicted, digits=3))

    if scores is not None:
        print(f"ROC-AUC: {roc_auc_score(y_test, scores):.3f}")
        RocCurveDisplay.from_predictions(y_test, scores)
        plt.title(f"ROC Curve — {label}")
        plt.show()

    matrix = confusion_matrix(y_test, predicted)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot()
    plt.title(f"Confusion Matrix — {label}")
    plt.show()

# Evaluate both fitted pipelines on the held-out test split.
for _pipe, _label in ((log_reg, "Logistic Regression"), (rf, "Random Forest")):
    eval_model(_pipe, _label)

# Test-set ROC-AUC helper used to compare the fitted pipelines.
def get_auc(pipe):
    """Return the ROC-AUC of ``pipe`` on the module-level test split.

    The estimator must expose ``predict_proba`` (there is no fallback);
    its second output column is used as the score.
    """
    positive_scores = pipe.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, positive_scores)

# Test-set AUCs: used for reporting only, never for choosing the model.
auc_log = get_auc(log_reg)
auc_rf  = get_auc(rf)

# Select on cross-validated training AUC. Picking the winner by its test
# score and then quoting that same score would give an optimistically biased
# estimate of generalization. cross_val_score works on clones, so the fitted
# pipelines above are left untouched. This repeats the 5-fold CV, so the
# cell takes correspondingly longer to run.
cv_auc_log = cross_val_score(log_reg, X_train, y_train, cv=5, scoring="roc_auc").mean()
cv_auc_rf = cross_val_score(rf, X_train, y_train, cv=5, scoring="roc_auc").mean()

# Ties go to the random forest, as before.
if cv_auc_rf >= cv_auc_log:
    best_model, best_name = rf, "RandomForest"
    best_cv_auc, best_test_auc = cv_auc_rf, auc_rf
else:
    best_model, best_name = log_reg, "LogisticRegression"
    best_cv_auc, best_test_auc = cv_auc_log, auc_log

print(f"\nBest model selected: {best_name} (CV AUC: {best_cv_auc:.3f}, test AUC: {best_test_auc:.3f})")


## Feature importance / coefficients

In [None]:

# Extract feature names after preprocessing
preprocessor = best_model.named_steps["preprocess"]

# transformers_[0] is the ("num", ...) entry; element [2] is its column list.
num_feats = preprocessor.transformers_[0][2]
cat_feats = list(preprocessor.named_transformers_["cat"].named_steps["onehot"].get_feature_names_out(categorical_features))
all_features = list(num_feats) + cat_feats

if best_name == "LogisticRegression":
    # Coefficients are signed; rank them by magnitude.
    coefs = best_model.named_steps["clf"].coef_[0]
    imp = pd.Series(coefs, index=all_features).sort_values(key=np.abs, ascending=False)
    imp.head(20).plot(kind="barh")
    plt.title("Top |coefficients| — Logistic Regression")
    # barh draws the first row at the bottom; flip so the largest is on top.
    plt.gca().invert_yaxis()
    plt.show()
else:
    importances = best_model.named_steps["clf"].feature_importances_
    # Ascending order puts the largest bar at the top of a barh plot.
    imp = pd.Series(importances, index=all_features).sort_values(ascending=True)
    imp.tail(20).plot(kind="barh")
    plt.title("Top Feature Importances — Random Forest")
    plt.show()

# Rank the table by magnitude as well: a plain descending sort would push
# strong negative coefficients to the bottom and contradict the plot above.
# For the (non-negative) forest importances this is the same as a plain sort.
imp_df = imp.sort_values(key=np.abs, ascending=False).reset_index()
imp_df.columns = ["feature","importance"]
display(imp_df.head(25))


## Save trained model

In [None]:

# Persist the selected pipeline under models/, named after the chosen model.
models_dir = Path("models")
models_dir.mkdir(parents=True, exist_ok=True)

model_path = models_dir.joinpath(f"titanic_{best_name.lower()}.joblib")
joblib.dump(best_model, model_path)
print("Saved model to:", model_path.resolve())


## Inference example

In [None]:

# Two example passengers, given column-wise (one list entry per passenger).
example = pd.DataFrame({
    "Pclass":   [3, 1],
    "Sex":      ["male", "female"],
    "Age":      [22.0, 38.0],
    "SibSp":    [1, 1],
    "Parch":    [0, 0],
    "Fare":     [7.25, 71.28],
    "Embarked": ["S", "C"],
})

# Reload from disk to show that the saved artifact works on its own.
loaded = joblib.load(model_path)
pred_proba = loaded.predict_proba(example)[:, 1]
pred = loaded.predict(example)

# Attach the predictions to a copy so `example` itself stays unchanged.
out = example.assign(Survival_Prob=pred_proba, Survival_Pred=pred)
display(out)


## Reproducibility

In [None]:

import sys
import sklearn

# Record interpreter and library versions alongside the results.
print(dict(
    python=sys.version,
    numpy=np.__version__,
    pandas=pd.__version__,
    scikit_learn=sklearn.__version__,
))
