# Titanic — Model Training Notebook

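This notebook loads the Titanic dataset from `../data/dataset.csv`, performs basic EDA, and compares three classifiers (logistic regression, random forest, and an RBF-kernel SVM) using 5-fold cross-validated accuracy. It then fits the best-scoring model on an 80% training split, evaluates it on the remaining 20% hold-out set, and saves the fitted model to `../model.pkl` and its metrics to `../artifacts/metrics.json`.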


In [None]:
import os, json, joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# --- Configuration ---------------------------------------------------------
# Location of the Titanic CSV, relative to the notebook's working directory.
DATA_PATH = '../data/dataset.csv'
# Column holding the prediction target.
TARGET_COL = 'Survived'
# Feature columns, grouped by the kind of preprocessing they receive later.
CAT_COLS = ['Sex', 'Pclass', 'Embarked']
NUM_COLS = ['Age', 'SibSp', 'Parch', 'Fare']

# Fail fast with an actionable message. An explicit raise is used rather than
# `assert` because assertions are removed when Python runs with -O.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f'Missing data at {DATA_PATH}. Put Titanic CSV there.')

df = pd.read_csv(DATA_PATH)
# Last expression of the cell: renders the first rows as a preview.
df.head()

In [None]:
# Basic EDA
# Only the final expression of a notebook cell is rendered automatically, so
# the summary table is printed explicitly; as a bare mid-cell expression its
# result would be computed and then silently discarded.
print(df.shape)
df.info()
print(df.describe(include='all'))

# Survival counts broken down by each categorical feature of interest.
for column in ('Sex', 'Pclass'):
    sns.countplot(data=df, x=column, hue='Survived')
    plt.title(f'Survival by {column}')
    plt.show()

# Age histogram (30 bins) with a KDE overlay, split by survival outcome.
sns.histplot(data=df, x='Age', hue='Survived', kde=True, bins=30)
plt.title('Age Distribution by Survival')
plt.show()

In [None]:
# Preprocess + models
# Imported here so the cell is self-contained; `clone` builds a new unfitted
# estimator with the same parameters as the one it is given.
from sklearn.base import clone

# Numeric features: fill missing values with the column median, then scale.
numeric_tf = Pipeline([
    ('impute', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])
# Categorical features: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown='ignore' avoids errors on categories
# that were not present when the encoder was fitted).
categorical_tf = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Template preprocessor routing each column group to its transformer.
pre = ColumnTransformer([
    ('num', numeric_tf, NUM_COLS),
    ('cat', categorical_tf, CAT_COLS),
])

# Candidate classifiers, keyed by the name reported in the score table.
classifiers = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=300, random_state=42),
    'SVM': SVC(probability=True, kernel='rbf', C=1.0, gamma='scale', random_state=42),
}

# Each pipeline gets its OWN clone of the preprocessor. Sharing the single
# `pre` instance would mean that fitting one pipeline also (re)fits the
# preprocessing step of every other pipeline, since they would all hold the
# same object.
models = {
    name: Pipeline([('pre', clone(pre)), ('clf', clf)])
    for name, clf in classifiers.items()
}

X = df[CAT_COLS + NUM_COLS]
y = df[TARGET_COL].astype(int)

# Mean 5-fold cross-validated accuracy per candidate, as plain floats so the
# dict can be serialized to JSON later.
scores = {
    name: float(np.mean(cross_val_score(pipe, X, y, cv=5, scoring='accuracy')))
    for name, pipe in models.items()
}
scores

In [None]:
# Pick the candidate with the highest mean cross-validation accuracy.
# NOTE(review): `scores` was computed with cross-validation over all of X/y,
# which includes the rows that end up in the hold-out split below, so the
# hold-out accuracy is not fully independent of model selection — confirm
# whether that is acceptable for how this number is reported.
best_name = max(scores.items(), key=lambda item: item[1])[0]
best_model = models[best_name]

# Stratified 80/20 split with a fixed seed so the evaluation is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit on the training portion only, then predict the held-out rows.
y_pred = best_model.fit(X_train, y_train).predict(X_test)

# Hold-out metrics; `acc` and `cm` are reused when the metrics file is written.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)
(acc, cm, report)

In [None]:
# Save the fitted model to the project root and the evaluation metrics to
# ../artifacts/metrics.json.
MODEL_PATH = '../model.pkl'
METRICS_PATH = '../artifacts/metrics.json'

joblib.dump(best_model, MODEL_PATH)

# open(..., 'w') does not create missing parent directories, so make sure the
# artifacts folder exists before writing; exist_ok avoids failing on re-runs.
os.makedirs(os.path.dirname(METRICS_PATH), exist_ok=True)

# Everything here is converted to JSON-serializable built-ins: `acc` to a
# float and the confusion-matrix array to nested lists.
metrics = {
    'cv_scores': scores,
    'best_model': best_name,
    'holdout_accuracy': float(acc),
    'confusion_matrix': cm.tolist(),
}
with open(METRICS_PATH, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)
print(f'Saved model to {MODEL_PATH}')