# Titanic Survival Prediction

## 1. Import Libraries and Load Data

First, let's import the necessary libraries and load our training and testing datasets.

In [1]:
# 1) IMPORTS & LOAD
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier

RANDOM_STATE = 42
N_SPLITS = 5  # use 3 to iterate quickly, 5 for the final run

# train.csv and test.csv are both read from the local "data" directory.
DATA_DIR = Path("data")
train_df, test_df = (pd.read_csv(DATA_DIR / csv_name) for csv_name in ("train.csv", "test.csv"))
print("Train/Test:", train_df.shape, test_df.shape)
# Fail fast when the row counts are not the expected 891 (train) / 418 (test).
assert (len(train_df), len(test_df)) == (891, 418), "Fichiers Titanic inattendus."


Train/Test: (891, 12) (418, 11)




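## 2. Helper Functions (Feature Engineering & Target Encoding)

Before building the model inputs, we define the helpers used in the next section: title and ticket parsing, a feature-engineering function, and a K-fold target encoder that encodes each training row from the other folds only, to avoid target leakage.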

In [2]:
# 2) HELPERS: features avancées + target encoding anti-fuite

def extract_title(name: pd.Series) -> pd.Series:
    """Reduce a name column to one of 'Mr', 'Mrs', 'Miss', 'Master' or 'Rare'.

    The title is the text between the first comma and the following period
    (e.g. ``"Braund, Mr. Owen"`` -> ``"Mr"``). French/alternative spellings are
    folded into their English equivalent, and every other title, including
    names where no title can be extracted, becomes ``'Rare'``.

    Args:
        name: Series of name strings.

    Returns:
        Series of the same index holding the normalised title.
    """
    kept_titles = ['Mr', 'Mrs', 'Miss', 'Master']
    synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    rare_titles = ('Lady', 'Countess', 'Sir', 'Jonkheer', 'Don', 'Dona',
                   'Capt', 'Col', 'Dr', 'Major', 'Rev')
    title_map = {**synonyms, **dict.fromkeys(rare_titles, 'Rare')}

    raw = name.str.extract(r',\s*([^.]+)\.', expand=False)
    normalised = raw.fillna('Unknown').replace(title_map)
    # Anything outside the four common titles collapses into a single bucket.
    is_common = normalised.isin(kept_titles)
    return normalised.where(is_common, 'Rare')

def split_ticket(t: str):
    """Split a ticket into ``(prefix, number)`` at its last space.

    The value is converted to ``str`` and stripped first. A ticket without any
    space has no prefix and yields ``("NONE", ticket)``.
    """
    ticket = str(t).strip()
    prefix, space, number = ticket.rpartition(" ")
    if not space:
        return "NONE", ticket
    return prefix, number

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with engineered passenger features.

    Reads the columns ``Name``, ``SibSp``, ``Parch``, ``Ticket``, ``Cabin``,
    ``Embarked``, ``Fare``, ``Age``, ``Pclass`` and ``Sex``; ``df`` itself is
    not modified.

    Added columns: ``Surname``, ``FamilySize``, ``FamilyID``, ``Title``,
    ``Ticket_prefix``, ``Ticket_number``, ``Ticket_clean``,
    ``TicketGroupSize``, ``HasCabin``, ``CabinDeck``, ``IsAlone``,
    ``FarePerPerson``, ``AgeBin``, ``FareBin``, ``Sex_is_male``, ``IsChild``,
    ``SexxPclass``, ``SexxAge`` and ``FarePPxP``. ``Embarked``, ``Fare`` and
    ``Age`` are imputed in place.

    NOTE: every statistic (medians, mode, ticket counts, quantile bin edges)
    is computed from the rows of ``df`` only, so the bin codes produced by two
    separate calls are not guaranteed to describe the same value ranges.

    Args:
        df: Raw passenger table.

    Returns:
        A new DataFrame with the original and the engineered columns.
    """
    out = df.copy()

    # Family grouping keys
    out['Surname']    = out['Name'].str.split(',').str[0].str.strip()
    out['FamilySize'] = out['SibSp'].fillna(0) + out['Parch'].fillna(0) + 1
    out['FamilyID']   = (out['Surname'] + "_" + out['FamilySize'].astype(int).astype(str)).astype(str)

    # Title
    out['Title'] = extract_title(out['Name'])

    # Ticket prefix/number and size of the group sharing the same ticket
    tp, tn = zip(*out['Ticket'].apply(split_ticket))
    out['Ticket_prefix'] = list(tp)
    out['Ticket_number'] = list(tn)
    out['Ticket_clean']  = out['Ticket'].astype(str).str.replace(r'[^A-Za-z0-9]', '', regex=True)
    counts = out['Ticket_clean'].value_counts()
    out['TicketGroupSize'] = out['Ticket_clean'].map(counts).fillna(1).clip(upper=6).astype(int)

    # Cabins: a missing cabin (or an unrecognised first letter) maps to deck 'U'
    out['HasCabin']  = (~out['Cabin'].isna()).astype(int)
    out['CabinDeck'] = out['Cabin'].astype(str).str[0]
    out['CabinDeck'] = out['CabinDeck'].where(out['CabinDeck'].isin(list('ABCDEFGT')), 'U')

    # Embarked: fill gaps with the most frequent value
    if out['Embarked'].isna().any():
        out['Embarked'] = out['Embarked'].fillna(out['Embarked'].mode().iloc[0])

    # Fare & ratios: a fare of 0 is treated as missing, then imputed per class.
    out['Fare'] = out['Fare'].replace(0, np.nan)
    fare_class_median = out.groupby('Pclass')['Fare'].transform('median')
    # Fall back to the overall median when a whole class has no usable fare,
    # otherwise the NaN would leak into FareBin and the ratio features.
    out['Fare'] = out['Fare'].fillna(fare_class_median).fillna(out['Fare'].median())
    out['IsAlone'] = (out['FamilySize']==1).astype(int)
    out['FarePerPerson'] = (out['Fare'] / out['FamilySize']).replace([np.inf,-np.inf], np.nan)
    out['FarePerPerson'] = out['FarePerPerson'].fillna(out['Fare'].median())

    # Age imputation by (Title, Pclass, Sex) median. A group without a single
    # observed age has a NaN median, so fall back to the overall observed
    # median instead of leaving NaN in Age / AgeBin / SexxAge.
    age_overall_median = out['Age'].median()
    age_group_median = out.groupby(['Title','Pclass','Sex'])['Age'].transform('median')
    out['Age'] = out['Age'].fillna(age_group_median).fillna(age_overall_median)

    # Quantile bins (up to 6; duplicate edges are dropped)
    out['AgeBin']  = pd.qcut(out['Age'].clip(0,80), 6, duplicates='drop').cat.codes
    out['FareBin'] = pd.qcut(out['Fare'].clip(0,250), 6, duplicates='drop').cat.codes

    # Interactions
    out['Sex_is_male'] = (out['Sex']=='male').astype(int)
    out['IsChild']     = (out['Age'] < 14).astype(int)
    out['SexxPclass']  = out['Sex_is_male'] * out['Pclass']
    out['SexxAge']     = out['Sex_is_male'] * out['Age']
    out['FarePPxP']    = out['FarePerPerson'] * out['Pclass']

    return out

def kfold_target_encode(train, test, col, target='Survived', n_splits=5, smoothing=20, seed=42):
    """Target-encode ``col`` with smoothed category means, out-of-fold on ``train``.

    Each training row is encoded from statistics computed on the other folds
    only, so a row's own target never contributes to its encoding. Test rows
    are encoded from statistics computed on the whole of ``train``.

    The encoding of a category is
    ``(mean * count + global_mean * smoothing) / (count + smoothing)``;
    categories absent from the fitting rows receive ``global_mean``.

    Args:
        train: DataFrame containing ``col`` and ``target``.
        test: DataFrame containing ``col``.
        col: Name of the column to encode.
        target: Name of the target column in ``train``.
        n_splits: Number of stratified folds.
        smoothing: Weight given to the global mean in the formula above.
        seed: ``random_state`` of the shuffled ``StratifiedKFold``.

    Returns:
        ``(train_encoded, test_encoded)`` as float NumPy arrays, in the row
        order of ``train`` and ``test`` respectively.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    global_mean = train[target].mean()

    def smoothed_encoding(frame):
        # Per-category mean shrunk towards the global mean.
        stats = frame.groupby(col)[target].agg(['mean', 'count'])
        return (stats['mean'] * stats['count'] + global_mean * smoothing) / (stats['count'] + smoothing)

    oof = pd.Series(index=train.index, dtype=float)
    for tr_idx, va_idx in skf.split(train, train[target]):
        tr, va = train.iloc[tr_idx], train.iloc[va_idx]
        oof.iloc[va_idx] = va[col].map(smoothed_encoding(tr)).fillna(global_mean)

    # The test encoding must use the counts of the FULL training set in both
    # numerator and denominator (not the counts left over from the last fold).
    test_enc = test[col].map(smoothed_encoding(train)).fillna(global_mean)

    return oof.values.astype(float), test_enc.values.astype(float)


## 3. Data Cleaning & Feature Engineering

Here we build the model inputs: we impute missing values and engineer new features, add K-fold target encodings for the high-cardinality keys (family, surname, ticket), and one-hot encode the remaining categorical columns.

In [8]:
# 3) BUILD MATRICES ++ (features métier + TE K-Fold étendu + one-hot propre)

# --- features enrichies
def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with engineered passenger features.

    Reads the columns ``Name``, ``SibSp``, ``Parch``, ``Ticket``, ``Cabin``,
    ``Embarked``, ``Fare``, ``Age``, ``Pclass`` and ``Sex``; ``df`` itself is
    not modified.

    Added columns: ``Surname``, ``FamilySize``, ``FamilyID``, ``Title``,
    ``Ticket_prefix``, ``Ticket_number``, ``Ticket_clean``,
    ``TicketGroupSize``, ``HasCabin``, ``CabinDeck``, ``CabinCount``,
    ``IsAlone``, ``FarePerPerson``, ``AgeBin``, ``FareBin``, ``Sex_is_male``,
    ``IsChild``, ``IsMother``, ``SexxPclass``, ``SexxAge``, ``FarePPxP``,
    ``NameLen``, ``NameWords``, ``TicketNumLen`` and ``GroupKey``.
    ``Embarked``, ``Fare`` and ``Age`` are imputed in place.

    NOTE: every statistic (medians, mode, ticket counts, quantile bin edges)
    is computed from the rows of ``df`` only, so the bin codes produced by two
    separate calls are not guaranteed to describe the same value ranges.

    Args:
        df: Raw passenger table.

    Returns:
        A new DataFrame with the original and the engineered columns.
    """
    out = df.copy()

    # Family grouping keys
    out['Surname']    = out['Name'].str.split(',').str[0].str.strip()
    out['FamilySize'] = out['SibSp'].fillna(0) + out['Parch'].fillna(0) + 1
    out['FamilyID']   = (out['Surname'] + "_" + out['FamilySize'].astype(int).astype(str)).astype(str)

    # Title
    def extract_title(name: pd.Series) -> pd.Series:
        # Text between the first comma and the next period, folded into
        # Mr / Mrs / Miss / Master, with everything else bucketed as 'Rare'.
        t = name.str.extract(r',\s*([^.]+)\.', expand=False).fillna('Unknown')
        map_title = {'Mlle':'Miss','Ms':'Miss','Mme':'Mrs','Lady':'Rare','Countess':'Rare','Sir':'Rare',
                     'Jonkheer':'Rare','Don':'Rare','Dona':'Rare','Capt':'Rare','Col':'Rare','Dr':'Rare',
                     'Major':'Rare','Rev':'Rare'}
        t = t.replace(map_title)
        return t.where(t.isin(['Mr','Mrs','Miss','Master']), 'Rare')
    out['Title'] = extract_title(out['Name'])

    # Ticket prefix/number, cleaned ticket and size of the group sharing it
    def split_ticket(t: str):
        # Split at the last space; tickets without a space get prefix "NONE".
        t = str(t).strip()
        if " " in t:
            prefix, number = t.rsplit(" ", 1)
        else:
            prefix, number = "NONE", t
        return prefix, number
    tp, tn = zip(*out['Ticket'].apply(split_ticket))
    out['Ticket_prefix'] = list(tp)
    out['Ticket_number'] = list(tn)
    out['Ticket_clean']  = out['Ticket'].astype(str).str.replace(r'[^A-Za-z0-9]', '', regex=True)
    counts = out['Ticket_clean'].value_counts()
    out['TicketGroupSize'] = out['Ticket_clean'].map(counts).fillna(1).clip(upper=6).astype(int)

    # Cabins: a missing cabin (or an unrecognised first letter) maps to deck 'U'
    has_cabin = out['Cabin'].notna()
    out['HasCabin']  = has_cabin.astype(int)
    out['CabinDeck'] = out['Cabin'].astype(str).str[0]
    out['CabinDeck'] = out['CabinDeck'].where(out['CabinDeck'].isin(list('ABCDEFGT')), 'U')
    # Number of cabins listed (whitespace-separated). Missing cabins are
    # masked explicitly rather than detected through the string 'nan', so the
    # count does not depend on how astype(str) renders missing values.
    cabin_tokens = out['Cabin'].astype(str).str.split().str.len()
    out['CabinCount'] = cabin_tokens.where(has_cabin, 0).fillna(0).astype(int)

    # Embarked: fill gaps with the most frequent value
    if out['Embarked'].isna().any():
        out['Embarked'] = out['Embarked'].fillna(out['Embarked'].mode().iloc[0])

    # Fare & ratios: a fare of 0 is treated as missing, then imputed per class.
    out['Fare'] = out['Fare'].replace(0, np.nan)
    fare_class_median = out.groupby('Pclass')['Fare'].transform('median')
    # Fall back to the overall median when a whole class has no usable fare,
    # otherwise the NaN would leak into FareBin and the ratio features.
    out['Fare'] = out['Fare'].fillna(fare_class_median).fillna(out['Fare'].median())
    out['IsAlone'] = (out['FamilySize']==1).astype(int)
    out['FarePerPerson'] = (out['Fare'] / out['FamilySize']).replace([np.inf,-np.inf], np.nan)
    out['FarePerPerson'] = out['FarePerPerson'].fillna(out['Fare'].median())

    # Age imputation by (Title, Pclass, Sex) median. A group without a single
    # observed age has a NaN median, so fall back to the overall observed
    # median instead of leaving NaN in Age / AgeBin / SexxAge.
    age_overall_median = out['Age'].median()
    age_group_median = out.groupby(['Title','Pclass','Sex'])['Age'].transform('median')
    out['Age'] = out['Age'].fillna(age_group_median).fillna(age_overall_median)

    # Quantile bins (up to 6; duplicate edges are dropped)
    out['AgeBin']  = pd.qcut(out['Age'].clip(0,80), 6, duplicates='drop').cat.codes
    out['FareBin'] = pd.qcut(out['Fare'].clip(0,250), 6, duplicates='drop').cat.codes

    # Interactions and domain features
    out['Sex_is_male'] = (out['Sex']=='male').astype(int)
    out['IsChild']     = (out['Age'] < 14).astype(int)
    out['IsMother']    = ((out['Sex']=='female') & (out['Parch']>0) & (out['Age']>=18) & (out['Title']!='Miss')).astype(int)
    out['SexxPclass']  = out['Sex_is_male'] * out['Pclass']
    out['SexxAge']     = out['Sex_is_male'] * out['Age']
    out['FarePPxP']    = out['FarePerPerson'] * out['Pclass']

    # Length features of the name and of the ticket number
    out['NameLen']     = out['Name'].astype(str).str.len()
    out['NameWords']   = out['Name'].astype(str).str.split().str.len()
    out['TicketNumLen']= out['Ticket_number'].astype(str).str.len()

    # Composite key surname / class / port, intended for target encoding
    out['GroupKey'] = out['Surname'].astype(str) + "_" + out['Pclass'].astype(str) + "_" + out['Embarked'].astype(str)

    return out

# Engineer features for each split on its own: every call only sees its own rows.
train_f, test_f = (make_features(split) for split in (train_df, test_df))

# --- Target encoding (anti-fuite) sur cardinalités utiles
def kfold_target_encode(train, test, col, target='Survived', n_splits=5, smoothing=20, seed=42):
    """Target-encode ``col`` with smoothed category means, out-of-fold on ``train``.

    Training rows are encoded fold by fold from the statistics of the other
    folds; test rows are encoded from the statistics of the whole of
    ``train``. A category's encoding is
    ``(mean * count + global_mean * smoothing) / (count + smoothing)``, and a
    category absent from the fitting rows receives ``global_mean``.

    Args:
        train: DataFrame containing ``col`` and ``target``.
        test: DataFrame containing ``col``.
        col: Name of the column to encode.
        target: Name of the target column in ``train``.
        n_splits: Number of stratified folds.
        smoothing: Weight given to the global mean in the formula above.
        seed: ``random_state`` of the shuffled ``StratifiedKFold``.

    Returns:
        ``(train_encoded, test_encoded)`` as float NumPy arrays.
    """
    prior = train[target].mean()

    def smoothed_means(frame):
        # Per-category target mean shrunk towards the prior.
        agg = frame.groupby(col)[target].agg(['mean', 'count'])
        return (agg['mean'] * agg['count'] + prior * smoothing) / (agg['count'] + smoothing)

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    encoded_train = pd.Series(index=train.index, dtype=float)
    for fit_rows, held_rows in folds.split(train, train[target]):
        fold_encoding = smoothed_means(train.iloc[fit_rows])
        held_keys = train.iloc[held_rows][col]
        encoded_train.iloc[held_rows] = held_keys.map(fold_encoding).fillna(prior)

    encoded_test = test[col].map(smoothed_means(train)).fillna(prior)
    return encoded_train.values.astype(float), encoded_test.values.astype(float)

te_cols = ['FamilyID', 'Surname', 'Ticket_clean', 'GroupKey', 'Ticket_prefix']  # widened key set
for c in te_cols:
    train_f[f'TE_{c}'], test_f[f'TE_{c}'] = kfold_target_encode(
        train_f, test_f, c, target='Survived',
        n_splits=N_SPLITS, smoothing=20, seed=RANDOM_STATE,
    )

# --- Drop the raw text / identifier columns (their target encodings are kept)
drop_cols = [
    'Name', 'Ticket', 'Cabin', 'Ticket_clean', 'Surname', 'FamilyID',
    'Ticket_prefix', 'Ticket_number', 'GroupKey',
]
base_cols_train = [name for name in train_f.columns if name not in drop_cols]
base_cols_test  = [name for name in test_f.columns if name not in drop_cols]
# Only columns present in both frames are used (this excludes 'Survived').
common_cols = sorted(set(base_cols_train) & set(base_cols_test))

# --- One-hot encoding (CabinDeck included)
categoricals = [name for name in ('Sex', 'Embarked', 'Title', 'Pclass', 'CabinDeck')
                if name in common_cols]

# Train and test are stacked so both receive the same dummy columns;
# '_is_train' marks the origin of each row for the split below.
train_part = train_f[[*common_cols, 'Survived']].assign(_is_train=1)
test_part  = test_f[common_cols].assign(_is_train=0)

full = pd.concat([train_part, test_part], ignore_index=True)
X_full = pd.get_dummies(full, columns=categoricals, drop_first=True)

# --- Split back into the final matrices
train_m = X_full.loc[X_full['_is_train'].eq(1)].drop(columns='_is_train')
test_m  = X_full.loc[X_full['_is_train'].eq(0)].drop(columns=['_is_train', 'Survived'])

y        = train_m['Survived'].to_numpy().astype(int)
X_df     = train_m.drop(columns=['Survived', 'PassengerId']).astype(float)
X_testdf = test_m.drop(columns=['PassengerId']).astype(float)
pid_test = test_m['PassengerId'].to_numpy().astype(int)

print("X:", X_df.shape, " | X_test:", X_testdf.shape, " | y:", y.shape)
print("NA ->", int(X_df.isna().sum().sum()), "/", int(X_testdf.isna().sum().sum()))


X: (891, 43)  | X_test: (418, 43)  | y: (891,)
NA -> 0 / 0


## 4. Model Training and Evaluation

We now score five classifiers with out-of-fold predictions on the processed training data, then search for the blend weights and decision threshold that maximise out-of-fold accuracy.

In [9]:
# 4) OOF + ENSEMBLE (HGB deep + HGB régularisé + ExtraTrees + GB + LogReg) + GRID POIDS + SEUIL

import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# One seeded, shuffled splitter shared by all models below.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

# Candidate estimators. The insertion order matters: it fixes the column order
# of the probability matrix that the blend weights are applied to.
models = dict(
    # Histogram gradient boosting without a depth limit.
    hgb_deep=HistGradientBoostingClassifier(
        max_iter=1200, learning_rate=0.055, max_depth=None,
        l2_regularization=0.0, random_state=RANDOM_STATE,
    ),
    # Depth-limited variant (max_depth=4) with more, smaller steps.
    hgb_reg=HistGradientBoostingClassifier(
        max_iter=1600, learning_rate=0.045, max_depth=4,
        l2_regularization=0.0, random_state=RANDOM_STATE + 1,
    ),
    et=ExtraTreesClassifier(
        n_estimators=1200, min_samples_leaf=2, max_features=0.5,
        n_jobs=-1, random_state=RANDOM_STATE + 2,
    ),
    gb=GradientBoostingClassifier(
        n_estimators=500, max_depth=3, learning_rate=0.06,
        subsample=0.9, random_state=RANDOM_STATE + 3,
    ),
    # Logistic regression on features scaled to unit variance (no centring).
    lr=Pipeline(steps=[
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', LogisticRegression(max_iter=4000)),
    ]),
)

def oof_proba(model, X, y, cv):
    """Return out-of-fold probabilities of the second class column for ``model``.

    ``cross_val_predict`` is run with ``method='predict_proba'`` on the given
    splitter, and column 1 of the resulting probability matrix is returned.
    """
    fold_probabilities = cross_val_predict(model, X, y, cv=cv, method='predict_proba')
    return fold_probabilities[:, 1]

# Out-of-fold probabilities and accuracy at the default 0.5 threshold, per model.
oof = {}
for name, mdl in models.items():
    p = oof_proba(mdl, X_df, y, cv)
    acc = accuracy_score(y, (p>=0.5).astype(int))
    oof[name] = p
    print(f"{name:>8s}  OOF acc@0.5: {acc:.4f}")

# Blending: exhaustive grid over per-model weights plus a decision threshold.
# NOTE: weights and threshold are tuned on the same out-of-fold predictions
# they are scored on, so the reported best accuracy is an optimistic estimate.
names = list(oof.keys())
P = np.column_stack([oof[n] for n in names])

best = {'acc':-1, 'w':None, 'th':0.5}
weight_grid = [0.15, 0.25, 0.35, 0.5, 0.75]
thresholds = np.linspace(0.35, 0.65, 61)
y_col = np.asarray(y)[:, None]
from itertools import product
for w in product(weight_grid, repeat=P.shape[1]):
    w = np.array(w, float)
    if w.sum()==0:
        # Only reachable if a zero is added to weight_grid.
        continue
    w = w / w.sum()
    blend = (P * w).sum(axis=1)
    # Accuracy at every threshold in one shot (rows: samples, columns:
    # thresholds) instead of one accuracy_score call per threshold.
    accs = ((blend[:, None] >= thresholds).astype(int) == y_col).mean(axis=0)
    # argmax returns the first maximum, i.e. the lowest threshold among ties,
    # which matches scanning the thresholds in ascending order with a strict '>'.
    k = int(np.argmax(accs))
    if accs[k] > best['acc']:
        best = {'acc':float(accs[k]), 'w':w.copy(), 'th':float(thresholds[k])}

# Cast the weights to plain floats so they print as 0.122 rather than np.float64(0.122).
print("\nBEST OOF acc:", round(best['acc'],4),
      "| th:", round(best['th'],3),
      "| weights:", {n:round(float(wt),3) for n,wt in zip(names, best['w'])})


hgb_deep  OOF acc@0.5: 0.8272
 hgb_reg  OOF acc@0.5: 0.8238
      et  OOF acc@0.5: 0.8373
      gb  OOF acc@0.5: 0.8451
      lr  OOF acc@0.5: 0.8373

BEST OOF acc: 0.8586 | th: 0.565 | weights: {'hgb_deep': np.float64(0.122), 'hgb_reg': np.float64(0.073), 'et': np.float64(0.366), 'gb': np.float64(0.366), 'lr': np.float64(0.073)}


## 5. Create Submission File

Finally, we refit every model on the full training set, blend their test-set probabilities with the weights and threshold selected above, and write the predictions to a submission file in the required format.

In [10]:
# 5) FIT FINAL + SUBMISSION CSV

from sklearn.base import clone
import pandas as pd

# Final fit: a fresh clone of every model is trained on the full X_df, y.
fitted = {}
for n, m in models.items():
    mdl = clone(m).fit(X_df, y)
    fitted[n] = mdl

# Test probabilities (columns in the order of `names`, matching best['w']),
# then weighted blend and the tuned decision threshold.
probas_test = np.column_stack([fitted[n].predict_proba(X_testdf)[:,1] for n in names])
blend_test  = (probas_test * best['w']).sum(axis=1)
preds_test  = (blend_test >= best['th']).astype(int)

submission = pd.DataFrame({
    'PassengerId': pid_test,
    'Survived': preds_test.astype(int)
}).sort_values('PassengerId')

# Sanity checks on the shape and content of the submission
assert submission.shape[0]==418
assert list(submission.columns)==['PassengerId','Survived']
assert submission['Survived'].isin([0,1]).all()

submission.to_csv('submission.csv', index=False)
# Cast the weights to plain floats so they print as 0.122 rather than np.float64(0.122).
print("✅ Wrote submission.csv — shape", submission.shape,
      "| th:", round(best['th'],3),
      "| weights:", {n:round(float(wt),3) for n,wt in zip(names, best['w'])})

submission.head()


✅ Wrote submission.csv — shape (418, 2) | th: 0.565 | weights: {'hgb_deep': np.float64(0.122), 'hgb_reg': np.float64(0.073), 'et': np.float64(0.366), 'gb': np.float64(0.366), 'lr': np.float64(0.073)}


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
