In [3]:
# gradient_boosting_cabin_imputer.py
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

# -------------------------
# 1) Load data
# -------------------------
df = pd.read_csv("../data/raw/train.csv")

# -------------------------
# 2) Target: Deck (first letter of Cabin)
# -------------------------
# Take the first character only where Cabin is actually present. Masking on
# the column's own missingness replaces the old "cast to str, then turn 'n'
# back into NaN" trick, which would also have erased any genuine cabin code
# that happens to start with a lowercase "n".
df["Deck"] = df["Cabin"].astype(str).str[0].where(df["Cabin"].notna())

# -------------------------
# 3) Feature engineering
# -------------------------
def _standardize_within_group(values):
    """Z-score *values* with the population std, dividing by 1.0 if the std is not positive."""
    spread = values.std(ddof=0)
    scale = spread if spread > 0 else 1.0
    return (values - values.mean()) / scale


# log1p tames the heavy right tail of Fare.
df["logFare"] = np.log1p(df["Fare"])

# Standardize the log fare inside each Pclass, so the feature expresses how
# expensive a ticket was relative to other passengers of the same class.
df["Fare_norm"] = df.groupby("Pclass")["logFare"].transform(_standardize_within_group)

# Ticket prefix: whatever remains of the ticket string once digits and dots
# are removed and surrounding whitespace is trimmed.
raw_ticket = df["Ticket"].astype(str)
ticket_no_digits = raw_ticket.str.replace(r"\d+", "", regex=True)
ticket_no_dots = ticket_no_digits.str.replace(".", "", regex=False)
ticket_prefix = ticket_no_dots.str.strip()
# Purely numeric tickets end up empty; give them an explicit category.
df["Ticket_prefix"] = ticket_prefix.replace("", "NONE")

# Ticket group size: number of rows that carry the same ticket value.
df["Ticket_group_size"] = df["Ticket"].map(df["Ticket"].value_counts())

# Build the model matrix. get_dummies expands only the non-numeric column
# (Ticket_prefix); the numeric columns pass through unchanged.
feature_columns = ["Pclass", "Fare_norm", "Ticket_prefix", "Ticket_group_size"]
X_full = pd.get_dummies(df[feature_columns], drop_first=True)

# -------------------------
# 4) Drop ultra-rare deck(s) from training (e.g., T occurs once)
# -------------------------
# Decks seen fewer than twice are excluded from the labelled set used for
# training and evaluation.
deck_counts = df["Deck"].value_counts(dropna=True)
rare_decks = deck_counts.index[(deck_counts < 2).to_numpy()]

has_deck = df["Deck"].notna()
is_rare = df["Deck"].isin(rare_decks)
mask_known = has_deck & ~is_rare

# Labelled rows: features and the deck letter to learn.
X_known = X_full.loc[mask_known]
y_known = df["Deck"].loc[mask_known]

# -------------------------
# 5) Train/validation split + model
# -------------------------
# Estimator configuration (fixed seed for reproducible trees).
gb = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=300,
)

# Hold out 20% of the labelled rows, stratified by deck so class proportions
# are preserved in both partitions.
split_parts = train_test_split(
    X_known,
    y_known,
    stratify=y_known,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

gb.fit(X_train, y_train)

# -------------------------
# 6) Evaluation
# -------------------------
# Hold-out metrics for the model fitted on the training partition.
y_pred = gb.predict(X_test)
holdout_report = classification_report(y_test, y_pred, zero_division=0)
holdout_confusion = confusion_matrix(y_test, y_pred)

print("Gradient Boosting — classification report:", holdout_report, sep="\n")
print("\nConfusion matrix:", holdout_confusion, sep="\n")

# 5-fold cross-validation over every labelled row; cross_val_score fits
# clones, so the already-fitted `gb` is left as it is.
cv_scores = cross_val_score(gb, X_known, y_known, cv=5)
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print(f"\nCV accuracy: {cv_mean:.3f} ± {cv_std:.3f}")

# -------------------------
# 7) Predict decks for missing rows
# -------------------------
mask_missing = df["Deck"].isna()
X_missing = X_full[mask_missing]

# `gb` was fitted on the 80% training partition only, which is right for the
# hold-out evaluation but wastes labelled rows when imputing. Fit a fresh
# estimator with the same hyper-parameters on every labelled row and use that
# for the final predictions; `gb` itself is left untouched.
gb_final = GradientBoostingClassifier(**gb.get_params())
gb_final.fit(X_known, y_known)

# predict() rejects an empty feature matrix, so skip it when nothing is missing.
if len(X_missing) > 0:
    predicted_decks = gb_final.predict(X_missing)
else:
    predicted_decks = np.array([], dtype=object)

# -------------------------
# 8) Build Cabin_filled column
# -------------------------
df["Cabin_filled"] = df["Cabin"]
if len(X_missing) > 0:
    # Build the placeholder cabin strings element by element so the result
    # does not depend on the dtype of the array returned by predict().
    df.loc[mask_missing, "Cabin_filled"] = [
        f"{deck}XXX" for deck in predicted_decks
    ]

# (Optional) sanity check: how many were filled?
n_filled = mask_missing.sum()
print(f"\nFilled Cabin for {n_filled} passengers using predicted Deck + 'XXX'.")

# -------------------------
# 9) Save result
# -------------------------
# Write the augmented frame (including Deck, engineered features and
# Cabin_filled) to the current working directory, without the index column.
out_path = "train_cabin_filled_gb.csv"
df.to_csv(path_or_buf=out_path, index=False)
print("Saved augmented dataset to: " + out_path)



Gradient Boosting — classification report:
              precision    recall  f1-score   support

           A       0.50      0.67      0.57         3
           B       0.83      0.56      0.67         9
           C       0.83      0.83      0.83        12
           D       0.83      0.71      0.77         7
           E       0.50      0.67      0.57         6
           F       0.75      1.00      0.86         3
           G       1.00      1.00      1.00         1

    accuracy                           0.73        41
   macro avg       0.75      0.78      0.75        41
weighted avg       0.76      0.73      0.73        41


Confusion matrix:
[[ 2  0  1  0  0  0  0]
 [ 1  5  1  1  1  0  0]
 [ 0  1 10  0  1  0  0]
 [ 0  0  0  5  2  0  0]
 [ 1  0  0  0  4  1  0]
 [ 0  0  0  0  0  3  0]
 [ 0  0  0  0  0  0  1]]

CV accuracy: 0.655 ± 0.089

Filled Cabin for 687 passengers using predicted Deck + 'XXX'.
Saved augmented dataset to: train_cabin_filled_gb.csv
