In [8]:
# =========================================================
# Titanic — CatBoost (ordered target statistics) > 0.80
# =========================================================
import numpy as np, pandas as pd
from pathlib import Path
from warnings import filterwarnings
filterwarnings("ignore")

# 0) Install catboost on the fly when it is not importable
try:
    from catboost import CatBoostClassifier, Pool
except ImportError:
    # Only a missing package can be repaired by installing it; any other
    # failure should surface instead of silently triggering a pip install.
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "catboost"])
    from catboost import CatBoostClassifier, Pool

# Single seed, reused below for NumPy, the train/validation split and CatBoost.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# -----------------------------
# 1) Load
# -----------------------------
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
# Keep the ids aside; they are written back into the submission file.
test_ids = test.loc[:, "PassengerId"].copy()

# -----------------------------
# 2) Feature engineering (symmetric, no leakage)
# -----------------------------
def fill_age_groupwise(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose missing ``Age`` values are imputed.

    Each missing age is replaced by the median age of its
    (``Pclass``, ``Sex``) group. Groups without any known age are then
    filled with the median of the whole, already group-imputed, column.
    The input frame is not modified.
    """

    def _fill_with_own_median(ages: pd.Series) -> pd.Series:
        return ages.fillna(ages.median())

    filled = df.copy()
    ages_by_class_and_sex = filled.groupby(["Pclass", "Sex"])["Age"]
    filled["Age"] = ages_by_class_and_sex.transform(_fill_with_own_median)

    # A group made only of NaN has a NaN median, so its rows are still
    # missing here; fall back to the overall median.
    overall_median = filled["Age"].median()
    filled["Age"] = filled["Age"].fillna(overall_median)
    return filled

def surname(x: str) -> str:
    """Return the text before the first comma of ``x``, stripped of whitespace.

    ``x`` is converted with ``str`` first; when there is no comma the whole
    (stripped) string is returned.
    """
    family_name, _, _ = str(x).partition(",")
    return family_name.strip()

def ticket_prefix(x: str) -> str:
    """Return the non-numeric prefix of a ticket, or ``"NONE"`` if there is none.

    Digits, dots, slashes and spaces are dropped from ``str(x)``; whatever
    remains (stripped of surrounding whitespace) is the prefix.
    """
    kept = [ch for ch in str(x) if not ch.isdigit() and ch not in "./ "]
    prefix = "".join(kept).strip()
    return prefix or "NONE"

def deck_letter(x: str) -> str:
    """Return the deck letter of a cabin value, or ``"U"`` when unknown.

    The deck is the first character of the first whitespace-separated token
    of ``str(x)``. Missing values (``None``/NaN), empty strings and strings
    made only of whitespace all map to ``"U"``.
    """
    if pd.isna(x):
        return "U"
    tokens = str(x).split()
    # split() yields no token for "" and for whitespace-only input, so the
    # indexing below can no longer raise IndexError.
    return tokens[0][0] if tokens else "U"

def make_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` enriched with the engineered Titanic features.

    Reads the columns ``Name``, ``Sex``, ``Age``, ``SibSp``, ``Parch``,
    ``Ticket``, ``Fare``, ``Cabin``, ``Embarked`` and ``Pclass``. Every fill
    value and count is computed from ``df`` itself, so calling this function
    separately on two frames never mixes their statistics.

    ``Age`` is not imputed here: rows with a missing age get 0 for
    ``IsChild`` and ``IsMother`` and NaN for ``AgeClass`` / ``AgeGroup``.

    Added columns: ``Surname``, ``Title``, ``FamilySize``, ``IsAlone``,
    ``IsChild``, ``IsMother``, ``TicketPrefix``, ``Deck``,
    ``TicketGroupSize``, ``FarePerPerson``, ``Fare_log``, ``FPP_log``,
    ``AgeClass`` and ``AgeGroup``. ``Embarked`` and ``Fare`` are overwritten
    with their filled versions.
    """
    out = df.copy()

    # Safe fills: most frequent port and median fare of this frame.
    out["Embarked"] = out["Embarked"].fillna(out["Embarked"].mode()[0])
    out["Fare"] = out["Fare"].fillna(out["Fare"].median())

    # Name-derived features: surname and honorific title.
    out["Surname"] = out["Name"].apply(surname)
    raw_title = out["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
    title_map = {'Mlle':'Miss','Ms':'Miss','Mme':'Mrs','Major':'Rare','Col':'Rare','Sir':'Rare',
                 'Don':'Rare','Lady':'Rare','Countess':'Rare','Jonkheer':'Rare','Dona':'Rare',
                 'Capt':'Rare','Rev':'Rare','Dr':'Rare'}
    out["Title"] = raw_title.replace(title_map)

    # Family features.
    out["FamilySize"] = out["SibSp"] + out["Parch"] + 1
    out["IsAlone"]    = (out["FamilySize"] == 1).astype(int)
    out["IsChild"]    = (out["Age"] < 16).astype(int)
    # "Lady" and "Countess" are rewritten to "Rare" by title_map, so they must
    # be matched on the raw title; testing the mapped column for them would
    # never succeed.
    married_title = out["Title"].isin(["Mrs"]) | raw_title.isin(["Lady", "Countess"])
    out["IsMother"]   = ((out["Sex"] == "female") & (out["Parch"] > 0) &
                         married_title & (out["Age"] >= 18)).astype(int)

    # Ticket / cabin features; the group size counts identical tickets
    # within this frame only.
    out["TicketPrefix"]    = out["Ticket"].apply(ticket_prefix)
    out["Deck"]            = out["Cabin"].apply(deck_letter)
    out["TicketGroupSize"] = out["Ticket"].map(out["Ticket"].value_counts()).astype(int)

    # Transforms and interactions.
    out["FarePerPerson"] = out["Fare"] / out["FamilySize"]
    out["Fare_log"] = np.log1p(out["Fare"])
    out["FPP_log"]  = np.log1p(out["FarePerPerson"])
    out["AgeClass"] = out["Age"] * out["Pclass"]

    # Fixed bins, used only as a categorical age indicator.
    out["AgeGroup"] = pd.cut(out["Age"], [0,12,18,35,60,100],
                             labels=["Child","Teen","Adult","Middle","Senior"], include_lowest=True)
    return out

# Impute ages, then derive the features; each frame is processed on its own
train = make_features(fill_age_groupwise(train))
test  = make_features(fill_age_groupwise(test))

# Harmonise rare titles (rarity is decided from TRAIN only): a title seen
# fewer than 10 times in train is replaced by the default title for the sex.
title_counts = train["Title"].value_counts()
rare = set(title_counts.loc[title_counts < 10].index)
sex_to_title = {"male":"Mr","female":"Mrs"}
for frame in (train, test):
    is_rare = frame["Title"].isin(rare)
    frame.loc[is_rare, "Title"] = frame.loc[is_rare, "Sex"].map(sex_to_title)

# -----------------------------
# 3) Columns fed to CatBoost
# -----------------------------
TARGET = "Survived"

# Numeric inputs.
NUMERIC_FEATURES = [
    "Pclass","Age","SibSp","Parch","Fare","FarePerPerson","Fare_log","FPP_log",
    "FamilySize","IsAlone","IsChild","IsMother","TicketGroupSize","AgeClass",
]
# Categorical inputs (handled natively by CatBoost).
CATEGORICAL_FEATURES = [
    "Sex","Embarked","AgeGroup","Title","Ticket","TicketPrefix","Surname","Deck",
]
FEATURES = NUMERIC_FEATURES + CATEGORICAL_FEATURES

X = train[FEATURES].copy()
y = train[TARGET].astype(int).copy()
X_test = test[FEATURES].copy()

# Positional indices of the categorical columns, as passed to Pool(cat_features=...)
cat_cols = [X.columns.get_loc(col_name) for col_name in CATEGORICAL_FEATURES]

# -----------------------------
# 4) Split + Pool + training with early stopping
# -----------------------------
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out; the validation part drives early stopping.
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

train_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
val_pool   = Pool(X_val, y_val, cat_features=cat_cols)

search_params = {
    "random_seed": RANDOM_STATE,
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "iterations": 5000,
    "depth": 6,
    "learning_rate": 0.03,
    "l2_leaf_reg": 3.0,
    "border_count": 254,
    "bagging_temperature": 0.5,
    "random_strength": 1.5,
    "rsm": 0.8,            # fraction of features sampled at each split selection
    "od_type": "Iter",
    "od_wait": 200,        # early-stopping patience, in iterations
    "verbose": False,
}
model = CatBoostClassifier(**search_params)
model.fit(train_pool, eval_set=val_pool, use_best_model=True)

# -----------------------------
# 5) Decision threshold tuned for ACCURACY (Kaggle)
# -----------------------------
val_pred_proba = model.predict_proba(val_pool)[:,1]
y_val_true = y_val.values
candidate_thresholds = np.linspace(0.35, 0.70, 71)
candidate_accuracies = [
    ((val_pred_proba >= thr).astype(int) == y_val_true).mean()
    for thr in candidate_thresholds
]

# argmax returns the first maximum, i.e. the lowest threshold reaching the
# best accuracy; the defaults are kept unless that accuracy is above 0.
top = int(np.argmax(candidate_accuracies))
best_thr, best_acc = 0.5, 0.0
if candidate_accuracies[top] > best_acc:
    best_acc, best_thr = candidate_accuracies[top], candidate_thresholds[top]
print(f"Validation Accuracy max={best_acc:.4f} at threshold={best_thr:.2f} | Best iters={model.tree_count_}")

# -----------------------------
# 6) Refit on the WHOLE train set with the selected number of iterations
# -----------------------------
full_pool = Pool(X, y, cat_features=cat_cols)

final_params = {
    "random_seed": RANDOM_STATE,
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "iterations": model.tree_count_,  # number of trees kept by the validated model
    "depth": 6,
    "learning_rate": 0.03,
    "l2_leaf_reg": 3.0,
    "border_count": 254,
    "bagging_temperature": 0.5,
    "random_strength": 1.5,
    "rsm": 0.8,
    "verbose": False,
}
final_model = CatBoostClassifier(**final_params)
final_model.fit(full_pool)

# -----------------------------
# 7) Test predictions + submission file
# -----------------------------
test_pool = Pool(X_test, cat_features=cat_cols)
proba_test = final_model.predict_proba(test_pool)[:,1]
# Apply the threshold selected on the validation split.
is_survivor = proba_test >= best_thr
pred_test = is_survivor.astype(int)

submission = pd.DataFrame({"PassengerId": test_ids, "Survived": pred_test})
# Sanity checks on the frame before writing it: exact columns, binary labels.
assert set(submission.columns)=={"PassengerId","Survived"}
assert submission["Survived"].isin([0,1]).all()
submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")


Validation Accuracy max=0.8268 at threshold=0.50 | Best iters=83
Saved submission.csv
