<a href="https://colab.research.google.com/github/mmtondreau/Kaggle/blob/main/Titanic/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [26]:
import os
import json
import zipfile
from google.colab import drive
import shutil
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostRegressor, Pool
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor

# Mount our Kaggle API Key

Go to https://www.kaggle.com/account

Under API, click Create New API Token

This will download a file called kaggle.json
Place the key in your Google Drive at `MyDrive/secrets/kaggle.json`.

In [5]:
# Mount Google Drive only when it is not already mounted, so the notebook
# survives "Restart & Run All" without prompting twice on a warm runtime.
if not os.path.isdir("/content/drive/MyDrive"):
    drive.mount("/content/drive")

# Use ONE credentials location for create / copy / chmod. The previous version
# created one directory, copied to a second and chmod'ed a third, so the copy
# and chmod failed on a fresh runtime.
# The Kaggle client looks for its token at ~/.kaggle/kaggle.json when that
# directory exists (see the Kaggle API docs), so create it before importing kaggle.
kaggle_dir = os.path.expanduser("~/.kaggle")
kaggle_token_path = os.path.join(kaggle_dir, "kaggle.json")

os.makedirs(kaggle_dir, exist_ok=True)
shutil.copy("/content/drive/MyDrive/secrets/kaggle.json", kaggle_token_path)
# Restrict the token to the owner; the Kaggle client warns about wider permissions.
os.chmod(kaggle_token_path, 0o600)

# Download Training and Test Data


In [15]:
# NOTE(review): this import lives here rather than in the top import cell,
# presumably because the kaggle package needs the API token to be in place
# first — confirm before moving it.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

# Show which files the competition ships, then fetch them all as one archive.
competition_files = api.competition_list_files('titanic')
display(competition_files.files)
api.competition_download_files('titanic', path='data')

# Unpack the archive next to itself ...
zip_path = 'data/titanic.zip'
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall('data')

# ... and drop the zip once its contents are extracted.
os.remove(zip_path)

[{"ref": "", "name": "gender_submission.csv", "description": "", "totalBytes": 3258, "url": "", "creationDate": "2018-04-09T05:33:22.396Z"},
 {"ref": "", "name": "test.csv", "description": "", "totalBytes": 28629, "url": "", "creationDate": "2018-04-09T05:33:22.396Z"},
 {"ref": "", "name": "train.csv", "description": "", "totalBytes": 61194, "url": "", "creationDate": "2018-04-09T05:33:22.396Z"}]

In [30]:
# Load the raw competition files, indexed by passenger id.
train_raw_df = pd.read_csv("data/train.csv", index_col="PassengerId")
# Fix: the test frame was previously read from train.csv, so every "test"
# prediction would have been made on training rows (including the target column).
test_raw_df = pd.read_csv("data/test.csv", index_col="PassengerId")

# Split the target off the training features; kept as float because the
# models below are regressors.
y_full = train_raw_df.pop("Survived").astype(float)

# After removing the target, train and test must expose the same feature columns.
assert set(train_raw_df.columns) == set(test_raw_df.columns), "train/test feature columns differ"

In [20]:
# Every text-like column of the training frame is treated as categorical.
cat_cols = train_raw_df.select_dtypes(include=["object", "string", "category"]).columns.tolist()

# String spellings of "no value" that are folded into one explicit level.
missing_aliases = {"nan": "Missing", "None": "Missing", "": "Missing"}

# Normalise each categorical column in both frames (in place):
# string dtype -> real NAs become "Missing" -> aliases collapsed -> category.
for df in (train_raw_df, test_raw_df):
    for col in cat_cols:
        as_text = df[col].astype("string")
        as_text = as_text.fillna("Missing")
        as_text = as_text.replace(missing_aliases)
        df[col] = as_text.astype("category")

# CatBoost is given the categorical columns by name.
cat_features = list(cat_cols)

In [21]:
def normalize_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every numeric column standardised.

    Each numeric column is shifted by its own mean and divided by its own
    sample standard deviation, both computed from ``df`` itself. Columns with
    zero standard deviation are divided by 1, so they become all zeros instead
    of NaN/inf. Non-numeric columns are passed through untouched.

    The input frame is NOT modified. The previous version wrote the scaled
    values back into the caller's frame, which made re-running a cell scale
    the data twice unless every caller remembered to pass a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing any mix of numeric and non-numeric columns.

    Returns
    -------
    pd.DataFrame
        New frame with the same index and columns as ``df``.
    """
    out = df.copy()
    num_cols = out.select_dtypes("number").columns
    if len(num_cols) == 0:
        # Nothing to scale; avoid assigning to an empty column selection.
        return out

    center = out[num_cols].mean()
    # Guard against division by zero for constant columns.
    scale = out[num_cols].std().replace(0, 1)
    out[num_cols] = (out[num_cols] - center) / scale
    return out


def one_hot(df: pd.DataFrame, cols):
    """One-hot encode ``cols`` of ``df`` and return the resulting frame.

    An extra indicator column is produced for missing values
    (``dummy_na=True``), and the first level of every encoded column is
    dropped (``drop_first=True``). Columns not listed in ``cols`` are kept
    as they are.
    """
    encoded = pd.get_dummies(
        data=df,
        columns=cols,
        drop_first=True,
        dummy_na=True,
    )
    return encoded


def to_lgb_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Build the float32 feature matrix used for the LightGBM branch.

    Steps, applied to a copy of ``df``:
    1. one-hot encode the notebook-level ``cat_cols``;
    2. standardise the numeric columns with ``normalize_numeric``;
    3. replace remaining NaNs in numeric columns with 0.0;
    4. cast the whole frame to ``np.float32``.
    """
    encoded = one_hot(df.copy(), cat_cols)
    scaled = normalize_numeric(encoded)

    numeric_names = scaled.select_dtypes("number").columns
    scaled[numeric_names] = scaled[numeric_names].fillna(0.0)

    return scaled.astype(np.float32)


In [22]:
# CatBoost branch: keep the categorical columns as-is, only scale the numerics.
# Each frame is scaled with its own statistics.
X_cb_train = normalize_numeric(train_raw_df.copy())
X_cb_test = normalize_numeric(test_raw_df.copy())

# LightGBM branch: fully numeric, one-hot encoded matrices.
X_lgb_train = to_lgb_matrix(train_raw_df)
X_lgb_test_raw = to_lgb_matrix(test_raw_df)

# One-hot encoding is done per frame, so the column sets can differ.
# Reindex test onto the train columns: test-only dummies are dropped and
# train-only dummies are added as all-zero columns.
X_lgb_test = (
    X_lgb_test_raw
    .reindex(columns=X_lgb_train.columns, fill_value=0.0)
    .astype(np.float32)
)

In [31]:
# Split row POSITIONS (not index labels) once, so both feature representations
# and the target are cut along exactly the same 85% / 15% boundary.
row_positions = np.arange(len(X_lgb_train))
train_idx, val_idx = train_test_split(row_positions, test_size=0.15, random_state=42)

# LightGBM matrices as plain NumPy arrays.
Xtr_lgb = X_lgb_train.iloc[train_idx].to_numpy()
Xval_lgb = X_lgb_train.iloc[val_idx].to_numpy()

# CatBoost inputs stay DataFrames so categorical columns can be named.
Xtr_cb = X_cb_train.iloc[train_idx]
Xval_cb = X_cb_train.iloc[val_idx]

# Matching target slices.
ytr = y_full.iloc[train_idx]
yval = y_full.iloc[val_idx]

In [32]:
# Wrap both splits in CatBoost Pools, naming the categorical columns.
train_pool = Pool(Xtr_cb, ytr, cat_features=cat_features)
val_pool = Pool(Xval_cb, yval, cat_features=cat_features)

# NOTE(review): the recorded run reports a validation RMSE of about 0.002 on a
# 0/1 target, which looks implausibly low. The execution counts are out of
# order, so presumably the target column was still among the features when the
# matrices were built — confirm with Restart & Run All.
cb_params = dict(
    iterations=5000,
    learning_rate=0.03,
    depth=8,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42,
    l2_leaf_reg=3,
    bagging_temperature=1.0,
    od_type="Iter",   # overfitting detector based on iteration count
    od_wait=200,      # stop after 200 iterations without improvement
    verbose=300,      # log every 300 iterations
)
model_cb = CatBoostRegressor(**cb_params)

# use_best_model shrinks the model back to its best validation iteration.
model_cb.fit(train_pool, eval_set=val_pool, use_best_model=True)

best_rmse = model_cb.best_score_['validation']['RMSE']
print("CatBoost best RMSE:", best_rmse)

0:	learn: 0.4704226	test: 0.4801314	best: 0.4801314 (0)	total: 47.1ms	remaining: 3m 55s
300:	learn: 0.0039385	test: 0.0032691	best: 0.0032677 (299)	total: 1.4s	remaining: 21.8s
600:	learn: 0.0018219	test: 0.0022254	best: 0.0022254 (600)	total: 3.41s	remaining: 24.9s
900:	learn: 0.0011087	test: 0.0019950	best: 0.0019950 (900)	total: 5.61s	remaining: 25.5s
1200:	learn: 0.0008269	test: 0.0019065	best: 0.0019065 (1200)	total: 7.79s	remaining: 24.6s
1500:	learn: 0.0006390	test: 0.0018834	best: 0.0018818 (1476)	total: 10.1s	remaining: 23.5s
1800:	learn: 0.0004975	test: 0.0018731	best: 0.0018721 (1716)	total: 14.7s	remaining: 26.1s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.001872063013
bestIteration = 1716

Shrink model to first 1717 iterations.
CatBoost best RMSE: 0.001872063013085006
