In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# kernel Python 3.11.11 /local/bin/python

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
! pwd
! pip install -e ..  -r requirements.txt

/kaggle/working/kaggle/titanic


Obtaining file:///kaggle/working/kaggle
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: kaggle_common
  Attempting uninstall: kaggle_common
    Found existing installation: kaggle_common 0.1.0
    Uninstalling kaggle_common-0.1.0:
      Successfully uninstalled kaggle_common-0.1.0
  Running setup.py develop for kaggle_common
Successfully installed kaggle_common-0.1.0


In [3]:
import sys, os

repo_root = "/kaggle/working/kaggle"
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)
import warnings
warnings.filterwarnings(
    "ignore",
    message="Found unknown categories in columns [0, 2, 3] during transform. These unknown categories will be encoded as all zeros",
    category=UserWarning
)
from common.preprocessing import make_preprocessor
from common.pipelines     import compare_models, grid_search
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from titanic_features import add_features
import joblib
from xgboost import XGBClassifier  
from sklearn.metrics       import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint, uniform

In [4]:
# load straight from Kaggle’s mount
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test  = pd.read_csv('/kaggle/input/titanic/test.csv')

In [5]:
# apply to train & capture bins
train, fare_bins, age_fill, fare_fill, embarked_fill = add_features(train)

# apply to test _using_ the same fare_bins
test, _, _, _, _ = add_features(
    test,
    fare_bins=fare_bins,
    age_fill=age_fill,
    fare_fill=fare_fill,
    embarked_fill=embarked_fill
)

NUM_COLS = ['Age','Fare','FamilySize']
CAT_COLS = ['Pclass','Sex','Embarked',
            'Title','IsAlone','Deck','FareBin','AgeBin']

TARGET   = 'Survived'

X = train[NUM_COLS + CAT_COLS]
y = train[TARGET]
X_test  = test[NUM_COLS  + CAT_COLS]

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.20,
    random_state=42,
    stratify=y
)
# 1) Load your preprocessor
preprocessor = joblib.load("titanic_preprocessor.joblib")

# 2) (Re‐create or load) your XGB model
pipeline = Pipeline([
    ("preproc", preprocessor),  
    ("xgb",     XGBClassifier(n_estimators=200, random_state=42)),
])

# 4) Fit on your training set
pipeline.fit(X_train, y_train)

# 5) Score or predict
print("Train score:", pipeline.score(X_train, y_train))


Train score: 0.9831460674157303


In [6]:
val_preds = pipeline.predict(X_val)
print("Val Accuracy:", accuracy_score(y_val, val_preds))
print(classification_report(y_val, val_preds))

Val Accuracy: 0.8156424581005587
              precision    recall  f1-score   support

           0       0.85      0.85      0.85       110
           1       0.76      0.75      0.76        69

    accuracy                           0.82       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.82      0.82      0.82       179



# Hyperparameter search

In [7]:
# 3) Build a pipeline
pipeline = Pipeline([
    ("preproc", preprocessor),
    ("xgb",     XGBClassifier(use_label_encoder=False, eval_metric="logloss"))
])

# 4) Define hyperparameter distributions
param_dist = {
    "xgb__n_estimators":    randint(100, 1000),
    "xgb__max_depth":       randint(3, 8),
    "xgb__learning_rate":   uniform(0.01, 0.3),
    "xgb__subsample":       uniform(0.6, 0.4),
    "xgb__colsample_bytree":uniform(0.6, 0.4),
    "xgb__gamma":           uniform(0, 5),
    "xgb__reg_alpha":       uniform(0, 5),
    "xgb__reg_lambda":      uniform(0, 5),
}

# 5) Set up RandomizedSearchCV
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,               # number of settings sampled
    scoring="accuracy",      # or "roc_auc"
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

# 6) Run the search on your training split
search.fit(X_train, y_train)

# 7) Inspect the results
print("Best CV accuracy:   ", search.best_score_)
print("Best hyperparameters:", search.best_params_)

# 8) Evaluate on your hold-out validation set
val_score = search.score(X_val, y_val)
print("Validation accuracy:", val_score)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=7, xgb__n_estimators=120, xgb__reg_alpha=0.7800932022121826, xgb__reg_lambda=0.7799726016810132, xgb__subsample=0.6232334448672797; total time=   0.1s
[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=7, xgb__n_estimators=120, xgb__reg_alpha=0.7800932022121826, xgb__reg_lambda=0.7799726016810132, xgb__subsample=0.6232334448672797; total time=   0.1s
[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=7, xgb__n_estimators=120, xgb__reg_alpha=0.7800932022121826, xgb__reg_lambda=0.7799726016810132, xgb__subsample=0.6232334448672797; total time=   0.1s
[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=7, xgb__n_e

In [8]:
best_pipeline = search.best_estimator_
best_pipeline.fit(X, y)
preds = best_pipeline.predict(X_test)

In [9]:
# predict test set
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived':    best_pipeline.predict(X_test)
})
submission.to_csv('output/submission_xgboost-features-explicit.csv', index=False)

# Improvements

#Advanced feature engineering via out-of-fold (k-fold) target encoding
#Early stopping baked into your XGBoost training
#Stacked ensembling of XGBoost, LightGBM and CatBoost for a final meta-model

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders import TargetEncoder
from sklearn.model_selection import KFold
import pandas as pd

class KFoldTargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, n_splits=5, shuffle=True, random_state=42, smoothing=0.2):
        self.cols = cols
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
        self.smoothing = smoothing

    def fit(self, X, y):
        # reset indices for reliable alignment
        X = X.reset_index(drop=True)
        y = y.reset_index(drop=True)

        # store the global mean for fallback
        self.global_mean_ = y.mean()

        # prepare an empty out-of-fold frame
        oof = pd.DataFrame(index=X.index, columns=self.cols, dtype=float)

        # 1) build OOF encodings
        kf = KFold(n_splits=self.n_splits,
                   shuffle=self.shuffle,
                   random_state=self.random_state)
        for train_idx, val_idx in kf.split(X):
            Xt, yt = X.iloc[train_idx], y.iloc[train_idx]
            Xv      = X.iloc[val_idx]

            for col in self.cols:
                te = TargetEncoder(cols=[col], smoothing=self.smoothing)
                te.fit(Xt[col], yt)
                # transform returns a DataFrame
                oof.loc[val_idx, col] = te.transform(Xv[col])[col]

        # fill any remaining NaNs with the global mean
        self.oof_ = oof.fillna(self.global_mean_)

        # 2) fit a “full-data” encoder for test/new data
        self.full_encoders_ = {}
        for col in self.cols:
            fe = TargetEncoder(cols=[col], smoothing=self.smoothing)
            fe.fit(X[col], y)
            self.full_encoders_[col] = fe

        return self

    def transform(self, X):
        X = X.reset_index(drop=True).copy()

        # if we’re transforming the *same* number of rows as we fit, assume train
        if len(X) == len(self.oof_):
            return self.oof_

        # otherwise (test/new data), apply the full-data encoder
        X_enc = pd.DataFrame(index=X.index)
        for col in self.cols:
            te = self.full_encoders_[col]
            X_enc[col] = te.transform(X[col])[col].fillna(self.global_mean_)

        return X_enc


In [25]:

# assume `train` DataFrame with target `y`
te_cols = ['Pclass','Sex','Embarked','Title','Deck','FareBin','AgeBin']
# kf_te = KFoldTargetEncoder(cols=te_cols, n_splits=5)
# kf_te.fit(train[te_cols], train[TARGET])

# # apply to both train and test
# train_enc = kf_te.transform(train[te_cols])
# test_enc  = kf_te.transform(test[te_cols])

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import RandomizedSearchCV, train_test_split
# from xgboost import XGBClassifier







# # 1) Prepare your X and y (with k-fold TE applied)
# X = pd.concat([train[NUM_COLS], train_enc], axis=1)
# y = train[TARGET]

# X_tr, X_val, y_tr, y_val = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# # 2) Build pipeline (no OHE now; we’ve encoded numerics + targets)
# pipe = Pipeline([
#     ("xgb", XGBClassifier(use_label_encoder=False, eval_metric="logloss",early_stopping_rounds=50))
# ])

# # 3) Parameter grid
# param_dist = {
#     "xgb__n_estimators":    [500, 1000],
#     "xgb__max_depth":       [3,4,5],
#     "xgb__learning_rate":   [0.01,0.05,0.1],
#     "xgb__subsample":       [0.7, 0.8, 0.9],
#     "xgb__colsample_bytree":[0.7, 0.8, 0.9],
#     "xgb__gamma":           [0, 1, 5],
# }

# search = RandomizedSearchCV(
#     estimator=pipe,
#     param_distributions=param_dist,
#     n_iter=20,
#     scoring="accuracy",
#     cv=5,
#     verbose=2,
#     random_state=42,
#     n_jobs=-1,
#     refit=True
# )

# # 5) Fit, still passing ONLY eval_set & verbose into fit()
# #     early_stopping_rounds lives in the constructor now
# search.fit(
#     X_tr,
#     y_tr,
#     xgb__eval_set=[(X_val, y_val)],
#     xgb__verbose=False
# )

# print("Best CV:",    search.best_score_)
# print("Best params:",search.best_params_)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.8, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=1000, xgb__subsample=0.8; total time=   0.1s
[CV] END xgb_

[CV] END xgb__colsample_bytree=0.8, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=1000, xgb__subsample=0.8; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=5, xgb__n_estimators=1000, xgb__subsample=0.9; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=5, xgb__n_estimators=1000, xgb__subsample=0.9; total time=   0.1s
[CV] END xgb__colsample_bytree=0.8, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=1000, xgb__subsample=0.8; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=5, xgb__n_estimators=1000, xgb__subsample=0.9; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=5, xgb__n_estimators=1000, xgb__subsample=0.9; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0

In [26]:
from sklearn.model_selection import train_test_split

# 0) Raw DataFrame with all your added features already applied
#    (i.e. you’ve run add_features on the full train.csv to get
#     columns NUM_COLS + te_cols, but not yet target-encoded)
raw_X = pd.concat([train[NUM_COLS], train[te_cols]], axis=1)
raw_y = train[TARGET]

# 1) Split *before* any target-encoding
X_tr_raw, X_val_raw, y_tr, y_val = train_test_split(
    raw_X, raw_y,
    test_size=0.20,
    random_state=42,
    stratify=raw_y
)

# 2) Fit the K-fold target encoder ONLY on the *training* piece
kf_te = KFoldTargetEncoder(cols=te_cols, n_splits=5)
kf_te.fit(X_tr_raw[te_cols], y_tr)

# 3) Transform train, val, and test
train_enc = kf_te.transform(X_tr_raw[te_cols])
val_enc   = kf_te.transform(X_val_raw[te_cols])
test_enc  = kf_te.transform(test[te_cols])   # test[te_cols] was built with the same add_features

# 4) Build your final feature matrices
X_tr = pd.concat([X_tr_raw[NUM_COLS].reset_index(drop=True),
                  train_enc.reset_index(drop=True)], axis=1)
X_val= pd.concat([X_val_raw[NUM_COLS].reset_index(drop=True),
                  val_enc.reset_index(drop=True)],   axis=1)

# now you can run your RandomizedSearchCV exactly as before, using X_tr/y_tr
# and passing eval_set=[(X_val,y_val)] into .fit()

search.fit(
    X_tr, y_tr,
    xgb__eval_set=[(X_val, y_val)],
    xgb__verbose=False
)

# finally, after you pick best_estimator_, you can do:
final_preds = search.best_estimator_.predict(
    pd.concat([test[NUM_COLS], test_enc], axis=1)
)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s

[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.9, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=500, xgb__subsample=0.7; total time=   0.1s
[CV] END xgb__colsample_bytree=0.8, xgb__gamma=1, xgb__learning_rate=0.1, xgb__max_depth=4, xgb__n_estimators=1000, xgb__subsample=0.8; total time=   0.1s
[CV] END xgb_

In [27]:
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# pull out just the XGB params from your search
best = search.best_params_
xgb_params = {
    k.split("__",1)[1]: v
    for k,v in best.items()
    if k.startswith("xgb__")
}

# now build a new classifier WITHOUT early_stopping_rounds
xgb_final = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
    **xgb_params
)


# 1) Define base learners with best params (from your search)
# xgb_final  = search.best_estimator_.named_steps['xgb']
lgb_final  = LGBMClassifier(n_estimators=500, learning_rate=0.05,
                            subsample=0.8, colsample_bytree=0.8, random_state=42)
cat_final  = CatBoostClassifier(iterations=500, learning_rate=0.05,
                                depth=4, verbose=False, random_state=42)

# 2) Stacking meta-learner
stack = StackingClassifier(
    estimators=[
        ('xgb', xgb_final),
        ('lgb', lgb_final),
        ('cat', cat_final),
    ],
    final_estimator=XGBClassifier(
        n_estimators=200, learning_rate=0.05, random_state=42,
        use_label_encoder=False, eval_metric="logloss"
    ),
    cv=5,
    n_jobs=-1,
    passthrough=True  # let meta-learner see original features too
)

# 3) Fit on full train data
X_full = pd.concat([train[NUM_COLS], train_enc], axis=1)
y_full = train[TARGET]
stack.fit(X_full, y_full)

# 4) Predict
X_test_full = pd.concat([test[NUM_COLS], test_enc], axis=1)
preds = stack.predict(X_test_full)


[LightGBM] [Info] Number of positive: 342, number of negative: 549
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 356
[LightGBM] [Info] Number of data points in the train set: 891, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383838 -> initscore=-0.473288
[LightGBM] [Info] Start training from score -0.473288
[LightGBM] [Info] Number of positive: 273, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 335
[LightGBM] [Info] Number of data points in the train set: 712, number of used features: 10
[LightGBM] [Info] Number of positive: 274, number of negative: 439
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.00007

In [14]:
from sklearn.ensemble import StackingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

# 0) re-extract your best XGB params
best = search.best_params_
xgb_params = {
    k.split("__",1)[1]: v
    for k, v in best.items()
    if k.startswith("xgb__")
}

# 1) base learners

# XGB w/ your tuned params (no early stopping here)
xgb_final = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
    **xgb_params
)

# LightGBM: allow smaller leaves and fewer data per leaf,
# + verbose=-1 to silence that warning
lgb_final = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_samples=5,    # default is 20 → too big for a small data subset
    min_split_gain=0.0,     # allow any small gain
    verbose=-1,
    random_state=42
)

# CatBoost: your existing setup
cat_final = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=4,
    verbose=False,
    random_state=42
)

# 2) stacking meta‐learner
stack = StackingClassifier(
    estimators=[
        ('xgb', xgb_final),
        ('lgb', lgb_final),
        ('cat', cat_final),
    ],
    final_estimator=XGBClassifier(
        n_estimators=200,
        learning_rate=0.05,
        random_state=42,
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    cv=5,
    n_jobs=-1,
    passthrough=True,
)

# 3) fit & predict
X_full      = pd.concat([train[NUM_COLS], train_enc], axis=1)
y_full      = train[TARGET]
X_test_full = pd.concat([test[NUM_COLS],  test_enc ], axis=1)

stack.fit( X_full, y_full )
preds = stack.predict( X_test_full )


In [None]:
# predict test set
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived':    stack.predict(X_test_full)
})
submission.to_csv('output/submission_xgboost-ensemble-early_stop-features.csv', index=False)

In [28]:
print("Local val accuracy:", stack.score(X_val, y_val))

Local val accuracy: 0.6983240223463687


In [19]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# (re-split your original train into train2/val2 exactly as before)
X = pd.concat([train[NUM_COLS], train_enc], axis=1)
y = train[TARGET]


X2_tr, X2_val, y2_tr, y2_val = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# build a simple pipeline
simple_pipe = Pipeline([
    ("xgb", XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        **xgb_params  # from your search.best_params_
    ))
])

simple_pipe.fit(X2_tr, y2_tr)
print("Simple XGB val accuracy:", simple_pipe.score(X2_val, y2_val))


Simple XGB val accuracy: 0.7988826815642458
