In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, QuantileTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score
from feature_engineering import TitanicFeatureEngineer
import warnings
warnings.resetwarnings()
# Suppress specific FutureWarning from sklearn.pipeline
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message="This Pipeline instance is not fitted yet.*",
    module="sklearn.pipeline"
)

In [2]:
data = pd.read_csv('../../data/train.csv')
y = data['Survived']
X = data.drop(columns=['Survived'])
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Original fetaures:
 - 'PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 
 - 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'

Added Features:
- 'Fare', 'FamilySize', 'FamilySizeCategory', 'TicketPrefix',
- 'TicketGroupSize', 'Deck', 'Title', 'SurnameGroupSize'

Dropped:
- 'Name', 'Ticket', 'Cabin', 'PassengerId', 

Shoudl we keep: 'SibSp' 'Parch',

In [3]:
# Define columns to transform
categorical_features = ['Pclass', 'Sex', 'Embarked',
                        'TicketPrefix', 'Deck', 'FamilySizeCategory',
                        'Title', 'SurnameGroupSize', 'TicketGroupSizeBin',
                        'FareBin', 'AgeBin']
numerical_features = ['Fare', 'Age', 'FamilySize', 'TicketGroupSize', 'SibSp', 'Parch']

# Define your mini pipelines
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
])

# Full preprocessing block
full_preprocessing = Pipeline([
    ('feature_engineering', TitanicFeatureEngineer()),  # Your custom feature adder
    ('preprocessing', ColumnTransformer([
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', numerical_pipeline, numerical_features)
    ]))
])

In [4]:
# Full ML pipeline: preprocessing + model
final_pipeline = Pipeline([
    ('full_preprocessing', full_preprocessing),
    # Placeholder — model will be set by GridSearchCV
    ('classifier', DummyClassifier())
])

In [5]:
from sklearn.model_selection import GridSearchCV

# Define hyperparameter space
param_grid = [
    {
        'classifier': [SVC(probability=True, random_state=42)],
        'classifier__kernel': ['rbf', 'linear'],  # optionally add 'poly'
        'classifier__C': [0.1, 1, 10],
        'classifier__gamma': ['scale', 'auto']
    },
    {
        'classifier': [DecisionTreeClassifier(random_state=42)],
        'classifier__max_depth': [3, 5, 7, None],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 5],
        'classifier__max_features': ['sqrt', 'log2', None]
    },
    {
        'classifier': [RandomForestClassifier(random_state=42)],
        'classifier__n_estimators': [200, 300, 400],
        'classifier__max_depth': [6, 8, 10, None],
        'classifier__min_samples_split': [2, 5],
        'classifier__max_features': ['sqrt', 'log2']
    },
    {
        'classifier': [LogisticRegression(max_iter=10_000, random_state=42)],
        'classifier__C': [0.01, 0.1, 1, 10],
        'classifier__solver': ['liblinear', 'lbfgs'],
        'classifier__penalty': ['l2']
    },
    {
        'classifier': [XGBClassifier(random_state=42)],
        'classifier__max_depth': [3, 4, 5, 6],
        'classifier__n_estimators': [50, 100, 200, 300],
        'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
        'classifier__subsample': [0.6, 0.8, 1.0],
        'classifier__colsample_bytree': [0.6, 0.8, 1.0]
    }
]

In [6]:
# PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
# PassengerId,           Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked
submission = pd.read_csv('../../data/test.csv')
submission.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
# GridSearchCV
grid_search = GridSearchCV(
    final_pipeline,
    param_grid,
    cv=10,
    scoring='accuracy',
    verbose=3,
    n_jobs=-1  # Use all cores
)

grid_search.fit(X, y)

Fitting 10 folds for each of 752 candidates, totalling 7520 fits


  pid = os.fork()
  pid = os.fork()


[CV 1/10] END classifier=SVC(probability=True, random_state=42), classifier__C=0.1, classifier__gamma=scale, classifier__kernel=rbf;, score=0.822 total time=   0.1s
[CV 6/10] END classifier=SVC(probability=True, random_state=42), classifier__C=0.1, classifier__gamma=scale, classifier__kernel=rbf;, score=0.809 total time=   0.1s
[CV 10/10] END classifier=SVC(probability=True, random_state=42), classifier__C=0.1, classifier__gamma=scale, classifier__kernel=rbf;, score=0.831 total time=   0.1s
[CV 4/10] END classifier=SVC(probability=True, random_state=42), classifier__C=0.1, classifier__gamma=scale, classifier__kernel=rbf;, score=0.876 total time=   0.1s
[CV 2/10] END classifier=SVC(probability=True, random_state=42), classifier__C=0.1, classifier__gamma=scale, classifier__kernel=rbf;, score=0.843 total time=   0.1s
[CV 3/10] END classifier=SVC(probability=True, random_state=42), classifier__C=0.1, classifier__gamma=scale, classifier__kernel=rbf;, score=0.764 total time=   0.1s
[CV 3/10]

In [8]:
# Print best parameters and model
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best model: {grid_search.best_estimator_}")

# Evaluate best model on test set
best_pipeline = grid_search.best_estimator_

Best parameters: {'classifier': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              feature_weights=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, ...), 'classifier__colsample_bytree': 1.0, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__n_estimators': 50, 'classifier__subsample': 0.6}
Best model: Pipeline(steps=[('full_preprocess

In [9]:
feature_importances = best_pipeline.named_steps['classifier'].feature_importances_
feature_names = (best_pipeline.named_steps['full_preprocessing']
                 .named_steps['preprocessing']
                 .get_feature_names_out())
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances})
print(importance_df.sort_values(by='Importance', ascending=False))

                           Feature  Importance
3                  cat__Sex_female    0.145845
25                   cat__Title_Mr    0.126403
2                    cat__Pclass_3    0.060547
23               cat__Title_Master    0.042000
0                    cat__Pclass_1    0.037612
38   cat__TicketGroupSizeBin_Large    0.036354
21   cat__FamilySizeCategory_Small    0.035523
15                     cat__Deck_E    0.028640
53                 num__FamilySize    0.025792
29            cat__Title_RareTitle    0.022058
39   cat__TicketGroupSizeBin_Small    0.019112
20  cat__FamilySizeCategory_Medium    0.018846
16                     cat__Deck_F    0.017417
11                     cat__Deck_A    0.016574
32         cat__SurnameGroupSize_3    0.015734
7                  cat__Embarked_S    0.015379
5                  cat__Embarked_C    0.014618
54            num__TicketGroupSize    0.014397
44                  cat__FareBin_3    0.014321
51                       num__Fare    0.014301
30         ca

In [10]:
# Make predictions on test set
y_pred = best_pipeline.predict(X)
y_test = y

# Calculate accuracy
accuracy = accuracy_score(y, y_pred)
print(f"Test set accuracy: {accuracy:.4f}")

Test set accuracy: 0.9035


In [11]:
# Print classification report
print(classification_report(y_test, y_pred))
# Print confusion matrix
print(confusion_matrix(y_test, y_pred))
# Print precision, recall, and F1 score
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")

              precision    recall  f1-score   support

           0       0.89      0.96      0.92       549
           1       0.93      0.81      0.87       342

    accuracy                           0.90       891
   macro avg       0.91      0.89      0.90       891
weighted avg       0.91      0.90      0.90       891

[[527  22]
 [ 64 278]]
Precision: 0.9267
Recall: 0.8129


In [12]:
# Print confusion matrix
print(confusion_matrix(y_test, y_pred))
# Print precision, recall, and F1 score
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 score: {f1_score(y_test, y_pred):.4f}")

[[527  22]
 [ 64 278]]
Precision: 0.9267
Recall: 0.8129
F1 score: 0.8660


In [13]:
submission[:5]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [14]:
submission_predictions = best_pipeline.predict(submission)

In [15]:
# Let us run the model on the test set
# (Optional) Create a Kaggle submission file
submission_df = pd.DataFrame({
    'PassengerId': submission["PassengerId"],   # must have PassengerId stored somewhere
    'Survived': submission_predictions
})
submission_df.to_csv('submission-5.csv', index=False)