In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        # Title from Name
        df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
        df['Title'] = df['Title'].replace(['Mlle', 'Ms'], 'Miss')
        df['Title'] = df['Title'].replace(['Mme'], 'Mrs')
        df['Title'] = df['Title'].replace(
            ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 
             'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

        # IsAlone
        df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
        df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

        # AgeBin and FareBin (you could also use KBinsDiscretizer)
        df['AgeBin'] = pd.cut(df['Age'], bins=[0, 10, 20, 30, 40, 50, 60, 80], labels=False)
        df['FareBin'] = pd.qcut(df['Fare'], 4, labels=False, duplicates='drop')

        return df

In [3]:
# Load data
df = pd.read_csv("train.csv")

train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
num_features = ['AgeBin', 'FareBin']
cat_features = ['Sex', 'Embarked', 'Pclass', 'Title', 'IsAlone']

# Transformers
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Column Transformer
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

In [24]:
# Grid search over hyperparameters
# this setting maps to 0.78708
param_grid1 = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [4, 6, 8],
    'classifier__min_samples_split': [2, 5, 10]
}

# param_grid = param_grid1

In [None]:
param_grid2 = {
    'classifier__n_estimators': [1000, 1100, 1500],
    'classifier__max_depth': [3, 4, 5, 6, 8, 10],
    'classifier__min_samples_split': [4, 6, 8, 10],
    'classifier__min_samples_leaf': [1, 2, 5, 10]
}

# {'classifier__max_depth': 8,
#  'classifier__min_samples_leaf': 2,
#  'classifier__min_samples_split': 10,
#  'classifier__n_estimators': 1000}

In [16]:
full_pipeline = Pipeline([
    ('feature_engineering', FeatureEngineer()),
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Grid search over hyperparameters
# this setting maps to 0.78708
# param_grid = {
#     'classifier__n_estimators': [100, 200],
#     'classifier__max_depth': [4, 6, 8],
#     'classifier__min_samples_split': [2, 5, 10]
# }

param_grid = {
    'classifier__n_estimators': [1000, 1100, 1500],
    'classifier__max_depth': [3, 4, 5, 6, 8, 10],
    'classifier__min_samples_split': [4, 6, 8, 10],
    'classifier__min_samples_leaf': [1, 2, 5, 10]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    full_pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

https://medium.com/@kalpit.sharma/mastering-random-forest-hyperparameter-tuning-for-enhanced-machine-learning-models-2d1a8c6c426f

This post talks about how to efficiently conduct hyperparameter tuning.

In [17]:
# Create target variable
y_train = df['Survived']

# Select input features (raw input before feature engineering)
X_train = df[['Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Pclass']]

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_



Fitting 5 folds for each of 288 candidates, totalling 1440 fits


In [18]:
grid_search.best_params_

{'classifier__max_depth': 8,
 'classifier__min_samples_leaf': 2,
 'classifier__min_samples_split': 10,
 'classifier__n_estimators': 1000}

In [13]:
print(best_model)

Pipeline(steps=[('feature_engineering', FeatureEngineer()),
                ('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['AgeBin', 'FareBin']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),

In [19]:
# Predict on test.csv
test_df = pd.read_csv("test.csv")
X_submission = test_df.copy()

submission_preds = best_model.predict(X_submission)

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': submission_preds
})
submission.to_csv("submission_rf_ht_fe.csv", index=False)

In [21]:
from sklearn.ensemble import VotingClassifier

In [22]:
# Pipeline
lr_pipeline = Pipeline([
    ('feature_engineering', FeatureEngineer()),
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

lr_pipeline.fit(X_train, y_train)

In [25]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

grid_search1 = GridSearchCV(
    full_pipeline,
    param_grid=param_grid1,
    cv=cv,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

grid_search1.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [26]:
ensemble_model = VotingClassifier(
    estimators=[
        ('rf1', lr_pipeline),  # trained pipeline 1
        ('rf2', grid_search1),  # trained pipeline 2
        ('rf3', grid_search)   # trained pipeline 3
    ],
    voting='soft'  # 'soft' = average probabilities; 'hard' = majority vote
)

ensemble_model.fit(X_train, y_train)

X_submission = test_df.copy()

submission_preds = ensemble_model.predict(X_submission)

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': submission_preds
})
submission.to_csv("submission_ensemble.csv", index=False)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Fitting 5 folds for each of 288 candidates, totalling 1440 fits


https://www.kaggle.com/code/nargisbegum82/hyperparameter-tuning-in-random-forests

https://www.kaggle.com/code/gunesevitan/titanic-advanced-feature-engineering-tutorial?scriptVersionId=27280410