In [467]:
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
import boruta as br
import numpy as np

In [468]:
#Loading the train and test CSV files from the data directory
filepath = './data/'
df_train, df_test = (
    pd.read_csv(filepath + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [469]:
#Setting up pre-processing pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import set_config

#Asking sklearn transformers to return DataFrames rather than arrays
set_config(transform_output='pandas')

#Columns of df_train handled by the categorical pipeline
cat_vars = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Cabin',
    'VIP',
]

#Columns of df_train handled by the numerical pipeline
num_vars = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [470]:
#Counting how many rows fall into each class of the target
label_counts = df_train['Transported'].value_counts()
label_counts

True     4378
False    4315
Name: Transported, dtype: int64

In [471]:
from sklearn.base import BaseEstimator, TransformerMixin


class CatVariablesAdder(BaseEstimator, TransformerMixin):
    '''Derives categorical features from the ``Cabin`` column.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only adds columns computed from the frame it is given.
    '''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''Nothing to learn; returns ``self`` so the step can be used in a Pipeline.'''
        return self

    def transform(self, X):
        '''Creates new categorical variables.
            starboard: 1 if the cabin string ends in 'S' (starboard side), else 0
            deck: the first character of the cabin string

        The input frame is copied first, so the caller's data is never modified.
        A missing Cabin value gives starboard == 0 and a missing deck rather
        than raising.

        Returns the copy of ``X`` with the two extra columns.
        '''
        X = X.copy()
        cabin = X['Cabin']
        # Vectorised string access instead of a row-wise apply; NaN compares
        # unequal to 'S', so missing cabins fall into the 0 bucket.
        X['starboard'] = cabin.str[-1].eq('S').astype(int)
        X['deck'] = cabin.str[0]
        return X
    
class NumVariablesAdder(BaseEstimator, TransformerMixin):
    '''Adds a ``roommates`` column holding how often each row's cabin value occurs.

    Parameters
    ----------
    columns : pandas.Series or column label
        Series (original usage): the counts are computed from this Series and
        joined to ``X`` on the index. The counts therefore always describe the
        Series given here, whatever frame is later transformed - only use this
        with a frame that shares its index and rows.
        Column label: the counts are computed from ``X[columns]`` of the frame
        being transformed, and that source column is removed from the output
        so that only the derived numeric feature is passed on.
    '''

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        '''Nothing to learn; returns ``self`` so the step can be used in a Pipeline.'''
        return self

    def transform(self, X):
        '''Creates new numerical variables.
            roommates: number of rows sharing the row's cabin value
                       (the row itself included; missing cabins stay missing)

        Works on a copy, so the frame passed in is left untouched.
        '''
        X = X.copy()
        if isinstance(self.columns, pd.Series):
            # Legacy mode: counts come from the stored Series, aligned on index.
            source = self.columns
        else:
            # Label mode: counts come from the frame being transformed.
            source = X[self.columns]
            X = X.drop(columns=self.columns)
        # value_counts() skips NaN, so rows without a cabin map to NaN.
        X['roommates'] = source.map(source.value_counts())
        return X

class ColumnDropper(BaseEstimator, TransformerMixin):
    '''Removes the configured column(s) from a DataFrame.

    Parameters
    ----------
    columns : label or list of labels
        Column name(s) to remove in ``transform``.
    '''

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        '''Nothing to learn; returns ``self`` so the step can be used in a Pipeline.'''
        return self

    def transform(self, X):
        '''Returns a new frame without ``self.columns``; ``X`` itself is not modified.'''
        return X.drop(columns=self.columns)

In [472]:
#Defining the data pipelines
from sklearn.pipeline import Pipeline

#Numerical branch: add the roommates count, fill gaps with KNN, then standardise
num_pipeline = Pipeline([
    ('new_variables_adder', NumVariablesAdder(df_train['Cabin'])),
    ('imputer', KNNImputer(n_neighbors=5)),
    ('std_scaler', StandardScaler())
])

#Categorical branch: fill gaps with the mode, derive starboard/deck from Cabin,
#drop the raw Cabin column and one-hot encode what is left
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('new_variables_adder', CatVariablesAdder()),
    ('column dropper', ColumnDropper('Cabin')),
    #handle_unknown='ignore': a category absent from the fitting data (e.g. a
    #rare deck missing from a CV fold) is encoded as all zeros instead of
    #making transform raise
    ('one_hot_encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

In [473]:
#Adding data pipelines to Column Transformer
from sklearn.compose import ColumnTransformer

#Each branch only receives the columns listed for it
data_pipeline = ColumnTransformer(
    [
        ('numerical', num_pipeline, num_vars),
        ('categorical', cat_pipeline, cat_vars),
    ],
    verbose_feature_names_out=False,
)

#Transforming the data
#The imputers, scaler and encoder are fitted on the training data only; the
#test data is then transformed with that fitted state so both frames share
#the same scaling and the same one-hot columns
df_train_processed = data_pipeline.fit_transform(df_train)
df_test_processed = data_pipeline.transform(df_test)

In [474]:
#Displaying the pre-processed training features (the rendered output is truncated)
df_train_processed

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,roommates,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,starboard_0,starboard_1,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T
0,0.709765,-0.338158,-0.283440,-0.287916,-0.273675,-0.265852,-0.591589,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.334410,-0.173426,-0.277804,-0.246128,0.213874,-0.227071,-0.591589,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2.032386,-0.273172,1.955632,-0.287916,5.689696,-0.222664,0.159343,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.292095,-0.338158,0.519896,0.332218,2.682701,-0.095742,0.159343,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.891303,0.119766,-0.239610,-0.035516,0.228083,-0.264090,-0.591589,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.848988,-0.338158,3.986199,-0.287916,1.185419,-0.200629,-0.591589,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8689,-0.752079,-0.338158,-0.283440,-0.287916,-0.273675,-0.265852,-0.591589,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8690,-0.195186,-0.338158,-0.283440,2.841167,-0.272787,-0.265852,-0.591589,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8691,0.222483,-0.338158,0.373380,-0.287916,0.039813,2.585472,0.159343,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [475]:
#Adding modeling to pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV


def _with_preprocessing(model):
    '''Returns a Pipeline that runs data_pipeline and then the given model.'''
    return Pipeline([
        ('preprocessor', data_pipeline),
        ('model', model)
    ])


#One pipeline per candidate model, all sharing the same preprocessing step
rf_model_pipeline = _with_preprocessing(
    RandomForestClassifier(n_estimators=100, max_depth=10, random_state=123, verbose=0)
)

ada_model_pipeline = _with_preprocessing(
    AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=123)
)

gb_model_pipeline = _with_preprocessing(
    GradientBoostingClassifier(
        n_estimators=100,
        max_depth=10,
        learning_rate=0.1,
        random_state=123,
        verbose=0,
    )
)

#Holding out 20% of the raw training frame for evaluation
#(the full frame is passed as X; the preprocessor picks the columns it needs)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    df_train,
    df_train['Transported'],
    test_size=0.2,
    random_state=123,
)

In [476]:
#Hyper-parameter grid for random forest
rf_grid_params = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [5, 10],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
}

#Hyper-parameter grid for gradient boosting
gb_grid_params = {
    'model__n_estimators': [100, 300],
    'model__max_depth': [5, 10],
    'model__learning_rate': [0.1, 0.05],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2],
}

#Hyper-parameter grid for adaboost
ada_grid_params = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.1, 0.05, 0.01],
}


def _fit_grid_search(pipeline, param_grid):
    '''Runs a 5-fold, accuracy-scored grid search on x_train/y_train and returns it fitted.'''
    search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
    search.fit(x_train, y_train)
    return search


#Searching in the order random forest, gradient boosting, adaboost
rf_grid_search = _fit_grid_search(rf_model_pipeline, rf_grid_params)
gb_grid_search = _fit_grid_search(gb_model_pipeline, gb_grid_params)
ada_grid_search = _fit_grid_search(ada_model_pipeline, ada_grid_params)


#Predicting on the held-out split with each tuned model
rf_pred = rf_grid_search.predict(x_test)
ada_pred = ada_grid_search.predict(x_test)
gb_pred = gb_grid_search.predict(x_test)

#Evaluating the models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Accuracy of every model first ...
for label, pred in (
    ('Random Forest Accuracy: ', rf_pred),
    ('AdaBoost Accuracy: ', ada_pred),
    ('Gradient Boosting Accuracy: ', gb_pred),
):
    print(label, accuracy_score(y_test, pred))

#... followed by the per-class reports in the same order
for label, pred in (
    ('Random Forest Classification Report: ', rf_pred),
    ('AdaBoost Classification Report: ', ada_pred),
    ('Gradient Boosting Classification Report: ', gb_pred),
):
    print(label, classification_report(y_test, pred))

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Fitting 5 folds for each of 48 candidates, totalling 240 fits




Fitting 5 folds for each of 9 candidates, totalling 45 fits
Random Forest Accuracy:  0.8108108108108109
AdaBoost Accuracy:  0.8096607245543416
Gradient Boosting Accuracy:  0.8205865439907993
Random Forest Classification Report:                precision    recall  f1-score   support

       False       0.82      0.78      0.80       851
        True       0.80      0.84      0.82       888

    accuracy                           0.81      1739
   macro avg       0.81      0.81      0.81      1739
weighted avg       0.81      0.81      0.81      1739

AdaBoost Classification Report:                precision    recall  f1-score   support

       False       0.83      0.77      0.80       851
        True       0.80      0.84      0.82       888

    accuracy                           0.81      1739
   macro avg       0.81      0.81      0.81      1739
weighted avg       0.81      0.81      0.81      1739

Gradient Boosting Classification Report:                precision    recall  f1-scor

In [477]:
#Scoring the unseen data with the best gradient boosting pipeline found by the search
best_gb_pipeline = gb_grid_search.best_estimator_
final_predictions = best_gb_pipeline.predict(df_test)

In [478]:
#Generating submission file
#df_test was read without index_col, so its index is only the row position.
#The identifier has to come from the PassengerId column; the index is kept as
#a fallback for frames that do not carry that column.
passenger_ids = df_test['PassengerId'] if 'PassengerId' in df_test.columns else df_test.index
#final_predictions is rebound from the raw predictions to the submission frame
final_predictions = pd.DataFrame({'PassengerId': passenger_ids, 'Transported': final_predictions})
final_predictions.to_csv(filepath + 'submission.csv', index=False)