In [38]:
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
import boruta as br
import numpy as np

In [39]:
#Reading in test and train datasets
filepath = './data/'
df_train = pd.read_csv(filepath + 'train.csv')
df_test = pd.read_csv(filepath + 'test.csv')

In [40]:
#Setting up pre-processing pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import set_config

#Telling sklearn that we want to output dataframes
set_config(transform_output='pandas')

#Defining numerical and categorical columns in df_train
cat_vars = ['HomePlanet', 'CryoSleep', 'Destination', 'Cabin', 'VIP']
num_vars = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [41]:
#Checking label balance
df_train['Transported'].value_counts()

True     4378
False    4315
Name: Transported, dtype: int64

In [42]:
from sklearn.base import BaseEstimator, TransformerMixin


class CatVariablesAdder(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        '''Creates new categorical variables.
            starboard: whether the cabin is on the starboard side of the ship
            deck: the deck the passenger is on
        '''
        X['Starboard'] = X.apply(lambda x: 1 if x['Cabin'][-1] == 'S' else 0, axis=1)
        X['Deck'] = X['Cabin'].str[0]
        return X
    
class NumVariablesAdder(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        '''Creates new numerical variables.
            roommates: the number of roommates the passenger has
            Crew: if the passenger is a passenger or a crew member, if spending on all services is 0
        '''
        X['roommates'] = self.columns.map(self.columns.value_counts())
        X['Crew'] = X.apply(lambda x: 1 if x['RoomService'] == 0 and x['FoodCourt'] == 0 and x['ShoppingMall'] == 0 and x['Spa'] == 0 and x['VRDeck'] == 0 else 0, axis=1)
        return X

class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X.drop(self.columns, axis=1, inplace=True)
        return X
    
class Bucketizer(BaseEstimator, TransformerMixin):
    def __init__(self, column_name, bins, labels):
        self.column_name = column_name
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X[self.column_name] = pd.cut(X[self.column_name], bins=self.bins, labels=self.labels)
        return X


In [43]:
#Defining the data pipelines
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ('new_variables_adder', NumVariablesAdder(df_train['Cabin'])),
    ('imputer', KNNImputer(n_neighbors=5)),
    ('std_scaler', StandardScaler()),
    ('bucketizer', Bucketizer('Age', bins=[0, 0.25, 0.5, 0.75, 1], labels=['child', 'young adult', 'adult', 'senior'])),
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('new_variables_adder', CatVariablesAdder()),
    ('column dropper', ColumnDropper('Cabin')),
    ('one_hot_encoder', OneHotEncoder(sparse_output = False)),
])

In [44]:
#Adding data pipelines to Column Transformer
from sklearn.compose import ColumnTransformer

data_pipeline = ColumnTransformer(
        ([
            ('numerical', num_pipeline, num_vars),
            ('categorical', cat_pipeline, cat_vars)
        ]), 
    verbose_feature_names_out=False
)

#Transforming the data
df_train_processed = data_pipeline.fit_transform(df_train)
df_test_processed = data_pipeline.fit_transform(df_test)

In [45]:
df_train_processed

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,roommates,Crew,HomePlanet_Earth,HomePlanet_Europa,...,Starboard_0,Starboard_1,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
0,adult,-0.339869,-0.284017,-0.289721,-0.274302,-0.266493,-0.591329,1.295083,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,-0.175143,-0.278381,-0.247933,0.213292,-0.227708,-0.591329,-0.772151,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,,-0.274885,1.955269,-0.289721,5.689621,-0.223301,0.160679,-0.772151,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,young adult,-0.339869,0.519395,0.330405,2.682348,-0.096368,0.160679,-0.772151,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,0.118038,-0.240183,-0.037325,0.227503,-0.264730,-0.591329,-0.772151,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,senior,-0.339869,3.986030,-0.289721,1.184928,-0.201264,-0.591329,-0.772151,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8689,,-0.339869,-0.284017,-0.289721,-0.274302,-0.266493,-0.591329,1.295083,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8690,,-0.339869,-0.284017,2.839322,-0.273414,-0.266493,-0.591329,-0.772151,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8691,child,-0.339869,0.372865,-0.289721,0.039215,2.585078,0.160679,-0.772151,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [46]:
#Adding modeling to pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

rf_model_pipeline = Pipeline([
    ('preprocessor', data_pipeline),
    ('model', RandomForestClassifier())
])

gb_model_pipeline = Pipeline([
    ('preprocessor', data_pipeline),
    ('model', GradientBoostingClassifier(n_estimators=100,
                                            max_depth=10,
                                            learning_rate=0.1,
                                            random_state=123,
                                            verbose=0))
])

#Defining train and test splits for preprocessed train data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_train, df_train['Transported'], test_size=0.2, random_state=123)

In [47]:
#Defining grid parameters for random forest
rf_grid_params = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [5, 10],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

#Defining grid parameters for gradient boosting
gb_grid_params = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [5, 10, 15],
    'model__learning_rate': [0.1, 0.05, 0.01],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}

#Defining grid parameters for adaboost
ada_grid_params = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.1, 0.05, 0.01]
}

#Defining grid search for random forest
rf_grid_search = GridSearchCV(rf_model_pipeline, rf_grid_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
rf_grid_search.fit(x_train, y_train)

#Defining grid search for gradient boosting
gb_grid_search = GridSearchCV(gb_model_pipeline, gb_grid_params, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
gb_grid_search.fit(x_train, y_train)


#Predicting on test data
rf_pred = rf_grid_search.predict(x_test)
gb_pred = gb_grid_search.predict(x_test)

#Evaluating the models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print('Random Forest Accuracy: ', accuracy_score(y_test, rf_pred))
print('Gradient Boosting Accuracy: ', accuracy_score(y_test, gb_pred))

print('Random Forest Classification Report: ', classification_report(y_test, rf_pred))
print('Gradient Boosting Classification Report: ', classification_report(y_test, gb_pred))

Fitting 5 folds for each of 54 candidates, totalling 270 fits




In [None]:
#Predicting unseen data
final_predictions = gb_grid_search.best_estimator_.predict(df_test)

In [None]:
#Generating submission file
final_predictions = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Transported': final_predictions})
final_predictions.to_csv(filepath + 'submission.csv', index=False)