In [321]:
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
import boruta as br
import numpy as np

In [322]:
#Loading the train and test splits from the local data directory
filepath = './data/'
df_train, df_test = (
    pd.read_csv(filepath + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [323]:
#Setting up pre-processing pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn import set_config

#Ask scikit-learn transformers to return pandas DataFrames instead of numpy arrays.
#The custom transformers defined later in this notebook select their input by
#column name (e.g. X['Cabin']), so the steps feeding them must emit DataFrames.
set_config(transform_output='pandas')

#Feature groups of df_train: one list per preprocessing branch
cat_vars = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Cabin',
    'VIP',
]
num_vars = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [324]:
from sklearn.base import BaseEstimator, TransformerMixin


class CatVariablesAdder(BaseEstimator, TransformerMixin):
    '''Derives categorical features from the ``Cabin`` column.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    only reads the ``Cabin`` column of the frame it receives.
    '''

    def fit(self, X, y=None):
        '''Does nothing; present so the class can be used as a pipeline step.'''
        return self

    def transform(self, X):
        '''Creates new categorical variables.
            starboard: whether the cabin is on the starboard side of the ship
                       (1 if the cabin string ends in 'S', otherwise 0)
            deck: the deck the passenger is on (first character of the cabin string)

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a ``Cabin`` column of strings.

        Returns
        -------
        pandas.DataFrame
            A new frame with ``starboard`` and ``deck`` appended. ``X`` itself
            is not modified.
        '''
        cabin = X['Cabin']
        #Vectorised string ops replace the row-wise apply. na=False makes a
        #missing cabin count as "not starboard" instead of raising, and an
        #empty frame is handled as well.
        return X.assign(
            starboard=cabin.str.endswith('S', na=False).astype(int),
            deck=cabin.str[0],
        )
    
class NumVariablesAdder(BaseEstimator, TransformerMixin):
    '''Adds a cabin-occupancy count as a numerical feature.

    Parameters
    ----------
    columns : pandas.Series
        Cabin identifier of every passenger. The feature is computed from this
        Series, not from the frame passed to ``transform``.
    '''

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        '''Does nothing; present so the class can be used as a pipeline step.'''
        return self

    def transform(self, X):
        '''Creates new numerical variables.
            roommates: the number of entries in ``columns`` that share the
                       passenger's cabin value (the passenger's own entry is
                       included in the count). NaN when the cabin is missing,
                       because value_counts ignores missing values.

        Returns
        -------
        pandas.DataFrame
            A new frame with ``roommates`` appended. ``X`` itself is not modified.
        '''
        cabins = self.columns
        occupancy = cabins.map(cabins.value_counts())
        #The counts are attached to X by index label, not by position.
        #NOTE(review): transforming a frame other than the one `columns` was
        #taken from silently picks up the counts of whichever rows share its
        #index labels - confirm this is intended before using it on new data.
        return X.assign(roommates=occupancy)

class ColumnDropper(BaseEstimator, TransformerMixin):
    '''Removes the configured column(s) from a DataFrame.

    Parameters
    ----------
    columns : str or list of str
        Label(s) of the column(s) to remove.
    '''

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        '''Does nothing; present so the class can be used as a pipeline step.'''
        return self

    def transform(self, X):
        '''Returns a copy of ``X`` without the configured column(s).

        ``X`` itself is left untouched (the previous in-place drop modified
        the caller's frame). A KeyError is raised by pandas if a configured
        column is absent.
        '''
        return X.drop(columns=self.columns)

In [325]:
#Defining the data pipelines
from sklearn.pipeline import Pipeline

#Numerical branch. The roommates feature is added BEFORE imputation: it is NaN
#for every passenger whose Cabin is missing, and with the imputer running first
#those NaNs went straight through the scaler into the models.
#NOTE(review): the adder is bound to df_train['Cabin'], so it yields training
#cabin counts matched by index label whatever frame is transformed - confirm
#before relying on the roommates feature for any other dataset.
num_pipeline = Pipeline([
    ('new_variables_adder', NumVariablesAdder(df_train['Cabin'])),
    ('imputer', KNNImputer(n_neighbors=5)),
    ('std_scaler', StandardScaler())
])

#Categorical branch. handle_unknown='ignore' encodes a category that was not
#present at fit time as all zeros instead of raising during transform.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('new_variables_adder', CatVariablesAdder()),
    ('column dropper', ColumnDropper('Cabin')),
    ('one_hot_encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

In [326]:
#Adding data pipelines to Column Transformer
from sklearn.compose import ColumnTransformer

#Routing each feature group to its own preprocessing branch
data_pipeline = ColumnTransformer([
    ('numerical', num_pipeline, num_vars),
    ('categorical', cat_pipeline, cat_vars)
])

#Transforming the data.
#The preprocessing is fitted on the training set only and then re-used as-is
#for the test set. Calling fit_transform on the test set would re-learn the
#imputation values, scaling statistics and one-hot categories from test data,
#so the two processed frames would no longer be comparable.
#NOTE(review): num_pipeline's NumVariablesAdder is built from df_train['Cabin'],
#so the roommates column of the test frame is derived from training cabins
#matched by index label - confirm/fix before trusting that feature on test data.
df_train_processed = data_pipeline.fit_transform(df_train)
df_test_processed = data_pipeline.transform(df_test)

In [None]:
#Adding modeling to pipeline
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier


def _make_model_pipeline(model):
    '''Builds a preprocessing + model pipeline around ``model``.

    Each pipeline gets its own unfitted copy of ``data_pipeline``. Pipeline
    does not copy its steps, so sharing one preprocessor instance would mean
    that fitting any of the model pipelines refits the preprocessor used by
    all the others.
    '''
    return Pipeline([
        ('preprocessor', clone(data_pipeline)),
        ('model', model)
    ])


rf_model_pipeline = _make_model_pipeline(
    RandomForestClassifier(n_estimators=100,
                           max_depth=10,
                           class_weight='balanced',
                           random_state=123,
                           verbose=0)
)

#AdaBoostClassifier has no `verbose` parameter; passing it raised a TypeError
ada_model_pipeline = _make_model_pipeline(
    AdaBoostClassifier(n_estimators=100,
                       learning_rate=0.1,
                       random_state=123)
)

gb_model_pipeline = _make_model_pipeline(
    GradientBoostingClassifier(n_estimators=100,
                               max_depth=10,
                               learning_rate=0.1,
                               random_state=123,
                               verbose=0)
)
