In [None]:
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
import boruta as br
import numpy as np

In [None]:
#Loading the train and test CSV files from the data directory
filepath = './data/'
df_train, df_test = (
    pd.read_csv(filepath + csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
#Number of missing values in each column of the training data
df_train.isnull().sum()

In [None]:
#Setting up pre-processing pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn import set_config

#Asking sklearn transformers to return dataframes instead of arrays
set_config(transform_output='pandas')

#Grouping the df_train feature columns by the pipeline that will handle them
#Note: Age is listed as categorical because it is bucketized later on
cat_vars = [
    'HomePlanet',
    'Destination',
    'Cabin',
    'VIP',
    'Age',
]
num_vars = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'CryoSleep',
]

In [None]:
#Number of passengers per home planet
df_train['HomePlanet'].value_counts()

In [None]:
#Summarising every column for the passengers whose HomePlanet is 'Earth'
#(this describes the Earth subset only; it does not compare counts of the target label)
df_train.loc[df_train.HomePlanet == 'Earth'].describe(include='all')

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class CatVariablesAdder(BaseEstimator, TransformerMixin):
    '''Transformer that derives categorical features from the Cabin column.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns self unchanged.'''
        return self

    def transform(self, X):
        '''Creates new categorical variables.
            Starboard: 1 when the Cabin string ends with 'S' (starboard side), otherwise 0
            Deck: the first character of the Cabin string (the deck the passenger is on)

            X: dataframe with a string 'Cabin' column
            returns: a copy of X with the two columns added; X itself is left untouched
        '''
        #Working on a copy so the caller's frame is never modified
        X = X.copy()
        cabin = X['Cabin']
        #Vectorised string access; a missing cabin yields Starboard 0 and Deck NaN instead of raising
        X['Starboard'] = (cabin.str[-1] == 'S').astype('int64')
        X['Deck'] = cabin.str[0]
        return X
    
class NumVariablesAdder(BaseEstimator, TransformerMixin):
    '''Transformer that adds the 'roommates' and 'Crew' features.

        columns: a pandas Series (e.g. the Cabin column) whose repeated values are counted
    '''

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns self unchanged.'''
        return self

    def transform(self, X):
        '''Creates new numerical variables.
            roommates: how many entries of self.columns share the row's value (the row itself is counted)
            Crew: 1 if spending on all services is 0 and the passenger is not in cryosleep, otherwise 0

            returns: a copy of X with the two columns added; X itself is left untouched
        '''
        #Working on a copy so the caller's frame is never modified
        X = X.copy()
        #NOTE(review): the counts come from the Series given at construction and are aligned to X by
        #index label, so X must share that Series' index; transforming a different frame would pick up
        #values belonging to other rows (or NaN where the label is absent) - confirm against callers
        X['roommates'] = self.columns.map(self.columns.value_counts())
        zero_checked = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CryoSleep']
        #Vectorised form of the row-wise check: every listed column must equal 0 (False counts as 0,
        #missing values never do)
        X['Crew'] = (X[zero_checked] == 0).all(axis=1).astype('int64')
        return X

class ColumnDropper(BaseEstimator, TransformerMixin):
    '''Transformer that removes the given column(s) from a dataframe.

        columns: a column label or a list of column labels to drop
    '''

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns self unchanged.'''
        return self

    def transform(self, X):
        '''Drops specified columns.
            columns: the columns to drop

            returns: a new dataframe without those columns; X itself is left untouched
        '''
        #Returning a new frame instead of dropping in place, so the caller's data is not altered
        return X.drop(columns=self.columns)
    
class Bucketizer(BaseEstimator, TransformerMixin):
    '''Transformer that replaces a numeric column with labelled buckets via pd.cut.'''

    def __init__(self, column_name, bins, labels):
        self.column_name = column_name
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns self unchanged.'''
        return self

    def transform(self, X):
        '''Bucketizes the specified column.
            column_name: the name of the column to bucketize
            bins: the bins to use
            labels: the labels to use

            returns: a copy of X with the column replaced by its bucket labels; X itself is left untouched
        '''
        #Working on a copy so the caller's frame keeps its original numeric column
        X = X.copy()
        #pd.cut is used with its defaults: intervals are right-closed and the lowest edge is excluded,
        #so a value equal to bins[0] (or outside the bins) becomes NaN
        X[self.column_name] = pd.cut(X[self.column_name], bins=self.bins, labels=self.labels)
        return X
    
class SmartImputer(BaseEstimator, TransformerMixin):
    '''Transformer that fills columns with a fixed value on rows matching a condition.'''

    def __init__(self, condition_name, condition_value, column_names, value, overwrite=False):
        self.condition_name = condition_name
        self.condition_value = condition_value
        self.column_names = column_names
        self.value = value
        self.overwrite = overwrite

    def fit(self, X, y=None):
        '''Nothing is learned from the data; returns self unchanged.'''
        return self

    def transform(self, X):
        '''Imputes values based on a condition.
            condition_name: the name of the column to check
            condition_value: the value of the column to check
            column_names: the columns to impute
            value: the value to impute
            overwrite: if False (default) only missing cells are filled; if True every cell on a
                matching row is replaced, including observed values

            returns: a copy of X with the imputed values; X itself is left untouched
        '''
        #Working on a copy so the caller's frame is never modified
        X = X.copy()
        #The condition is evaluated once on the incoming data
        matches = X[self.condition_name] == self.condition_value
        for col in self.column_names:
            #Imputation should only touch missing cells; observed values are kept unless overwrite is set
            target_rows = matches if self.overwrite else matches & X[col].isna()
            X.loc[target_rows, col] = self.value
        return X
    
class OutlierRemover(BaseEstimator, TransformerMixin):
    '''Transformer that caps the given columns at an upper percentile.'''

    def __init__(self, column_names, threshold):
        self.column_names = column_names
        self.threshold = threshold

    def _flat_column_names(self):
        '''Returns column_names as a flat list, unpacking any nested list/tuple entries.'''
        flat = []
        for entry in self.column_names:
            if isinstance(entry, (list, tuple)):
                flat.extend(entry)
            else:
                flat.append(entry)
        return flat

    def fit(self, X, y=None):
        '''Learns the upper cap of every column from X and stores it in upper_bounds_.'''
        cols = self._flat_column_names()
        self.upper_bounds_ = X[cols].quantile(1 - self.threshold)
        return self

    def transform(self, X):
        '''Winsorizes the specified column at a percentile threshold.
            column_names: the names of the columns to winsorize
            threshold: the upper percentile threshold to winsorize at (e.g. 0.05 will winsorize at the 95th percentiles)

            returns: a copy of X with the columns capped; X itself is left untouched
        '''
        cols = self._flat_column_names()
        #Working on a copy so the caller's frame is never modified
        X = X.copy()
        if not cols:
            return X
        #Caps learned in fit are reused so new data is capped with the fitted values;
        #when fit was never called the caps are computed from X itself
        bounds = getattr(self, 'upper_bounds_', None)
        if bounds is None:
            bounds = X[cols].quantile(1 - self.threshold)
        #clip returns a new object (with inplace=True it returns None), so the result is assigned back;
        #axis=1 lines each cap up with the column of the same name
        X[cols] = X[cols].clip(upper=bounds, axis=1)
        return X


In [None]:
#Defining the data pipelines
from sklearn.pipeline import Pipeline

#Numerical pipeline: add derived variables, impute, cap outliers, then scale
#The outlier step receives a flat list of column names (one name per entry)
num_pipeline = Pipeline([
    ('new_variables_adder', NumVariablesAdder(df_train['Cabin'])),
    ('smart_imputer', SmartImputer('CryoSleep', True, ['Spa','RoomService', 'VRDeck', 'FoodCourt', 'ShoppingMall'], 0)),
    ('imputer', KNNImputer(n_neighbors=5)),
    ('outlier_remover', OutlierRemover(['Spa','RoomService', 'VRDeck', 'FoodCourt', 'ShoppingMall'], 0.05)),
    #The step is called 'std_scaler' but applies MinMaxScaler; the name is kept so parameter paths stay valid
    ('std_scaler', MinMaxScaler()),
])

#Categorical pipeline: bucketize Age, impute, derive cabin variables, drop Cabin, then one-hot encode
cat_pipeline = Pipeline([
    #The lowest edge is -1 rather than 0 because pd.cut intervals are right-closed: with a lowest edge
    #of 0 an Age of exactly 0 falls outside every bin and turns into NaN instead of 'child'
    ('bucketizer', Bucketizer('Age', bins=[-1, 10, 25, 75, 100], labels=['child', 'young adult', 'adult', 'senior'])),
    ('smart_imputer_1', SmartImputer('HomePlanet', 'Earth', ['VIP'], False)),
    ('smart_imputer_2', SmartImputer('VIP', True, ['Age'], 'adult')),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('new_variables_adder', CatVariablesAdder()),
    ('column dropper', ColumnDropper('Cabin')),
    #handle_unknown='ignore' encodes a category that was not seen during fit as all zeros instead of
    #raising, which matters for rare categories in cross-validation folds and for the test data
    ('one_hot_encoder', OneHotEncoder(sparse_output = False, handle_unknown='ignore')),
])

In [None]:
#Adding data pipelines to Column Transformer
from sklearn.compose import ColumnTransformer

data_pipeline = ColumnTransformer(
    [
        ('numerical', num_pipeline, num_vars),
        ('categorical', cat_pipeline, cat_vars),
    ],
    verbose_feature_names_out=False,
)

#Transforming the data
#The preprocessing is fitted on the training data only; the test data is transformed with the
#already fitted steps so both frames share the same imputation, scaling and encoded columns
df_train_processed = data_pipeline.fit_transform(df_train)
df_test_processed = data_pipeline.transform(df_test)

In [None]:
#Previewing the first rows of the processed training data instead of dumping the whole frame
df_train_processed.head()

In [None]:
#Listing the column names produced by the preprocessing
df_train_processed.columns

In [None]:
#Adding modeling to pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

#Random forest on top of the preprocessing; seeded so repeated runs give the same model
rf_model_pipeline = Pipeline([
    ('preprocessor', data_pipeline),
    ('model', RandomForestClassifier(random_state=123))
])

#Gradient boosting on top of the same preprocessing
gb_model_pipeline = Pipeline([
    ('preprocessor', data_pipeline),
    ('model', GradientBoostingClassifier(n_estimators=100,
                                            max_depth=10,
                                            learning_rate=0.1,
                                            random_state=123,
                                            verbose=0))
])

#Defining train and hold-out splits of the raw training data (preprocessing happens inside the pipelines)
#The target column is removed from the features so it can never be fed to a model by accident
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    df_train.drop(columns='Transported'),
    df_train['Transported'],
    test_size=0.2,
    random_state=123,
)

In [None]:
#Hyper-parameter grid searched for the random forest ('model__' targets the pipeline's model step)
rf_grid_params = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[5, 10, 15],
    model__min_samples_split=[2, 5],
    model__min_samples_leaf=[1, 2],
)

#Hyper-parameter grid searched for gradient boosting
gb_grid_params = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[5, 10, 15],
    model__learning_rate=[0.1, 0.05, 0.01],
    model__min_samples_split=[2, 5],
    model__min_samples_leaf=[1, 2],
)

#Grid search with 10-fold cross-validation for the random forest pipeline
rf_grid_search = GridSearchCV(
    estimator=rf_model_pipeline,
    param_grid=rf_grid_params,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
rf_grid_search.fit(x_train, y_train)

#Grid search with 10-fold cross-validation for the gradient boosting pipeline
gb_grid_search = GridSearchCV(
    estimator=gb_model_pipeline,
    param_grid=gb_grid_params,
    cv=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
gb_grid_search.fit(x_train, y_train)


#Predicting on the held-out split with each fitted search
rf_pred = rf_grid_search.predict(x_test)
gb_pred = gb_grid_search.predict(x_test)

#Evaluating the models
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

#Accuracy of each model on the held-out split
print('Random Forest Accuracy: ', accuracy_score(y_test, rf_pred))
print('Gradient Boosting Accuracy: ', accuracy_score(y_test, gb_pred))

#Each report starts on its own line so the table header stays aligned with its columns
print('Random Forest Classification Report: ', classification_report(y_test, rf_pred), sep='\n')
print('Gradient Boosting Classification Report: ', classification_report(y_test, gb_pred), sep='\n')

In [None]:
#Scoring the unseen test data with the best gradient boosting pipeline found by the search
best_gb_model = gb_grid_search.best_estimator_
final_predictions = best_gb_model.predict(df_test)

In [None]:
#Generating submission file
#The frame gets its own name so final_predictions keeps holding the raw predictions and this cell
#can be re-run without wrapping an existing dataframe inside a new one
submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Transported': final_predictions})
submission.to_csv(filepath + 'submission.csv', index=False)