In [1]:
# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Data Handling and Processing
import numpy as np
import pandas as pd
import math
from sklearn.impute import KNNImputer
from sklearn.utils import shuffle
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, PowerTransformer
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import viztoolz as viz
import mltoolz as mlt
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Model Selection, Metrics & Evaluation
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

# ------------------------------------------------------------------------------------------------------------------------------------------------------------
# Pipeline Construction 
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# ------------------------------------------------------------------------------------------------------------------------------------------------------------

In [2]:
# Load the processed training split (path is relative to the notebook's folder).
train = pd.read_csv(filepath_or_buffer='../data/processed/train.csv')

In [4]:
# Feature-name lists. The `ohe_*` / `scaler_*` prefixes suggest one-hot-encoding
# and scaling targets respectively -- confirm against the pipeline cells.

# Set 1: base categorical-style columns.
ohe_cols = [
    'HomePlanet', 'Destination', 'Deck', 'Side', 'CabinPosition',
    'InGroup', 'VIP', 'CryoSleep', 'GroupSize',
]

# Set 2: set 1 plus the per-amenity "*_used" and "*_big_spender" flags.
# Built from `ohe_cols` (as a new list) so the shared prefix cannot drift.
# Fix: 'VRDeck_big_spender' and 'TotalSpent_big_spender' were previously fused
# into ONE string ('VRDeck_big_spender, TotalSpent_big_spender') by a missing
# quote pair, which can never match a real column name.
# NOTE(review): confirm both flag columns exist in train.csv.
ohe_cols_set2 = ohe_cols + [
    'RoomService_used', 'FoodCourt_used', 'ShoppingMall_used',
    'Spa_used', 'VRDeck_used',
    'RoomService_big_spender', 'FoodCourt_big_spender',
    'ShoppingMall_big_spender', 'Spa_big_spender',
    'VRDeck_big_spender', 'TotalSpent_big_spender',
]

# Set 1: age, every individual spend column and the total.
scaler_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpent']

# Set 2: only age and the total spend.
scaler_cols_set2 = ['Age', 'TotalSpent']

In [5]:
# Baseline estimators, one instance per algorithm, all with fixed hyperparameters.

# Linear and kernel models.
lr = LogisticRegression(
    max_iter=1000,
    C=1,
)
svc = SVC(
    kernel='sigmoid',
    gamma='scale',
    C=1,
)

# Tree ensembles; rf, lgbm and xgbm are seeded with random_state=42.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
lgbm = LGBMClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
    verbose=-1,
)
xgbm = XGBClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
# CatBoost is handed the `ohe_cols` list as its categorical features.
# NOTE(review): no explicit seed is passed here, unlike the three models above.
catb = CatBoostClassifier(
    iterations=100,
    depth=5,
    l2_leaf_reg=2,
    cat_features=ohe_cols,
)

# Short-name -> estimator lookup, in the same order the models are defined.
base_algs = dict(lr=lr, svc=svc, rf=rf, lgbm=lgbm, xgbm=xgbm, catb=catb)

In [6]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 29 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   PassengerId               8693 non-null   object 
 1   HomePlanet                8693 non-null   object 
 2   CryoSleep                 8693 non-null   int64  
 3   Destination               8693 non-null   object 
 4   Age                       8693 non-null   float64
 5   VIP                       8693 non-null   int64  
 6   RoomService               8693 non-null   float64
 7   FoodCourt                 8693 non-null   float64
 8   ShoppingMall              8693 non-null   float64
 9   Spa                       8693 non-null   float64
 10  VRDeck                    8693 non-null   float64
 11  Transported               8693 non-null   int64  
 12  GroupSize                 8693 non-null   int64  
 13  InGroup                   8693 non-null   int64  
 14  Deck    