## Voting Model

In this section I combine the best performing models according to PyCaret (LightGBM, Logistic Regression, and Random Forest)


add main folder to the path to access utility functions


In [1]:
import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(parent_dir)

import modules


In [2]:
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

from utils.machine_learning import Rounder

In [3]:
train_data = pd.read_pickle("../../data/train_processed.pkl")
train_data

Unnamed: 0,PassengerNum,Age,HomePlanet,Destination,CabinDeck,CabinSide,CryoSleep,VIP,RoomService,FoodCourt,...,YesShoppingMall,YesSpa,YesVRDeck,YesTotalSpending,LogRoomService,LogFoodCourt,LogShoppingMall,LogSpa,LogVRDeck,LogTotalSpending
0,01,39.0,Europa,TRAPPIST-1e,B,P,False,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,01,24.0,Earth,TRAPPIST-1e,F,S,False,False,109.0,9.0,...,True,True,True,True,4.700480,2.302585,3.258097,6.309918,3.806662,6.602588
2,01,58.0,Europa,TRAPPIST-1e,A,S,False,True,43.0,3576.0,...,False,True,True,True,3.784190,8.182280,0.000000,8.812248,3.912023,9.248021
3,02,33.0,Europa,TRAPPIST-1e,A,S,False,False,0.0,1283.0,...,True,True,True,True,0.000000,7.157735,5.918894,8.110728,5.267858,8.551981
4,01,16.0,Earth,TRAPPIST-1e,F,S,False,False,303.0,70.0,...,True,True,True,True,5.717028,4.262680,5.023881,6.338594,1.098612,6.995766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,01,41.0,Europa,55 Cancri e,A,P,False,True,0.0,6819.0,...,False,True,True,True,0.000000,8.827615,0.000000,7.404888,4.317488,9.052165
8689,01,18.0,Earth,PSO J318.5-22,G,S,True,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8690,01,26.0,Earth,TRAPPIST-1e,G,S,False,False,0.0,0.0,...,True,True,False,True,0.000000,0.000000,7.535297,0.693147,0.000000,7.535830
8691,01,32.0,Europa,55 Cancri e,E,S,False,False,0.0,1049.0,...,False,True,True,True,0.000000,6.956545,0.000000,5.869297,8.082093,8.442039


In [4]:
lgbm_pipeline = pd.read_pickle("../../ML_models_trained/light_gbm_model.pkl")
rf_pipeline = pd.read_pickle("../../ML_models_trained/rf_model.pkl")
log_reg_pipeline = pd.read_pickle("../../ML_models_trained/log_reg_model.pkl")

In [5]:
df = train_data.copy()
X = df.drop(
    columns=[
        "Transported",
        "TotalSpending",
        "LogRoomService",
        "LogFoodCourt",
        "LogShoppingMall",
        "LogSpa",
        "LogVRDeck",
        "LogTotalSpending",
        "LogTotalSpending",
    ]
)
y = df["Transported"]
numerical_columns = list(X.select_dtypes(include="number").drop(columns="CabinBin"))
categorical_columns = list(X.select_dtypes(include=["object"]))

In [6]:
voting_model = VotingClassifier(
    estimators=[
        ("log_reg", log_reg_pipeline),
        ("rf", rf_pipeline),
        ("lgbm", lgbm_pipeline),
    ]
)

voting_model.fit(X, y)



[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495


In [7]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cross_val_score(
    voting_model, X, y, scoring="accuracy", n_jobs=-1, cv=5, error_score="raise"
).mean()

[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1509
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380




[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1512
[LightGBM] [Info] Number of data points in the train set: 6955, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503666 -> initscore=0.014666
[LightGBM] [Info] Start training from score 0.014666
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000964 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1514
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] 

0.8013365616920695

In [8]:
test_data = pd.read_pickle("../../data/test_processed.pkl")
test_data

Unnamed: 0,PassengerNum,Age,HomePlanet,Destination,CabinDeck,CabinSide,CryoSleep,VIP,RoomService,FoodCourt,...,YesShoppingMall,YesSpa,YesVRDeck,YesTotalSpending,LogRoomService,LogFoodCourt,LogShoppingMall,LogSpa,LogVRDeck,LogTotalSpending
0,01,27.0,Earth,TRAPPIST-1e,G,S,True,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,01,19.0,Earth,TRAPPIST-1e,F,S,False,False,0.0,9.0,...,False,True,False,True,0.000000,2.302585,0.000000,7.945910,0.000000,7.949091
2,01,31.0,Europa,55 Cancri e,C,S,True,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,01,38.0,Europa,TRAPPIST-1e,C,S,False,False,0.0,6652.0,...,False,True,True,True,0.000000,8.802823,0.000000,5.204007,6.373320,8.911800
4,01,20.0,Earth,TRAPPIST-1e,F,S,False,False,10.0,0.0,...,True,False,False,True,2.397895,0.000000,6.455199,0.000000,0.000000,6.470800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,02,34.0,Earth,TRAPPIST-1e,G,S,True,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4273,01,42.0,Earth,TRAPPIST-1e,Unknown,Unknown,False,False,0.0,847.0,...,True,True,True,True,0.000000,6.742881,2.890372,2.397895,4.976734,6.926577
4274,01,,Mars,55 Cancri e,D,P,True,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4275,01,,Europa,Unknown,D,P,False,False,0.0,2680.0,...,False,False,True,True,0.000000,7.893945,0.000000,0.000000,6.261492,8.072155


In [9]:
pass_id = pd.read_csv(
    "../../data/test.csv",
    usecols=[0],
)
pass_id

Unnamed: 0,PassengerId
0,0013_01
1,0018_01
2,0019_01
3,0021_01
4,0023_01
...,...
4272,9266_02
4273,9269_01
4274,9271_01
4275,9273_01


In [10]:
preds = voting_model.predict(test_data)



In [11]:
submission = pd.DataFrame(
    {"PassengerId": pass_id["PassengerId"].values, "Transported": preds}
)

In [12]:
submission.to_csv(
    "../../data/submission.csv",
    index=False,
)

In [13]:
import pickle

model = voting_model
model.fit(X, y)

with open(
    "../../deployment/app/model.pkl",
    "wb",
) as file:
    pickle.dump(model, file)



[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495


get test data for app deployment

In [14]:
X

Unnamed: 0,PassengerNum,Age,HomePlanet,Destination,CabinDeck,CabinSide,CryoSleep,VIP,RoomService,FoodCourt,...,VIPMissing,PartySize,FamilyGroupMember,CabinBin,YesRoomService,YesFoodCourt,YesShoppingMall,YesSpa,YesVRDeck,YesTotalSpending
0,01,39.0,Europa,TRAPPIST-1e,B,P,False,False,0.0,0.0,...,False,1.0,False,0.0,False,False,False,False,False,False
1,01,24.0,Earth,TRAPPIST-1e,F,S,False,False,109.0,9.0,...,False,1.0,False,0.0,True,True,True,True,True,True
2,01,58.0,Europa,TRAPPIST-1e,A,S,False,True,43.0,3576.0,...,False,2.0,True,0.0,True,True,False,True,True,True
3,02,33.0,Europa,TRAPPIST-1e,A,S,False,False,0.0,1283.0,...,False,2.0,True,0.0,False,True,True,True,True,True
4,01,16.0,Earth,TRAPPIST-1e,F,S,False,False,303.0,70.0,...,False,1.0,False,0.0,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,01,41.0,Europa,55 Cancri e,A,P,False,True,0.0,6819.0,...,False,1.0,False,0.0,False,True,False,True,True,True
8689,01,18.0,Earth,PSO J318.5-22,G,S,True,False,0.0,0.0,...,False,1.0,False,4.0,False,False,False,False,False,False
8690,01,26.0,Earth,TRAPPIST-1e,G,S,False,False,0.0,0.0,...,False,1.0,False,4.0,False,False,True,True,False,True
8691,01,32.0,Europa,55 Cancri e,E,S,False,False,0.0,1049.0,...,False,2.0,True,2.0,False,True,False,True,True,True


In [15]:
X.dtypes

PassengerNum           object
Age                   float64
HomePlanet             object
Destination            object
CabinDeck              object
CabinSide              object
CryoSleep              object
VIP                    object
RoomService           float64
FoodCourt             float64
ShoppingMall          float64
Spa                   float64
VRDeck                float64
NameMissing            object
HomeMissing            object
DestinationMissing     object
CabinMissing           object
CryoMissing            object
VIPMissing             object
PartySize             float64
FamilyGroupMember      object
CabinBin              float64
YesRoomService         object
YesFoodCourt           object
YesShoppingMall        object
YesSpa                 object
YesVRDeck              object
YesTotalSpending       object
dtype: object

In [17]:
import pickle 

test_data = X.to_dict(orient='records')[0]
filename = '../../deployment/app/test_data.pkl'
with open(filename, 'wb') as file:
    pickle.dump(test_data, file)