In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import  MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
import lightgbm as lgb
import xgboost as xgb


In [15]:
#Import csv

from sklearn.datasets import fetch_openml

X, y = fetch_openml(
    "titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
X.head()

Unnamed: 0,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [16]:
X.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,body
count,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,39.0,1.0,0.0,31.275,256.0
max,3.0,80.0,8.0,9.0,512.3292,328.0


In [17]:
#Looking for null values
X.isna().sum()

pclass          0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [18]:
# Name, ticket and Passenger id are basically useless 
# ( we _could_ infere something from the titles included in names, 
# but it's a story for another day)
# Cabin column (majority of null values) will be dropped, 
# as will be the 2 rows with missing embarked,
# but Age should be imputed (filling with mean?)

age_counts = X['age'].value_counts(normalize=True)
X['age'].fillna(pd.Series(np.random.choice(age_counts.index, size=len(X.index), p=age_counts.values)), inplace=True)
embarked_counts = X['embarked'].value_counts(normalize=True)
X['embarked'].fillna(pd.Series(np.random.choice(embarked_counts.index, size=len(X.index), p=embarked_counts.values)), inplace=True)
X['fare'].fillna(X['fare'].mean(), inplace=True)
X.drop('name', axis=1, inplace=True)
X.drop('ticket', axis=1, inplace=True)
X.drop('cabin', axis=1, inplace=True)
X.drop('boat', axis=1, inplace=True)
X.drop('home.dest', axis=1, inplace=True)
X.drop('body', axis=1, inplace=True)
X = pd.get_dummies(X, columns=['sibsp'])
X = pd.get_dummies(X, columns=['parch'])
X = pd.get_dummies(X, columns=['pclass'])
X = pd.get_dummies(X, columns=['embarked'])
X = pd.get_dummies(X, columns=['sex'])

In [19]:
X.isna().sum()
#Well done guys / Bravo à tous

age           0
fare          0
sibsp_0       0
sibsp_1       0
sibsp_2       0
sibsp_3       0
sibsp_4       0
sibsp_5       0
sibsp_8       0
parch_0       0
parch_1       0
parch_2       0
parch_3       0
parch_4       0
parch_5       0
parch_6       0
parch_9       0
pclass_1      0
pclass_2      0
pclass_3      0
embarked_C    0
embarked_Q    0
embarked_S    0
sex_female    0
sex_male      0
dtype: int64

In [20]:
# beside fare and age, all others variables are categorical and should be encoded
X.describe()
X.head()

Unnamed: 0,age,fare,sibsp_0,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,sibsp_8,parch_0,...,parch_6,parch_9,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,sex_female,sex_male
0,29.0,211.3375,True,False,False,False,False,False,False,True,...,False,False,True,False,False,False,False,True,True,False
1,0.9167,151.55,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,False,True
2,2.0,151.55,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,True,False
3,30.0,151.55,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,False,True
4,25.0,151.55,False,True,False,False,False,False,False,False,...,False,False,True,False,False,False,False,True,True,False


In [21]:
# scaling data
colonnes = X.columns
scaler = MinMaxScaler()
X[colonnes] = scaler.fit_transform(X)
X

Unnamed: 0,age,fare,sibsp_0,sibsp_1,sibsp_2,sibsp_3,sibsp_4,sibsp_5,sibsp_8,parch_0,...,parch_6,parch_9,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S,sex_female,sex_male
0,0.361169,0.412503,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.009395,0.295806,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.022964,0.295806,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.373695,0.295806,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.311064,0.295806,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,0.179540,0.028213,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1305,0.361169,0.028213,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1306,0.329854,0.014102,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
1307,0.336117,0.014102,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0


In [22]:
# train _ test splitting
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [23]:
y = y.cat.codes.astype('int64')

In [24]:
# Code Factorisation
X_train, X_test, y_train, y_test = train_test_split(X, y)

dico_model = { 
    GradientBoostingClassifier : {'n_estimators': [200, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.005],
    'max_depth': [3, 5, 8]},
    lgb.LGBMClassifier : {'n_estimators': [200, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.005],
    'max_depth': [3, 5, 8]},
    xgb.XGBClassifier : { 'n_estimators': [200, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.005],
    'max_depth': [3, 5, 8]},
    LogisticRegression : {'max_iter' : [100, 200 , 500 , 1000],
    'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']},
    SVC : {'kernel' : ['linear', 'poly','rbf', 'sigmoid']},
    Perceptron : {'max_iter' : [100, 200, 500, 1000],
    'eta0' : [0.5, 1, 1.5]}
}

def model_comp(X_train, X_test, y_train, y_test, modeldictionnary):
    
    #preformatting outputs
    output = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    estimators= dict()
    #Extracting model
    for i, (model, param) in enumerate(modeldictionnary.items()):
        #declaring model 
        mod = model()

        #grid
        grid = GridSearchCV(mod, param, cv = 5, verbose= 2 , n_jobs=-1, refit= False)
        grid.fit(X_train, y_train)

        #Extracting Best Parameters
        best_params = grid.best_params_
        #fitting model with best parameters
        best_mod = model(**best_params)
        best_mod.fit(X_train, y_train)
        y_hat= best_mod.predict(X_test)
        #computing metrics
        accuracy = accuracy_score(y_test, y_hat)
        precision = precision_score(y_test, y_hat)
        recall = recall_score(y_test, y_hat)
        f1 = f1_score(y_test, y_hat)
        metrics = [accuracy, precision, recall, f1]
        #output format
        output.loc[mod.__class__.__name__] = metrics
        #saving best estimators in a dict
        estimators[mod.__class__.__name__] = best_mod

    return output.style.highlight_max(color='darkgrey'), estimators
        
sortie_df, best_estimators = model_comp(X_train, X_test, y_train, y_test, dico_model)
sortie_df

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Number of positive: 376, number of negative: 605
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000659 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 314
[LightGBM] [Info] Number of data points in the train set: 981, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383282 -> initscore=-0.475639
[LightGBM] [Info] Start training from score -0.475639
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
GradientBoostingClassifier,0.810976,0.781818,0.693548,0.735043
LGBMClassifier,0.810976,0.787037,0.685484,0.732759
XGBClassifier,0.804878,0.763158,0.701613,0.731092
LogisticRegression,0.820122,0.782609,0.725806,0.753138
SVC,0.789634,0.747748,0.669355,0.706383
Perceptron,0.695122,0.589552,0.637097,0.612403
