## Model Training

### Environment setting

In [10]:
import pickle
import os
import itertools
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from utilsML import run_experiments, expand_parameters

### Read processed data

In [11]:
cache_file = "procData.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass

Read preprocessed data from cache file: procData.pkl


In [12]:
df = cache_data['proc_data']
df.head(5).T

Unnamed: 0,0,1,2,3,4
PassengerId,0001_01,0002_01,0003_01,0003_02,0004_01
HomePlanet,Europa,Earth,Europa,Europa,Earth
CryoSleep,False,False,False,False,False
Cabin,B/0/P,F/0/S,A/0/S,A/0/S,F/1/S
Destination,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e
Age,0.711945,-0.334037,2.036857,0.293552,-0.891895
VIP,False,False,True,False,False
RoomService,-0.333105,-0.168073,-0.268001,-0.333105,0.125652
FoodCourt,-0.281027,-0.275387,1.959998,0.52301,-0.237159
ShoppingMall,-0.283579,-0.241771,-0.283579,0.336851,-0.031059


In [13]:
X_train = df.drop('Transported', axis=1)
y_train = df['Transported']

### Basic initial models

Initially, we will evaluate two basic models, each relying on a single predictor variable strongly linked to the target: 'CryoSleep' and 'log10TotalExpenses1'. Predictions are made directly from these inputs and metrics for the classification are calculated.

#### 'CryoSleep' feature

This 'model' achieves an accuracy of 0.72

In [14]:
le = LabelEncoder()
y_true = le.fit_transform(df['Transported'])  
y_pred = le.fit_transform(df['CryoSleep'])   

print(pd.crosstab(df['Transported'], df['CryoSleep']))
print(classification_report(y_true, y_pred))

CryoSleep    False  True 
Transported              
False         3761    554
True          1895   2483
              precision    recall  f1-score   support

           0       0.66      0.87      0.75      4315
           1       0.82      0.57      0.67      4378

    accuracy                           0.72      8693
   macro avg       0.74      0.72      0.71      8693
weighted avg       0.74      0.72      0.71      8693



#### 'log10_TotalExpenses1' feature

This 'model' uses a feature calculated from 'TotalExpenses' feature. Samples are classified into two groups based on whether the log10 of the TotalExpenses variable is greater than or less than 1. This 'model' achieves an accuracy of 0.74.

In [15]:
le = LabelEncoder()
y_true = le.fit_transform(df['Transported'])  
y_pred = le.fit_transform(df['log10_TotalExpenses1'])   

print(pd.crosstab(df['Transported'], df['log10_TotalExpenses1']))
print(classification_report(y_true, y_pred))

log10_TotalExpenses1   0.0   1.0
Transported                     
False                 3530   785
True                  1505  2873
              precision    recall  f1-score   support

           0       0.70      0.82      0.76      4315
           1       0.79      0.66      0.72      4378

    accuracy                           0.74      8693
   macro avg       0.74      0.74      0.74      8693
weighted avg       0.74      0.74      0.73      8693



### Decision trees models

Feature combinations to test are chosen according to the EDA findings.

In [16]:
onehot_homeplanet = (list(filter(lambda x: any(map(lambda y: y in x,['HomePlanet_'])), df.columns)))
main_homeplanet = list(filter(lambda x: any(map(lambda y: y not in x, ['Mars'])),onehot_homeplanet))
onehot_destination = (list(filter(lambda x: any(map(lambda y: y in x,['Destination_'])), df.columns)))
main_destination = list(filter(lambda x: any(map(lambda y: y not in x, ['PSO'])),onehot_destination))
expenses_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
log10_expenses_features = ['log10_RoomService', 'log10_FoodCourt', 'log10_ShoppingMall', 'log10_Spa', 'log10_VRDeck']
log10_expenses_features1 = ['log10_RoomService1', 'log10_FoodCourt1', 'log10_ShoppingMall1', 'log10_Spa1', 'log10_VRDeck1']

cryosleepAge12 = ['CryoSleep_True','Age12','Deck_B','Side_S'] + main_homeplanet + main_destination
log10TE = ['log10_TotalExpenses','Deck_B','Side_S'] + main_homeplanet + main_destination
log10TE1 = ['log10_TotalExpenses1','Deck_B','Side_S'] + main_homeplanet + main_destination
expenses = expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10Expenses = log10_expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10Expenses1 = log10_expenses_features1 + ['Deck_B','Side_S'] + main_homeplanet + main_destination



The 'max_depth' hyperparameter is tested with various values, using all the feature combinations described above.

In [17]:
# combinations of model to check (algorithm + parameters)
dataproc = {'cryosleepAge12': cryosleepAge12, 'log10TE': log10TE, 'log10TE1': log10TE1, 
            'expenses': expenses, 'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(2, 15)}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

The highest performance is achieved with the 'expenses', 'log10Expenses', and 'log10Expenses1' datasets.

In [18]:
max_indices = pd.DataFrame(results).drop(['features'],axis=1).groupby('feature_name')['test_score'].idxmax()
df_max = pd.DataFrame(results).drop(['features'],axis=1).loc[max_indices]
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth
5,cryosleepAge12,DecisionTree,0.737145,0.734502,0.004797,7
43,expenses,DecisionTree,0.797193,0.77603,0.015629,6
56,log10Expenses,DecisionTree,0.797193,0.775915,0.015902,6
69,log10Expenses1,DecisionTree,0.759951,0.754288,0.006161,6
13,log10TE,DecisionTree,0.737145,0.737148,0.004703,2
26,log10TE1,DecisionTree,0.73657,0.736573,0.003835,2


It is reduced to the allowed range of values for 'max_depth' and tested with the hyperparameters 'min_samples_leaf', 'min_samples_split', and 'splitter'.

In [19]:
# combinations of model to check (algorithm + parameters)
dataproc = {'cryosleepAge12': cryosleepAge12, 'log10TE': log10TE, 'log10TE1': log10TE1, 
            'expenses': expenses, 'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(2, 8),
                    'min_samples_leaf': np.arange(2,10),
                    'min_samples_split': np.arange(2,10),
                    'splitter': ['best','random']}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

When optimization of more hyperparameters is allowed, better score values are achieved. The best values still appear for the feature sets 'expenses', 'log10Expenses', and 'log10Expenses1'.

In [20]:
max_indices = pd.DataFrame(results).drop(['features'],axis=1).groupby('feature_name')['test_score'].idxmax()
df_max = pd.DataFrame(results).drop(['features'],axis=1).loc[max_indices]

expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth,min_samples_leaf,min_samples_split,splitter
576,cryosleepAge12,DecisionTree,0.736886,0.735307,0.005201,6,6,2,best
2848,expenses,DecisionTree,0.796474,0.77695,0.015259,6,4,2,best
3779,log10Expenses,DecisionTree,0.788853,0.781551,0.007156,7,6,3,random
4464,log10Expenses1,DecisionTree,0.759117,0.754748,0.005925,6,9,2,best
1275,log10TE,DecisionTree,0.737145,0.738873,0.004692,5,9,7,random
1536,log10TE1,DecisionTree,0.73657,0.736573,0.003669,2,2,2,best


In [None]:
# TODO Why 'log10TE1' 2, 2, 2???

Peak scores per dataset:

- 'expense': max_depth=6, min_samples_leaf=4 and any min_samples_split

- 'log10Expenses1': max_depth=6, min_samples_leaf=9 and any min_samples_split

- 'log10Expenses': max_depth=7, min_samples_leaf=7, min_samples_split=3

In [34]:
df = pd.DataFrame(results).drop(['features'], axis=1)
df_sel = df[df['feature_name'].isin(['expenses','log10Expenses','log10Expenses1'])]
df_max3 = df_sel.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)

expand_parameters(df_max3)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth,min_samples_leaf,min_samples_split,splitter
2848,expenses,DecisionTree,0.796474,0.77695,0.015259,6,4,2,best
2850,expenses,DecisionTree,0.796474,0.77695,0.015286,6,4,3,best
2852,expenses,DecisionTree,0.796474,0.77695,0.016735,6,4,4,best
3779,log10Expenses,DecisionTree,0.788853,0.781551,0.007156,7,6,3,random
3801,log10Expenses,DecisionTree,0.78707,0.78063,0.00785,7,7,6,random
3703,log10Expenses,DecisionTree,0.782095,0.778099,0.006432,6,9,5,random
4464,log10Expenses1,DecisionTree,0.759117,0.754748,0.005925,6,9,2,best
4465,log10Expenses1,DecisionTree,0.759117,0.754748,0.006153,6,9,2,random
4466,log10Expenses1,DecisionTree,0.759117,0.754748,0.006017,6,9,3,best


In [40]:
df = pd.DataFrame(results).drop(['features'], axis=1)
df_sel = df[df['feature_name'].isin(['expenses'])]
df_max10 = df_sel.sort_values(['test_score'], ascending=[False]).head(10)

expand_parameters(df_max10)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth,min_samples_leaf,min_samples_split,splitter
2850,expenses,DecisionTree,0.796474,0.77695,0.015286,6,4,3,best
2852,expenses,DecisionTree,0.796474,0.77695,0.016735,6,4,4,best
2854,expenses,DecisionTree,0.796474,0.77695,0.016244,6,4,5,best
2856,expenses,DecisionTree,0.796474,0.77695,0.016251,6,4,6,best
2858,expenses,DecisionTree,0.796474,0.77695,0.016141,6,4,7,best
2860,expenses,DecisionTree,0.796474,0.77695,0.015325,6,4,8,best
2862,expenses,DecisionTree,0.796474,0.77695,0.015268,6,4,9,best
2848,expenses,DecisionTree,0.796474,0.77695,0.015259,6,4,2,best
2834,expenses,DecisionTree,0.79659,0.77672,0.015328,6,3,3,best
2832,expenses,DecisionTree,0.79659,0.77672,0.015479,6,3,2,best


In the previous calculation, some of the best models reached the maximum allowed value for 'min_samples_leaf'. We now test with an extended range for this parameter while reducing the ranges of other hyperparameters to avoid excessive computation time.

In [42]:
# combinations of model to check (algorithm + parameters)
dataproc = {'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(5, 8),
                    'min_samples_leaf': np.arange(5,15),
                    'min_samples_split': np.arange(2,10),
                    'splitter': ['best','random']}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

In [45]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)
expand_parameters(df_max3)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth,min_samples_leaf,min_samples_split,splitter
455,log10Expenses,DecisionTree,0.785891,0.779479,0.006959,7,13,5,random
343,log10Expenses,DecisionTree,0.787387,0.778905,0.007002,7,6,5,random
403,log10Expenses,DecisionTree,0.78546,0.778674,0.00696,7,10,3,random
704,log10Expenses1,DecisionTree,0.759117,0.754748,0.006051,6,9,2,best
705,log10Expenses1,DecisionTree,0.759117,0.754748,0.006462,6,9,2,random
706,log10Expenses1,DecisionTree,0.759117,0.754748,0.006054,6,9,3,best


In [11]:
# TODO Check above results
# TODO Video Imputation Kaggle
# TODO COmplete FeatureEngineer
# TODO Random Forest
# TODO Feature importance
# TODO Boosted Tree sklearn
# TODO XGBoost
# TODO HistGradienteBoostingClassifier

In [95]:
df_results = pd.DataFrame(results).drop(['features'],axis=1)
df_results[df_results['feature_name'] == 'log10TE']

Unnamed: 0,feature_name,model,parameters,train_score,test_score,fit_time
416,log10TE,DecisionTree,"{'max_depth': 2, 'min_samples_split': 2, 'min_...",0.737145,0.737148,0.004820
417,log10TE,DecisionTree,"{'max_depth': 2, 'min_samples_split': 2, 'min_...",0.722709,0.716665,0.003752
418,log10TE,DecisionTree,"{'max_depth': 2, 'min_samples_split': 2, 'min_...",0.737145,0.737148,0.004792
419,log10TE,DecisionTree,"{'max_depth': 2, 'min_samples_split': 2, 'min_...",0.732256,0.734388,0.003861
420,log10TE,DecisionTree,"{'max_depth': 2, 'min_samples_split': 2, 'min_...",0.737145,0.737148,0.004703
...,...,...,...,...,...,...
827,log10TE,DecisionTree,"{'max_depth': 14, 'min_samples_split': 8, 'min...",0.740366,0.729556,0.004864
828,log10TE,DecisionTree,"{'max_depth': 14, 'min_samples_split': 8, 'min...",0.763488,0.711149,0.010648
829,log10TE,DecisionTree,"{'max_depth': 14, 'min_samples_split': 8, 'min...",0.739820,0.731855,0.004795
830,log10TE,DecisionTree,"{'max_depth': 14, 'min_samples_split': 8, 'min...",0.763459,0.711379,0.010674


In [57]:
best_model = pd.DataFrame(results).sort_values('acc_val', ascending=False).iloc[0,:].to_dict()
for key, value in best_model.items():
    print(f'{key}: {value}')

features: ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'CryoSleep_True']
algorithm: DecisionTreeClassifier(splitter='random')
parameters: {'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 4}
acc_val: 0.7839655609383592


In [None]:
# features for Room Type One Hot Encoding 
expenses_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
sel_features_cat = list(filter(lambda x: any(map(lambda y: y in x,['CryoSleep','HomePlanet', 'Destination'])), features_onehot))
features_onehot_wo_deck = list(filter(lambda x: 'Deck' not in x,list(features_onehot)))
onehot_deck_all = list(filter(lambda x: any(map(lambda y: y in x,['Deck'])), features_onehot))
onehot_deckB = list(filter(lambda x: ('Deck' not in x) or (x == 'Deck_B'), features_onehot))
onehot_deckBCEF = ['Deck_B','Deck_C','Deck_E','Deck_F','Deck_notBCEF']

# All features (drop Num and Name) [expenses features/total expenses] 
all_features_expenses = ['Age'] + expenses_features + list(features_onehot) 
all_features_totalexpenses = ['Age', 'TotalExpenses'] + list(features_onehot)
# Selection CryoSleep, HomePlanet, Deck, Destination, expenses features [expenses features/total expenses]
sel_features_expenses = expenses_features + sel_features_cat
sel_features_totalexpenses = ['TotalExpenses'] + sel_features_cat

# combinations of features to check 
#dataproc = [all_features_expenses, all_features_totalexpenses, sel_features_expenses, sel_features_totalexpenses]
dataproc = {'all_expenses': all_features_expenses, 'all_totalexpenses': all_features_totalexpenses, 
             'sel_expenses': sel_features_expenses, 'sel_totalexpenses': sel_features_totalexpenses}

In [40]:
# combinations of model to check (algorithm + parameters)
experiments = {
    'model': ['KNN', 'Ridge', 'DecisionTree', 'BaggingRidge', 'RandomForest']
    'algorithms': [KNeighborsClassifier(), RidgeClassifier(), 
                    DecisionTreeClassifier(criterion='gini'),
                    BaggingClassifier(estimator=RidgeClassifier())
                    RandomForestClassifier()],
    'parameters': [{'n_neighbors': [8, 10, 12, 14], 'weights': ['uniform', 'distance']},
                    {'alpha': [1.0]},
                    {'max_depth': [5, 10, 15], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], splitter['best', 'random']},
                    {'n_estimators': [10, 30, 50], 'max_features': [1.0, 0.7, 0.5], 'max_samples': [1.0, 0.9, 0.8]}
                    {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
                  ]
}

#experiments = {
#    'algorithms': [KNeighborsClassifier(), RidgeClassifier()],
#    'parameters': [{'n_neighbors': [14, 18], 'weights': ['uniform', 'distance']},
#                    {'alpha': [1.0]}
#                  ]
#}
#experiments = {
#    'model': ['KNN', 'Ridge'],
#    'algorithm': [KNeighborsClassifier(), RidgeClassifier()],
#    'parameters': [{'n_neighbors': [14, 18], 'weights': ['uniform', 'distance']},
#                    {'alpha': [1.0]}
#                  ]
#}
# run experiments: features + algorithm + parameters
results = []
for dp, features in dataproc.items():
    X_train_sel = X_train[features]    
    #for index, algorithm in enumerate(experiments['algorithms']):
    for index, model in enumerate(experiments['model']):
        params = experiments['parameters'][index]
        algorithm = experiments['algorithm'][index]
        grid = GridSearchCV(algorithm, param_grid = params, cv = 5, 
                            scoring= 'accuracy', return_train_score = True)
        grid.fit(X_train_sel, y_train)

        lst = list(grid.get_params()['param_grid'].values())
        #for params, mse in zip(itertools.product(*lst),grid.cv_results_['mean_test_score']):
        for params, test_score, train_score, fit_time in zip(itertools.product(*lst),
                                                   grid.cv_results_['mean_test_score'], 
                                                   grid.cv_results_['mean_train_score'],
                                                   grid.cv_results_['mean_fit_time']):
            params_dict = {param: value for param, value in zip(grid.get_params()['param_grid'].keys(), params)}
            result = {'feature_name': dp,'features': features, 'model': model, 'parameters': params_dict,
                  'train_score': train_score, 'test_score': test_score, 'fit_time': fit_time}
            results.append(result)


SyntaxError: invalid syntax. Perhaps you forgot a comma? (1087576589.py, line 3)

In [32]:
pd.DataFrame(results).drop(['features'],axis=1)
max_indices = pd.DataFrame(results).drop(['features'],axis=1).groupby('model')['test_score'].idxmax()
pd.DataFrame(results).drop(['features'],axis=1).loc[max_indices]


Unnamed: 0,feature_name,model,parameters,train_score,test_score,fit_time
2,all_expenses,KNN,"{'n_neighbors': 18, 'weights': 'uniform'}",0.808754,0.780285,0.004548
4,all_expenses,Ridge,{'alpha': 1.0},0.769642,0.767745,0.007219


In [57]:
best_model = pd.DataFrame(results).sort_values('acc_val', ascending=False).iloc[0,:].to_dict()
for key, value in best_model.items():
    print(f'{key}: {value}')

features: ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'CryoSleep_True']
algorithm: DecisionTreeClassifier(splitter='random')
parameters: {'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 4}
acc_val: 0.7839655609383592
