## Model Training

### Environment setting

In [17]:
import pickle
import os
import itertools
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from utilsML import run_experiments, expand_parameters, time_gridsearch, constrained_combinations

### Read processed data

In [18]:
cache_file = "procData.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass

Read preprocessed data from cache file: procData.pkl


In [19]:
df_train = cache_data['proc_data']
df_train.head(5).T

Unnamed: 0,0,1,2,3,4
PassengerId,0001_01,0002_01,0003_01,0003_02,0004_01
HomePlanet,Europa,Earth,Europa,Europa,Earth
CryoSleep,False,False,False,False,False
Cabin,B/0/P,F/0/S,A/0/S,A/0/S,F/1/S
Destination,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e
Age,0.711945,-0.334037,2.036857,0.293552,-0.891895
VIP,False,False,True,False,False
RoomService,-0.333105,-0.168073,-0.268001,-0.333105,0.125652
FoodCourt,-0.281027,-0.275387,1.959998,0.52301,-0.237159
ShoppingMall,-0.283579,-0.241771,-0.283579,0.336851,-0.031059


In [20]:
X_train = df_train.drop('Transported', axis=1)
y_train = df_train['Transported']

### Basic initial models

Initially, we will evaluate two basic models, each relying on a single predictor variable strongly linked to the target: 'CryoSleep' and 'log10TotalExpenses1'. Predictions are made directly from these inputs and metrics for the classification are calculated.

#### 'CryoSleep' feature

This 'model' achieves an accuracy of 0.72

In [5]:
le = LabelEncoder()
y_true = le.fit_transform(df_train['Transported'])  
y_pred = le.fit_transform(df_train['CryoSleep'])   

print(pd.crosstab(df_train['Transported'], df_train['CryoSleep']))
print(classification_report(y_true, y_pred))

CryoSleep    False  True 
Transported              
False         3761    554
True          1895   2483
              precision    recall  f1-score   support

           0       0.66      0.87      0.75      4315
           1       0.82      0.57      0.67      4378

    accuracy                           0.72      8693
   macro avg       0.74      0.72      0.71      8693
weighted avg       0.74      0.72      0.71      8693



#### 'log10_TotalExpenses1' feature

This 'model' uses a feature calculated from 'TotalExpenses' feature. Samples are classified into two groups based on whether the log10 of the TotalExpenses variable is greater than or less than 1. This 'model' achieves an accuracy of 0.74.

In [6]:
le = LabelEncoder()
y_true = le.fit_transform(df_train['Transported'])  
y_pred = le.fit_transform(df_train['log10_TotalExpenses1'])   

print(pd.crosstab(df['Transported'], df_train['log10_TotalExpenses1']))
print(classification_report(y_true, y_pred))

NameError: name 'df' is not defined

### Decision trees models

Feature combinations to test are chosen according to the EDA findings.

In [None]:
onehot_homeplanet = (list(filter(lambda x: any(map(lambda y: y in x,['HomePlanet_'])), df_train.columns)))
main_homeplanet = list(filter(lambda x: any(map(lambda y: y not in x, ['Mars'])),onehot_homeplanet))
onehot_destination = (list(filter(lambda x: any(map(lambda y: y in x,['Destination_'])), df_train.columns)))
main_destination = list(filter(lambda x: any(map(lambda y: y not in x, ['PSO'])),onehot_destination))
expenses_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
log10_expenses_features = ['log10_RoomService', 'log10_FoodCourt', 'log10_ShoppingMall', 'log10_Spa', 'log10_VRDeck']
log10_expenses_features1 = ['log10_RoomService1', 'log10_FoodCourt1', 'log10_ShoppingMall1', 'log10_Spa1', 'log10_VRDeck1']

cryosleepAge12 = ['CryoSleep_True','Age12','Deck_B','Side_S'] + main_homeplanet + main_destination
log10TE = ['log10_TotalExpenses','Deck_B','Side_S'] + main_homeplanet + main_destination
log10TE1 = ['log10_TotalExpenses1','Deck_B','Side_S'] + main_homeplanet + main_destination
expenses = expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10Expenses = log10_expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10Expenses1 = log10_expenses_features1 + ['Deck_B','Side_S'] + main_homeplanet + main_destination

The 'max_depth' hyperparameter is tested with various values, using all the feature combinations described above.

In [None]:
# combinations of model to check (algorithm + parameters)
dataproc = {'cryosleepAge12': cryosleepAge12, 'log10TE': log10TE, 'log10TE1': log10TE1, 
            'expenses': expenses, 'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(2, 15)}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

The highest performance is achieved with the 'expenses', 'log10Expenses', and 'log10Expenses1' datasets.

In [None]:
max_indices = pd.DataFrame(results).drop(['features'],axis=1).groupby('feature_name')['test_score'].idxmax()
df_max = pd.DataFrame(results).drop(['features'],axis=1).loc[max_indices]
expand_parameters(df_max)

It is reduced to the allowed range of values for 'max_depth' and tested with the hyperparameters 'min_samples_leaf', 'min_samples_split', and 'splitter'.

In [None]:
# combinations of model to check (algorithm + parameters)
dataproc = {'cryosleepAge12': cryosleepAge12, 'log10TE': log10TE, 'log10TE1': log10TE1, 
            'expenses': expenses, 'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(2, 8),
                    'min_samples_leaf': np.arange(2,10),
                    'min_samples_split': np.arange(2,10),
                    'splitter': ['best','random']}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

When optimization of more hyperparameters is allowed, better score values are achieved. The best values still appear for the feature sets 'expenses', 'log10Expenses', and 'log10Expenses1'.

In [None]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)

expand_parameters(df_max3)

In [None]:
# TODO Why 'log10TE1' 2, 2, 2???

Peak scores per dataset:

- 'expense': max_depth=6, min_samples_leaf=4 and any min_samples_split

- 'log10Expenses1': max_depth=6, min_samples_leaf=9 and any min_samples_split

- 'log10Expenses': max_depth=7, min_samples_leaf=7, min_samples_split=3

In [None]:
df = pd.DataFrame(results).drop(['features'], axis=1)
df_sel = df[df['feature_name'].isin(['expenses','log10Expenses','log10Expenses1'])]
df_max3 = df_sel.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)

expand_parameters(df_max3)

In [None]:
df = pd.DataFrame(results).drop(['features'], axis=1)
df_sel = df[df['feature_name'].isin(['expenses'])]
df_max10 = df_sel.sort_values(['test_score'], ascending=[False]).head(10)

expand_parameters(df_max10)

In the previous calculation, some of the best models reached the maximum allowed value for 'min_samples_leaf'. We now test with an extended range for this parameter while reducing the ranges of other hyperparameters to avoid excessive computation time.

In [None]:
# combinations of model to check (algorithm + parameters)
dataproc = {'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(5, 8),
                    'min_samples_leaf': np.arange(5,15),
                    'min_samples_split': np.arange(2,10),
                    'splitter': ['best','random']}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

Similar results has been achieved for new ranges.

In [None]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)
expand_parameters(df_max3)

Only 'max_depth' and 'min_samples_leaf' are considered.

In [None]:
# combinations of model to check (algorithm + parameters)
dataproc = {'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(5, 8),
                    'min_samples_leaf': np.arange(5,15),
                    'splitter': ['best','random']}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

Results are worst than previous ones so three hyperparameters should be considered for the optimization.

In [None]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)
expand_parameters(df_max3)

New set of parameters is tested. 'CrySleep' and 'Age12' or 'Age' is also considered with log10Expenses dataset.

In [None]:
log10ExpensesCryoSleep = ['CryoSleep_True'] + log10_expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10ExpensesCryoSleepAge12 = ['CryoSleep_True','Age12'] + log10_expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10ExpensesCryoSleepAge = ['CryoSleep_True','Age'] + log10_expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination

'max_depth', 'min_samples_leaf' and 'min_samples_split' are considered. 'splitter' is set to 'random'

In [None]:
# combinations of model to check (algorithm + parameters)
dataproc = {'log10ExpensesCryoSleep': log10ExpensesCryoSleep, 
            'log10ExpensesCryoSleepAge12': log10ExpensesCryoSleepAge12, 
            'log10ExpensesCryoSleepAge': log10ExpensesCryoSleepAge} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini', splitter='random')],
    'parameters': [{'max_depth': np.arange(6, 20),
                    'min_samples_leaf': np.arange(5,20),
                    'min_samples_split': np.arange(5,15)}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

Better scores are achieved when 'CryoSleep' and 'Age12' is added. 'Age12' has more information than 'Age'. Best model use 'log10ExpensesCryoSleepAge12' dataset and has these hyperparameters:

- max_depth = 17
- min_samples_leaf = 16
- min_samples_split = 10

test_score = 0.787

In [None]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)
expand_parameters(df_max3)

In [None]:
# TODO list of list of parameters --> All combinations from list of list minimum with two parameters

In [21]:
# Buils list of list with features to combine
# TODO Could this be improved?
onehot_homeplanet = (list(filter(lambda x: any(map(lambda y: y in x,['HomePlanet_'])), df_train.columns)))
main_homeplanet = list(filter(lambda x: any(map(lambda y: y not in x, ['Mars'])),onehot_homeplanet))
onehot_destination = (list(filter(lambda x: any(map(lambda y: y in x,['Destination_'])), df_train.columns)))
main_destination = list(filter(lambda x: any(map(lambda y: y not in x, ['PSO'])),onehot_destination))
onehot_deck = (list(filter(lambda x: any(map(lambda y: y in x,['Deck_'])), df_train.columns)))
expenses_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
log10_expenses_features = ['log10_RoomService', 'log10_FoodCourt', 'log10_ShoppingMall', 'log10_Spa', 'log10_VRDeck']
log10_expenses_features1 = ['log10_RoomService1', 'log10_FoodCourt1', 'log10_ShoppingMall1', 'log10_Spa1', 'log10_VRDeck1']

list_of_features = [['Age','Age12'], 
                    ['CryoSleep'], 
                    ['Side_S'], 
                    [expenses_features, log10_expenses_features, 'log10_TotalExpenses', log10_expenses_features1, 'log10_TotalExpenses1'],
                    ['Deck_B',['Deck_B','Deck_C','Deck_E','Deck_F'], onehot_deck],
                    [main_homeplanet],
                    [main_destination]
                   ]

In [7]:
# Test with largest list of features and more complicated model to check time and then build test
lst_datasets = constrained_combinations(list_of_features)
print(f'Number of datasets to test: {len(lst_datasets)}\n')
features_max = list(filter(lambda x: len(x) == max(list(map(len, lst_datasets))), lst_datasets))[1]
features_min = list(filter(lambda x: len(x) == min(list(map(len, lst_datasets))), lst_datasets))[1]

params = {'max_depth': np.arange(3, 22, 3),
          'min_samples_leaf': np.arange(3, 22, 3),
          'min_samples_split': np.arange(3, 22, 3),
          'splitter': ['random', 'best']}
grid = GridSearchCV(DecisionTreeClassifier(criterion='gini'), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time_min, num_tests_min = time_gridsearch(grid, features_min, X_train, y_train)
print(f'Num features: {len(features_min)} Num comb hyperparam: {num_tests_min} Execution time: {time_min:.2f}')
time_max, num_tests_max = time_gridsearch(grid, features_max, X_train, y_train)
print(f'Num features: {len(features_max)} Num comb hyperparam: {num_tests_max} Execution time: {time_max:.2f}')

Number of datasets to test: 1137

Num features: 2 Num comb hyperparam: 686 Execution time: 11.52
Num features: 20 Num comb hyperparam: 686 Execution time: 40.14


In [None]:
# combinations of model to check (algorithm + parameters)
#lst_datasets_sel = lst_datasets[:3]
dict_datasets = {f'dataset{i+1}': lst for i, lst in enumerate(lst_datasets)}
#dict_datasets = {f'dataset{i+1}': lst for i, lst in enumerate(lst_datasets_sel)}
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(3, 22, 3),
                    'min_samples_leaf': np.arange(3, 22, 3),
                    'min_samples_split': np.arange(3, 22, 3),
                    'splitter': ['random', 'best']}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 2)

Datasets analysis:   0%|                                                                                                     | 0/1137 [00:00<?, ?it/s]

In [12]:
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearch.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearch.pkl


In [14]:
cache_file = "resultGridSearch.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass

Read preprocessed data from cache file: resultGridSearch.pkl


In [15]:
results = cache_data['results_gridsearch']

In [16]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)
expand_parameters(df_max3)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth,min_samples_leaf,min_samples_split,splitter
640,dataset1,DecisionTree,0.798804,0.77856,0.01481,21,12,18,random
658,dataset1,DecisionTree,0.79518,0.778445,0.015417,21,18,3,random
538,dataset1,DecisionTree,0.797193,0.778214,0.020238,18,12,12,random
1258,dataset2,DecisionTree,0.789572,0.780515,0.012517,18,18,21,random
1094,dataset2,DecisionTree,0.801133,0.77994,0.01246,15,6,6,random
1122,dataset2,DecisionTree,0.790895,0.779826,0.013573,15,12,6,random
1696,dataset3,DecisionTree,0.785949,0.777524,0.0126,12,9,6,random
1800,dataset3,DecisionTree,0.793944,0.776375,0.013628,15,9,15,random
2026,dataset3,DecisionTree,0.791183,0.775799,0.013011,21,15,18,random


In [11]:
# TODO Save results
# TODO Implement best parameter
# TODO Video Imputation Kaggle
# TODO Random Forest
# TODO Feature importance
# TODO Boosted Tree sklearn
# TODO XGBoost
# TODO HistGradienteBoostingClassifier

In [57]:
best_model = pd.DataFrame(results).sort_values('acc_val', ascending=False).iloc[0,:].to_dict()
for key, value in best_model.items():
    print(f'{key}: {value}')

features: ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'CryoSleep_True']
algorithm: DecisionTreeClassifier(splitter='random')
parameters: {'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 4}
acc_val: 0.7839655609383592
