## Model Training

### Environment setting

In [1]:
import pickle
import os
import itertools
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from utilsML import run_experiments, expand_parameters, time_gridsearch, constrained_combinations
from utilsFE import process_features

### Read processed data

In [2]:
cache_file = "procData.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass

Read preprocessed data from cache file: procData.pkl


In [3]:
df_train = cache_data['proc_data']
df_train.head(5).T

Unnamed: 0,0,1,2,3,4
PassengerId,0001_01,0002_01,0003_01,0003_02,0004_01
HomePlanet,Europa,Earth,Europa,Europa,Earth
CryoSleep,False,False,False,False,False
Cabin,B/0/P,F/0/S,A/0/S,A/0/S,F/1/S
Destination,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e
Age,0.711945,-0.334037,2.036857,0.293552,-0.891895
VIP,False,False,True,False,False
RoomService,-0.333105,-0.168073,-0.268001,-0.333105,0.125652
FoodCourt,-0.281027,-0.275387,1.959998,0.52301,-0.237159
ShoppingMall,-0.283579,-0.241771,-0.283579,0.336851,-0.031059


In [4]:
X_train = df_train.drop('Transported', axis=1)
y_train = df_train['Transported']

### Basic initial models

Initially, we will evaluate two basic models, each relying on a single predictor variable strongly linked to the target: 'CryoSleep' and 'log10TotalExpenses1'. Predictions are made directly from these inputs and metrics for the classification are calculated.

#### 'CryoSleep' feature

This 'model' achieves an accuracy of 0.72

In [5]:
le = LabelEncoder()
y_true = le.fit_transform(df_train['Transported'])  
y_pred = le.fit_transform(df_train['CryoSleep'])   

print(pd.crosstab(df_train['Transported'], df_train['CryoSleep']))
print(classification_report(y_true, y_pred))

CryoSleep    False  True 
Transported              
False         3761    554
True          1895   2483
              precision    recall  f1-score   support

           0       0.66      0.87      0.75      4315
           1       0.82      0.57      0.67      4378

    accuracy                           0.72      8693
   macro avg       0.74      0.72      0.71      8693
weighted avg       0.74      0.72      0.71      8693



#### 'log10_TotalExpenses1' feature

This 'model' uses a feature calculated from 'TotalExpenses' feature. Samples are classified into two groups based on whether the log10 of the TotalExpenses variable is greater than or less than 1. This 'model' achieves an accuracy of 0.74.

In [None]:
le = LabelEncoder()
y_true = le.fit_transform(df_train['Transported'])  
y_pred = le.fit_transform(df_train['log10_TotalExpenses1'])   

print(pd.crosstab(df['Transported'], df_train['log10_TotalExpenses1']))
print(classification_report(y_true, y_pred))

### Decision trees models

#### Step-by-step aproximation

Feature combinations to test are chosen according to the EDA findings. New combinations of features are built after checking the results.

In [None]:
onehot_homeplanet = (list(filter(lambda x: any(map(lambda y: y in x,['HomePlanet_'])), df_train.columns)))
main_homeplanet = list(filter(lambda x: any(map(lambda y: y not in x, ['Mars'])),onehot_homeplanet))
onehot_destination = (list(filter(lambda x: any(map(lambda y: y in x,['Destination_'])), df_train.columns)))
main_destination = list(filter(lambda x: any(map(lambda y: y not in x, ['PSO'])),onehot_destination))
expenses_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
log10_expenses_features = ['log10_RoomService', 'log10_FoodCourt', 'log10_ShoppingMall', 'log10_Spa', 'log10_VRDeck']
log10_expenses_features1 = ['log10_RoomService1', 'log10_FoodCourt1', 'log10_ShoppingMall1', 'log10_Spa1', 'log10_VRDeck1']

cryosleepAge12 = ['CryoSleep_True','Age12','Deck_B','Side_S'] + main_homeplanet + main_destination
log10TE = ['log10_TotalExpenses','Deck_B','Side_S'] + main_homeplanet + main_destination
log10TE1 = ['log10_TotalExpenses1','Deck_B','Side_S'] + main_homeplanet + main_destination
expenses = expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10Expenses = log10_expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10Expenses1 = log10_expenses_features1 + ['Deck_B','Side_S'] + main_homeplanet + main_destination

The 'max_depth' hyperparameter is tested with various values, using all the feature combinations described above.

In [None]:
# combinations of model to check (algorithm + parameters)
dataproc = {'cryosleepAge12': cryosleepAge12, 'log10TE': log10TE, 'log10TE1': log10TE1, 
            'expenses': expenses, 'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(2, 15)}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

The highest performance is achieved with the 'expenses', 'log10Expenses', and 'log10Expenses1' datasets.

In [None]:
max_indices = pd.DataFrame(results).drop(['features'],axis=1).groupby('feature_name')['test_score'].idxmax()
df_max = pd.DataFrame(results).drop(['features'],axis=1).loc[max_indices]
expand_parameters(df_max)

It is reduced to the allowed range of values for 'max_depth' and tested with the hyperparameters 'min_samples_leaf', 'min_samples_split', and 'splitter'.

In [None]:
# combinations of model to check (algorithm + parameters)
dataproc = {'cryosleepAge12': cryosleepAge12, 'log10TE': log10TE, 'log10TE1': log10TE1, 
            'expenses': expenses, 'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(2, 8),
                    'min_samples_leaf': np.arange(2,10),
                    'min_samples_split': np.arange(2,10),
                    'splitter': ['best','random']}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

When optimization of more hyperparameters is allowed, better score values are achieved. The best values still appear for the feature sets 'expenses', 'log10Expenses', and 'log10Expenses1'.

In [None]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)

expand_parameters(df_max3)

In [None]:
# TODO Why 'log10TE1' 2, 2, 2???

Peak scores per dataset:

- 'expense': max_depth=6, min_samples_leaf=4 and any min_samples_split

- 'log10Expenses1': max_depth=6, min_samples_leaf=9 and any min_samples_split

- 'log10Expenses': max_depth=7, min_samples_leaf=7, min_samples_split=3

In [None]:
df = pd.DataFrame(results).drop(['features'], axis=1)
df_sel = df[df['feature_name'].isin(['expenses','log10Expenses','log10Expenses1'])]
df_max3 = df_sel.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)

expand_parameters(df_max3)

In [None]:
df = pd.DataFrame(results).drop(['features'], axis=1)
df_sel = df[df['feature_name'].isin(['expenses'])]
df_max10 = df_sel.sort_values(['test_score'], ascending=[False]).head(10)

expand_parameters(df_max10)

In the previous calculation, some of the best models reached the maximum allowed value for 'min_samples_leaf'. We now test with an extended range for this parameter while reducing the ranges of other hyperparameters to avoid excessive computation time.

In [None]:
# combinations of model to check (algorithm + parameters)
dataproc = {'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(5, 8),
                    'min_samples_leaf': np.arange(5,15),
                    'min_samples_split': np.arange(2,10),
                    'splitter': ['best','random']}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

Similar results has been achieved for new ranges.

In [None]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)
expand_parameters(df_max3)

Only 'max_depth' and 'min_samples_leaf' are considered.

In [None]:
# combinations of model to check (algorithm + parameters)
dataproc = {'log10Expenses': log10Expenses, 'log10Expenses1': log10Expenses1} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(5, 8),
                    'min_samples_leaf': np.arange(5,15),
                    'splitter': ['best','random']}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

Results are worst than previous ones so three hyperparameters should be considered for the optimization.

In [None]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)
expand_parameters(df_max3)

New set of parameters is tested. 'CrySleep' and 'Age12' or 'Age' is also considered with log10Expenses dataset.

In [39]:
log10ExpensesCryoSleep = ['CryoSleep_True'] + log10_expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10ExpensesCryoSleepAge12 = ['CryoSleep_True','Age12'] + log10_expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination
log10ExpensesCryoSleepAge = ['CryoSleep_True','Age'] + log10_expenses_features + ['Deck_B','Side_S'] + main_homeplanet + main_destination

'max_depth', 'min_samples_leaf' and 'min_samples_split' are considered. 'splitter' is set to 'random'

In [40]:
# combinations of model to check (algorithm + parameters)
dataproc = {'log10ExpensesCryoSleep': log10ExpensesCryoSleep, 
            'log10ExpensesCryoSleepAge12': log10ExpensesCryoSleepAge12, 
            'log10ExpensesCryoSleepAge': log10ExpensesCryoSleepAge} 
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini', splitter='random')],
    'parameters': [{'max_depth': np.arange(6, 20),
                    'min_samples_leaf': np.arange(5,20),
                    'min_samples_split': np.arange(5,15)}]
}
results = run_experiments(dataproc, experiments, X_train, y_train)

Datasets analysis: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [07:46<00:00, 155.57s/it]


Better scores are achieved when 'CryoSleep' and 'Age12' is added. 'Age12' has more information than 'Age'. Best model use 'log10ExpensesCryoSleepAge12' dataset with a test_score 0.787.

In [41]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max3 = df.sort_values(['feature_name', 'test_score'], ascending=[True, False]).groupby('feature_name').head(3)
expand_parameters(df_max3)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth,min_samples_leaf,min_samples_split
2074,log10ExpensesCryoSleep,DecisionTree,0.790636,0.783736,0.007975,19,17,9
520,log10ExpensesCryoSleep,DecisionTree,0.78799,0.783392,0.007633,9,12,5
745,log10ExpensesCryoSleep,DecisionTree,0.790176,0.782931,0.008602,10,19,10
4871,log10ExpensesCryoSleepAge,DecisionTree,0.79495,0.783391,0.008314,10,12,6
5954,log10ExpensesCryoSleepAge,DecisionTree,0.801392,0.782817,0.008905,17,15,9
4664,log10ExpensesCryoSleepAge,DecisionTree,0.795468,0.782701,0.008155,9,6,9
2819,log10ExpensesCryoSleepAge12,DecisionTree,0.793426,0.787532,0.007792,10,16,14
2583,log10ExpensesCryoSleepAge12,DecisionTree,0.793886,0.786958,0.007665,9,8,8
4181,log10ExpensesCryoSleepAge12,DecisionTree,0.795123,0.786729,0.008445,19,18,6


In [43]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth,min_samples_leaf,min_samples_split
2819,log10ExpensesCryoSleepAge12,DecisionTree,0.793426,0.787532,0.007792,10,16,14


#### Brute force approach

All possible combinations of features are built and tested with several hyperparameter configurations.

In [10]:
# Buils list of list with features to combine
# TODO Could this be improved?
onehot_homeplanet = (list(filter(lambda x: any(map(lambda y: y in x,['HomePlanet_'])), df_train.columns)))
main_homeplanet = list(filter(lambda x: any(map(lambda y: y not in x, ['Mars'])),onehot_homeplanet))
onehot_destination = (list(filter(lambda x: any(map(lambda y: y in x,['Destination_'])), df_train.columns)))
main_destination = list(filter(lambda x: any(map(lambda y: y not in x, ['PSO'])),onehot_destination))
onehot_deck = (list(filter(lambda x: any(map(lambda y: y in x,['Deck_'])), df_train.columns)))
expenses_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
log10_expenses_features = ['log10_RoomService', 'log10_FoodCourt', 'log10_ShoppingMall', 'log10_Spa', 'log10_VRDeck']
log10_expenses_features1 = ['log10_RoomService1', 'log10_FoodCourt1', 'log10_ShoppingMall1', 'log10_Spa1', 'log10_VRDeck1']

list_of_features = [['Age','Age12'], 
                    ['CryoSleep_True'], 
                    ['Side_S'], 
                    [expenses_features, log10_expenses_features, 'log10_TotalExpenses', log10_expenses_features1, 'log10_TotalExpenses1'],
                    ['Deck_B',['Deck_B','Deck_C','Deck_E','Deck_F'], onehot_deck],
                    [main_homeplanet],
                    [main_destination]
                   ]

In [7]:
# Test with largest list of features and more complicated model to check time and then build test
lst_datasets = constrained_combinations(list_of_features)
print(f'Number of datasets to test: {len(lst_datasets)}\n')
features_max = list(filter(lambda x: len(x) == max(list(map(len, lst_datasets))), lst_datasets))[1]
features_min = list(filter(lambda x: len(x) == min(list(map(len, lst_datasets))), lst_datasets))[1]

params = {'max_depth': np.arange(3, 22, 3),
          'min_samples_leaf': np.arange(3, 22, 3),
          'min_samples_split': np.arange(3, 22, 3),
          'splitter': ['random', 'best']}
grid = GridSearchCV(DecisionTreeClassifier(criterion='gini'), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time_min, num_tests_min = time_gridsearch(grid, features_min, X_train, y_train)
print(f'Num features: {len(features_min)} Num comb hyperparam: {num_tests_min} Execution time: {time_min:.2f}')
time_max, num_tests_max = time_gridsearch(grid, features_max, X_train, y_train)
print(f'Num features: {len(features_max)} Num comb hyperparam: {num_tests_max} Execution time: {time_max:.2f}')

Number of datasets to test: 1137

Num features: 2 Num comb hyperparam: 686 Execution time: 11.52
Num features: 20 Num comb hyperparam: 686 Execution time: 40.14


In [22]:
# combinations of model to check (algorithm + parameters)
#lst_datasets_sel = lst_datasets[:3]
dict_datasets = {f'dataset{i+1}': lst for i, lst in enumerate(lst_datasets)}
#dict_datasets = {f'dataset{i+1}': lst for i, lst in enumerate(lst_datasets_sel)}
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(3, 22, 3),
                    'min_samples_leaf': np.arange(3, 22, 3),
                    'min_samples_split': np.arange(3, 22, 3),
                    'splitter': ['random', 'best']}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 2)

Datasets analysis: 100%|███████████████████████████████████████████████████████████████████████████████████████| 1137/1137 [11:26:49<00:00, 36.24s/it]


In [23]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchDecisionTree.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearch.pkl


In [14]:
cache_file = "resultGridSearchDecisionTree.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearch.pkl


In [31]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['test_score'], ascending=[False]).head(10)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth,min_samples_leaf,min_samples_split,splitter
276682,dataset404,DecisionTree,0.797481,0.789374,0.017111,9,9,3,random
329904,dataset481,DecisionTree,0.79449,0.789374,0.010945,21,9,15,random
276990,dataset404,DecisionTree,0.805303,0.788913,0.018278,18,12,3,random
75314,dataset110,DecisionTree,0.795295,0.788797,0.010556,18,12,15,random
278874,dataset407,DecisionTree,0.793714,0.788683,0.013292,12,15,15,random
724816,dataset1057,DecisionTree,0.810307,0.788568,0.008548,15,3,15,random
464054,dataset677,DecisionTree,0.797423,0.788454,0.007054,12,6,18,random
343482,dataset501,DecisionTree,0.793599,0.788338,0.013885,15,21,12,random
605510,dataset883,DecisionTree,0.797941,0.788338,0.015837,15,15,18,random
71627,dataset105,DecisionTree,0.800903,0.788222,0.034169,9,21,6,best


In [38]:
# top score datasets
print(lst_datasets[404])
print(lst_datasets[481])
print(lst_datasets[110])
print(lst_datasets[407])
print(lst_datasets[1057])

['Age12', 'CryoSleep', 'Side_S', 'log10_RoomService', 'log10_FoodCourt', 'log10_ShoppingMall', 'log10_Spa', 'log10_VRDeck', 'Deck_B', 'Deck_C', 'Deck_E', 'Deck_F', 'HomePlanet_Earth', 'HomePlanet_Europa']
['Age12', 'CryoSleep', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck_B', 'Destination_55 Cancri e', 'Destination_TRAPPIST-1e']
['Age', 'CryoSleep', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Destination_55 Cancri e', 'Destination_TRAPPIST-1e']
['Age12', 'CryoSleep', 'Side_S', 'log10_RoomService', 'log10_FoodCourt', 'log10_ShoppingMall', 'log10_Spa', 'log10_VRDeck', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'HomePlanet_Earth', 'HomePlanet_Europa', 'Destination_55 Cancri e', 'Destination_TRAPPIST-1e']
['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck_B', 'Deck_C', 'Deck_E', 'Deck_F', 'HomePlanet_Earth', 'HomePlanet_Europa']


Best score is achieved for the dataset  'Age12', 'CryoSleep_True', 'Side_S', 'log10_RoomService', 'log10_FoodCourt', 'log10_ShoppingMall', 'log10_Spa', 'log10_VRDeck', 'Deck_B', 'Deck_C', 'Deck_E', 'Deck_F', 'HomePlanet_Earth' and 'HomePlanet_Europa'.

Final score for the validation dataset is 0.7894

In [None]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

### Random forest

The subsequent computations will evaluate two feature sets: the first corresponds to the top-performing combination from prior analysis, while the second additionally includes destination feature.

In [52]:
# test for checking time for a rum
features = ['Age12', 'CryoSleep_True', 'Side_S'] + log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination

params = {'max_depth': np.arange(9, 13),
          'min_samples_leaf': np.arange(9, 13),
          'min_samples_split': np.arange(9, 13)}
grid = GridSearchCV(RandomForestClassifier(criterion='gini', n_estimators=200), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time, num_tests = time_gridsearch(grid, features, X_train, y_train)
print(f'Num features: {len(features)} Num comb hyperparam: {num_tests} Execution time: {time:.2f}')

Num features: 16 Num comb hyperparam: 64 Execution time: 140.56


In [78]:
# combinations of model to check (algorithm + parameters)
dict_datasets = {'dataset404': ['Age12', 'CryoSleep_True', 'Side_S'] + 
                                 log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet,
                 'dataset404plus': ['Age12', 'CryoSleep_True', 'Side_S'] + 
                                     log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination}
experiments = {
    'model': ['RandomForest'],
    'algorithms': [RandomForestClassifier(criterion='gini')],
    'parameters': [{'n_estimators': [100, 150, 200],
                    'max_depth': np.arange(6, 15, 2),
                    'min_samples_leaf': np.arange(6, 15, 2),
                    'min_samples_split': np.arange(6, 15, 2)}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 3)

Datasets analysis: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [20:43<00:00, 621.54s/it]


In [60]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchRandomForest.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearchRandomForest.pkl


In [96]:
cache_file = "resultGridSearchRandomForest.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearchRandomForest.pkl


In [97]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['feature_name','test_score'], ascending=[True,False]).groupby('feature_name').head(5)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,min_samples_leaf,min_samples_split
237,dataset404,RandomForest,0.83041,0.799152,0.580015,150,14,10,10
168,dataset404,RandomForest,0.823623,0.799152,0.546926,150,8,12,12
319,dataset404,RandomForest,0.828253,0.799152,0.888167,200,10,12,14
225,dataset404,RandomForest,0.831474,0.798807,0.579094,150,14,6,6
169,dataset404,RandomForest,0.824025,0.798807,0.807145,150,8,12,14
613,dataset404plus,RandomForest,0.834148,0.801337,0.982262,150,14,10,12
611,dataset404plus,RandomForest,0.83458,0.801108,1.306346,150,14,10,8
610,dataset404plus,RandomForest,0.834925,0.800992,1.014118,150,14,10,6
628,dataset404plus,RandomForest,0.830237,0.800763,0.954336,200,6,6,12
680,dataset404plus,RandomForest,0.83806,0.800648,1.334403,200,10,8,6


In [98]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,min_samples_leaf,min_samples_split
613,dataset404plus,RandomForest,0.834148,0.801337,0.982262,150,14,10,12


### Boosted Tree

In [67]:
# test for checking time for a rum
features = ['Age12', 'CryoSleep_True', 'Side_S'] + log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination

params = {'max_depth': np.arange(3,11),
          'learning_rate': [0.001, 0.01, 0.1]}
grid = GridSearchCV(GradientBoostingClassifier(n_estimators=200), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time, num_tests = time_gridsearch(grid, features, X_train, y_train)
print(f'Num features: {len(features)} Num comb hyperparam: {num_tests} Execution time: {time:.2f}')

Num features: 16 Num comb hyperparam: 24 Execution time: 197.21


In [69]:
# combinations of model to check (algorithm + parameters)
dict_datasets = {'dataset404': ['Age12', 'CryoSleep_True', 'Side_S'] + 
                                 log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet,
                 'dataset404plus': ['Age12', 'CryoSleep_True', 'Side_S'] + 
                                     log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination}
experiments = {
    'model': ['GradientBoosting'],
    'algorithms': [GradientBoostingClassifier()],
    'parameters': [{'n_estimators': [100, 150, 200],
                    'max_depth': np.arange(3, 11),
                    'learning_rate': [0.005, 0.01, 0.05]}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 3)

Datasets analysis: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [14:35<00:00, 437.78s/it]


In [70]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchGradientBoosting.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearchGradientBoosting.pkl


In [99]:
cache_file = "resultGridSearchGradientBoosting.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearchGradientBoosting.pkl


In [100]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['feature_name','test_score'], ascending=[True,False]).groupby('feature_name').head(5)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,learning_rate
53,dataset404,GradientBoosting,0.824342,0.799152,2.382591,200.0,4.0,0.05
58,dataset404,GradientBoosting,0.842891,0.798462,2.793725,200.0,6.0,0.01
56,dataset404,GradientBoosting,0.838088,0.797311,3.339809,200.0,5.0,0.05
52,dataset404,GradientBoosting,0.820287,0.797081,1.784688,200.0,4.0,0.01
55,dataset404,GradientBoosting,0.829748,0.797081,2.416682,200.0,5.0,0.01
128,dataset404plus,GradientBoosting,0.841079,0.800878,3.099586,200.0,5.0,0.05
127,dataset404plus,GradientBoosting,0.833401,0.800302,2.3323,200.0,5.0,0.01
131,dataset404plus,GradientBoosting,0.859168,0.799037,3.917619,200.0,6.0,0.05
124,dataset404plus,GradientBoosting,0.81997,0.798692,1.854928,200.0,4.0,0.01
130,dataset404plus,GradientBoosting,0.849132,0.798347,2.911682,200.0,6.0,0.01


In [101]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,learning_rate
128,dataset404plus,GradientBoosting,0.841079,0.800878,3.099586,200.0,5.0,0.05


### XGBoosted tree

In [76]:
# test for checking time for a rum
features = ['Age12', 'CryoSleep_True', 'Side_S'] + log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination

params = {'max_depth': np.arange(3,11),
          'eta': [0.001, 0.01, 0.1]}
grid = GridSearchCV(XGBClassifier(n_estimators=200), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time, num_tests = time_gridsearch(grid, features, X_train, y_train)
print(f'Num features: {len(features)} Num comb hyperparam: {num_tests} Execution time: {time:.2f}')

Num features: 16 Num comb hyperparam: 24 Execution time: 17.77


In [80]:
# combinations of model to check (algorithm + parameters)
dict_datasets = {'dataset404': ['Age12', 'CryoSleep_True', 'Side_S'] + 
                                 log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet,
                 'dataset404plus': ['Age12', 'CryoSleep_True', 'Side_S'] + 
                                     log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination}
experiments = {
    'model': ['XGBoost'],
    'algorithms': [XGBClassifier()],
    'parameters': [{'n_estimators': [100, 150, 200],
                    'max_depth': np.arange(3, 11),
                    'eta': [0.05, 0.07, 0.1, 0.3]}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 3)

Datasets analysis: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:48<00:00, 54.31s/it]


In [82]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchXGBoost.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearchXGBoost.pkl


In [102]:
cache_file = "resultGridSearchXGBoost.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearchXGBoost.pkl


In [103]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['feature_name','test_score'], ascending=[True,False]).groupby('feature_name').head(5)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,eta
53,dataset404,XGBoost,0.828512,0.800992,0.190717,150.0,8.0,0.07
9,dataset404,XGBoost,0.823364,0.800878,0.174763,100.0,5.0,0.07
52,dataset404,XGBoost,0.823162,0.800762,0.146287,150.0,8.0,0.05
55,dataset404,XGBoost,0.831244,0.800416,0.218031,150.0,8.0,0.3
36,dataset404,XGBoost,0.833889,0.800302,0.202949,150.0,4.0,0.05
109,dataset404plus,XGBoost,0.83642,0.801798,0.329512,100.0,6.0,0.07
149,dataset404plus,XGBoost,0.830611,0.800993,0.220717,150.0,8.0,0.07
132,dataset404plus,XGBoost,0.835011,0.800877,0.223152,150.0,4.0,0.05
150,dataset404plus,XGBoost,0.827189,0.800762,0.145875,150.0,8.0,0.1
148,dataset404plus,XGBoost,0.824888,0.800533,0.168451,150.0,8.0,0.05


In [104]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,eta
109,dataset404plus,XGBoost,0.83642,0.801798,0.329512,100.0,6.0,0.07


### Support vector machine

In [91]:
# test for checking time for a rum
features = ['Age12', 'CryoSleep_True', 'Side_S'] + log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination

params = {'kernel': ['linear','rbf','sigmoid'],
          'gamma': np.logspace(-1,-3,5)}
grid = GridSearchCV(SVC(), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time, num_tests = time_gridsearch(grid, features, X_train, y_train)
print(f'Num features: {len(features)} Num comb hyperparam: {num_tests} Execution time: {time:.2f}')

Num features: 16 Num comb hyperparam: 15 Execution time: 58.69


In [89]:
# test for checking time for a rum
features = ['Age12', 'CryoSleep_True', 'Side_S'] + log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination

params = {'degree': np.arange(3,6)}
grid = GridSearchCV(SVC(kernel='poly'), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time, num_tests = time_gridsearch(grid, features, X_train, y_train)
print(f'Num features: {len(features)} Num comb hyperparam: {num_tests} Execution time: {time:.2f}')

Num features: 16 Num comb hyperparam: 3 Execution time: 10.28


In [93]:
# combinations of model to check (algorithm + parameters)
dict_datasets = {'dataset404': ['Age12', 'CryoSleep_True', 'Side_S'] + 
                                 log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet,
                 'dataset404plus': ['Age12', 'CryoSleep_True', 'Side_S'] + 
                                     log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination}
experiments = {
    'model': ['SVCnopoly','SVCpoly'],
    'algorithms': [SVC(),SVC(kernel='poly')],
    'parameters': [{'kernel':['linear', 'rbf', 'sigmoid'],
                    'gamma': np.logspace(-1,-4,10)},
                   {'degree': np.arange(3,9)}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 3)

Datasets analysis: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [07:46<00:00, 233.34s/it]


In [94]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchSVM.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearchSVM.pkl


In [105]:
cache_file = "resultGridSearchSVM.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearchSVM.pkl


In [106]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['feature_name','test_score'], ascending=[True,False]).groupby('feature_name').head(5)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,degree,kernel,gamma
31,dataset404,SVCpoly,0.815743,0.799842,1.635352,4.0,,
30,dataset404,SVCpoly,0.80979,0.798691,1.398815,3.0,,
1,dataset404,SVCnopoly,0.805073,0.797887,1.483562,,linear,0.046416
32,dataset404,SVCpoly,0.820833,0.797656,1.89937,5.0,,
4,dataset404,SVCnopoly,0.797251,0.79317,1.363338,,linear,0.004642
37,dataset404plus,SVCnopoly,0.808179,0.798692,1.286056,,linear,0.046416
66,dataset404plus,SVCpoly,0.812435,0.798001,1.249018,3.0,,
67,dataset404plus,SVCpoly,0.820344,0.796736,1.529277,4.0,,
68,dataset404plus,SVCpoly,0.828742,0.796161,1.622667,5.0,,
40,dataset404plus,SVCnopoly,0.798114,0.794665,1.190076,,linear,0.004642


In [107]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,degree
31,dataset404,SVCpoly,0.815743,0.799842,1.635352,4


Best score (0.8018) is achieved using a XGBoost (n_estimators = 100.0, max_depth =6.0, eta = 0.07) 

Features use in this model: 'Age12', 'CryoSleep_True', 'Side_S', 'log10_RoomService', 'log10_FoodCourt', 'log10_ShoppingMall', 'log10_Spa', 'log10_VRDeck', 'Deck_B', 'Deck_C', 'Deck_E', 'Deck_F', 'HomePlanet_Earth', 'HomePlanet_Europa', 'Destination_55 Cancri e' and 'Destination_TRAPPIST-1e'.

### Train model

Model with the best score is trained

In [11]:
features = ['Age12', 'CryoSleep_True', 'Side_S'] + log10_expenses_features + ['Deck_B', 'Deck_C', 'Deck_E', 'Deck_F'] + main_homeplanet + main_destination
X_train_sel = X_train[features]
model = XGBClassifier(n_estimators=100, max_depth=6, eta=0.07)
model.fit(X_train_sel, y_train)
y_pred = model.predict(X_train_sel)
y_train_pred = pd.Series(y_pred, name='Pred')
print(pd.crosstab(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

Pred            0     1
Transported            
False        3417   898
True          608  3770
              precision    recall  f1-score   support

       False       0.85      0.79      0.82      4315
        True       0.81      0.86      0.83      4378

    accuracy                           0.83      8693
   macro avg       0.83      0.83      0.83      8693
weighted avg       0.83      0.83      0.83      8693



### Test model

Model is tested after processing the data test with the same pipeline used for train data

In [12]:
# load data
df_test = pd.read_csv('data/test.csv')
df_test.head(5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [13]:
# same processing to test data
preprocessor = cache_data['preprocessor']
encoder = cache_data['encoder']
scaler_num = cache_data['scaler']
df_proc_test = process_features(df_test, preprocessor, encoder, scaler_num)

In [14]:
df_proc_test.head(4)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,CryoSleep_True,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_S
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,-0.124841,False,-0.333105,-0.281027,-0.283579,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,-0.682698,False,-0.333105,-0.275387,-0.283579,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0019_01,Europa,True,C/0/S,55 Cancri e,0.154088,False,-0.333105,-0.281027,-0.283579,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,0.642213,False,-0.333105,3.88768,-0.283579,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


Prediction for data test (score = 0.80289)

In [23]:
X_test = df_proc_test[features]
y_pred = model.predict(X_test)
y_test_pred = list(map(lambda x: True if x == 1 else False, y_pred))
test_result = pd.DataFrame({'PassengerId': df_proc_test['PassengerId'], 'Transported': y_test_pred})
test_result.to_csv('test_result.csv', index=False)

In [11]:
# TODO Video Imputation Kaggle
# TODO Feature importance (Random Forest)
# TODO HistGradienteBoostingClassifier (Gradient Boosting)