## Model Training (CryoSleep TRUE)

### Environment setting

In [64]:
import pickle
import os
import itertools
import pandas as pd
import numpy as np
#from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.linear_model import RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from utilsML import run_experiments, expand_parameters, time_gridsearch, constrained_combinations
#from utilsFE import process_features

### Read processed data

In [16]:
cache_file = "procData.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass

Read preprocessed data from cache file: procData.pkl


In [17]:
proc_data = cache_data['proc_data']
df_train = proc_data[proc_data['CryoSleep'] == True].copy()
df_train.head(5).T

Unnamed: 0,7,9,10,18,21
PassengerId,0006_02,0008_01,0008_02,0016_01,0020_01
HomePlanet,Earth,Europa,Europa,Mars,Earth
CryoSleep,True,True,True,True,True
Cabin,G/0/S,B/1/P,B/1/P,F/5/P,E/0/S
Destination,TRAPPIST-1e,55 Cancri e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e
Age,-0.055109,-1.031359,0.363284,1.130339,-1.937878
VIP,False,False,False,False,False
RoomService,-0.333105,-0.333105,-0.333105,-0.333105,-0.333105
FoodCourt,-0.281027,-0.281027,-0.281027,-0.281027,-0.281027
ShoppingMall,-0.283579,-0.283579,-0.283579,-0.283579,-0.283579


In [18]:
X_train = df_train.drop('Transported', axis=1)
y_train = df_train['Transported']

#### Brute force approach

All possible combinations of features are built and tested with several hyperparameter configurations.

In [25]:
# Buils list of list with features to combine
# TODO Could this be improved?
onehot_homeplanet = (list(filter(lambda x: any(map(lambda y: y in x,['HomePlanet_'])), df_train.columns)))
main_homeplanet = list(filter(lambda x: any(map(lambda y: y not in x, ['Mars'])),onehot_homeplanet))
onehot_destination = (list(filter(lambda x: any(map(lambda y: y in x,['Destination_'])), df_train.columns)))
main_destination = list(filter(lambda x: any(map(lambda y: y not in x, ['TRAPPIST'])),onehot_destination))
onehot_deck = (list(filter(lambda x: any(map(lambda y: y in x,['Deck_'])), df_train.columns)))

list_of_features = [['Age','Age12'], 
                    ['Num','Num500'],
                    ['Side_S'], 
                    ['Deck_G',['Deck_G','Deck_C','Deck_B','Deck_F'], onehot_deck],
                    [['HomePlanet_Earth'],main_homeplanet],
                    [['Destination_55 Cancri e'],main_destination]
                   ]

In [27]:
# Test with largest list of features and more complicated model to check time and then build test
lst_datasets = constrained_combinations(list_of_features)
print(f'Number of datasets to test: {len(lst_datasets)}\n')
features_max = list(filter(lambda x: len(x) == max(list(map(len, lst_datasets))), lst_datasets))[1]
features_min = list(filter(lambda x: len(x) == min(list(map(len, lst_datasets))), lst_datasets))[1]

params = {'max_depth': np.arange(3, 22, 3),
          'min_samples_leaf': np.arange(3, 22, 3),
          'min_samples_split': np.arange(3, 22, 3),
          'splitter': ['random', 'best']}
grid = GridSearchCV(DecisionTreeClassifier(criterion='gini'), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time_min, num_tests_min = time_gridsearch(grid, features_min, X_train, y_train)
print(f'Num features: {len(features_min)} Num comb hyperparam: {num_tests_min} Execution time: {time_min:.2f}')
time_max, num_tests_max = time_gridsearch(grid, features_max, X_train, y_train)
print(f'Num features: {len(features_max)} Num comb hyperparam: {num_tests_max} Execution time: {time_max:.2f}')

Number of datasets to test: 635

Num features: 2 Num comb hyperparam: 686 Execution time: 6.72
Num features: 15 Num comb hyperparam: 686 Execution time: 8.25


In [28]:
# combinations of model to check (algorithm + parameters)
#lst_datasets_sel = lst_datasets[:3]
dict_datasets = {f'dataset{i+1}': lst for i, lst in enumerate(lst_datasets)}
#dict_datasets = {f'dataset{i+1}': lst for i, lst in enumerate(lst_datasets_sel)}
experiments = {
    'model': ['DecisionTree'],
    'algorithms': [DecisionTreeClassifier(criterion='gini')],
    'parameters': [{'max_depth': np.arange(3, 22, 3),
                    'min_samples_leaf': np.arange(3, 22, 3),
                    'min_samples_split': np.arange(3, 22, 3),
                    'splitter': ['random', 'best']}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 2)

Datasets analysis: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 635/635 [2:41:21<00:00, 15.25s/it]


In [29]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchDecisionTreeCryoSleep.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearchDecisionTreeCryoSleep.pkl


In [14]:
cache_file = "resultGridSearchDecisionTreeCryoSleep.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearch.pkl


In [30]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['test_score'], ascending=[False]).head(10)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,max_depth,min_samples_leaf,min_samples_split,splitter
114478,dataset167,DecisionTree,0.834787,0.831091,0.003858,21,6,3,random
113516,dataset166,DecisionTree,0.828367,0.830758,0.003399,12,9,9,random
114218,dataset167,DecisionTree,0.827461,0.830099,0.003806,12,12,12,random
113606,dataset166,DecisionTree,0.832894,0.829777,0.003307,15,6,18,random
114880,dataset168,DecisionTree,0.83026,0.829773,0.00362,12,6,18,random
106200,dataset155,DecisionTree,0.819723,0.829115,0.003192,18,15,18,random
113434,dataset166,DecisionTree,0.830837,0.828785,0.00327,9,12,12,random
113546,dataset166,DecisionTree,0.827708,0.828784,0.003236,12,15,12,random
115088,dataset168,DecisionTree,0.830425,0.828783,0.003465,18,9,15,random
114498,dataset167,DecisionTree,0.827049,0.828458,0.003725,21,9,12,random


In [31]:
# top score datasets
print(lst_datasets[167])
print(lst_datasets[166])
print(lst_datasets[168])

['Age', 'Side_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'HomePlanet_Earth', 'HomePlanet_Europa']
['Age', 'Side_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'HomePlanet_Earth', 'HomePlanet_Europa', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22']
['Age', 'Side_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Destination_55 Cancri e']


Best score is achieved for the dataset  'Age', 'Side_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'HomePlanet_Earth' and 'HomePlanet_Europa'.

Final score for the validation dataset is 0.835

### Random forest

The subsequent computations will evaluate two feature sets: the first corresponds to the top-performing combination from prior analysis, while the second additionally includes destination feature.

In [34]:
# test for checking time for a rum
features = ['Age', 'Side_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'HomePlanet_Earth', 'HomePlanet_Europa']

params = {'max_depth': np.arange(9, 13),
          'min_samples_leaf': np.arange(9, 13),
          'min_samples_split': np.arange(9, 13)}
grid = GridSearchCV(RandomForestClassifier(criterion='gini', n_estimators=200), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time, num_tests = time_gridsearch(grid, features, X_train, y_train)
print(f'Num features: {len(features)} Num comb hyperparam: {num_tests} Execution time: {time:.2f}')

Num features: 12 Num comb hyperparam: 64 Execution time: 58.77


In [38]:
# combinations of model to check (algorithm + parameters)
dict_datasets = {'dataset166': lst_datasets[166],
                 'dataset167': lst_datasets[167],
                 'dataset168': lst_datasets[168]
}
experiments = {
    'model': ['RandomForest'],
    'algorithms': [RandomForestClassifier(criterion='gini')],
    'parameters': [{'n_estimators': [100, 150, 200],
                    'max_depth': np.arange(6, 15, 2),
                    'min_samples_leaf': np.arange(6, 15, 2),
                    'min_samples_split': np.arange(6, 15, 2)}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 3)

Datasets analysis: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [11:30<00:00, 230.21s/it]


In [39]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchRandomForestCryoSleep.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearchRandomForestCryoSleep.pkl


In [96]:
cache_file = "resultGridSearchRandomForestCryoSleep.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearchRandomForest.pkl


In [40]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['feature_name','test_score'], ascending=[True,False]).groupby('feature_name').head(5)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,min_samples_leaf,min_samples_split
93,dataset166,RandomForest,0.832977,0.821537,0.195597,100,12,12,12
84,dataset166,RandomForest,0.834953,0.82121,0.198673,100,12,8,14
117,dataset166,RandomForest,0.829273,0.821209,0.194099,100,14,12,10
86,dataset166,RandomForest,0.835776,0.82088,0.42239,100,12,10,8
114,dataset166,RandomForest,0.829026,0.820548,0.19335,100,14,10,14
507,dataset167,RandomForest,0.832565,0.828452,0.196479,150,6,8,10
401,dataset167,RandomForest,0.830919,0.828124,0.401542,100,8,6,8
400,dataset167,RandomForest,0.830919,0.828123,0.286732,100,8,6,6
386,dataset167,RandomForest,0.832236,0.828122,0.385029,100,6,10,8
498,dataset167,RandomForest,0.833718,0.827467,0.194246,100,14,14,12


In [41]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,min_samples_leaf,min_samples_split
507,dataset167,RandomForest,0.832565,0.828452,0.196479,150,6,8,10


### Boosted Tree

In [42]:
# test for checking time for a rum
features = lst_datasets[166]

params = {'max_depth': np.arange(3,11),
          'learning_rate': [0.001, 0.01, 0.1]}
grid = GridSearchCV(GradientBoostingClassifier(n_estimators=200), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time, num_tests = time_gridsearch(grid, features, X_train, y_train)
print(f'Num features: {len(features)} Num comb hyperparam: {num_tests} Execution time: {time:.2f}')

Num features: 14 Num comb hyperparam: 24 Execution time: 56.95


In [43]:
# combinations of model to check (algorithm + parameters)
dict_datasets = {'dataset166': lst_datasets[166],
                 'dataset167': lst_datasets[167],
                 'dataset168': lst_datasets[168]
}
experiments = {
    'model': ['GradientBoosting'],
    'algorithms': [GradientBoostingClassifier()],
    'parameters': [{'n_estimators': [100, 150, 200],
                    'max_depth': np.arange(3, 11),
                    'learning_rate': [0.005, 0.01, 0.05]}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 3)

Datasets analysis: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [05:58<00:00, 119.62s/it]


In [44]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchGradientBoostingCryoSleep.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearchGradientBoostingCryoSleep.pkl


In [45]:
cache_file = "resultGridSearchGradientBoostingCryoSleep.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearchGradientBoostingCryoSleep.pkl


In [46]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['feature_name','test_score'], ascending=[True,False]).groupby('feature_name').head(5)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,learning_rate
48,dataset166,GradientBoosting,0.839069,0.822856,0.315404,200.0,3.0,0.005
49,dataset166,GradientBoosting,0.841291,0.822199,0.394905,200.0,3.0,0.01
32,dataset166,GradientBoosting,0.842855,0.821541,0.937891,150.0,5.0,0.05
51,dataset166,GradientBoosting,0.846807,0.821539,0.359151,200.0,4.0,0.005
31,dataset166,GradientBoosting,0.835281,0.821538,0.711991,150.0,5.0,0.01
104,dataset167,GradientBoosting,0.841209,0.822527,0.732499,150.0,5.0,0.05
103,dataset167,GradientBoosting,0.83594,0.822196,0.549917,150.0,5.0,0.01
83,dataset167,GradientBoosting,0.832236,0.821865,0.918732,100.0,6.0,0.05
105,dataset167,GradientBoosting,0.832236,0.821865,0.456058,150.0,6.0,0.005
120,dataset167,GradientBoosting,0.836681,0.821865,0.215237,200.0,3.0,0.005


In [47]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,learning_rate
48,dataset166,GradientBoosting,0.839069,0.822856,0.315404,200.0,3.0,0.005


### XGBoosted tree

In [50]:
# test for checking time for a rum
features = lst_datasets[166]

params = {'max_depth': np.arange(3,11),
          'eta': [0.001, 0.01, 0.1]}
grid = GridSearchCV(XGBClassifier(n_estimators=200), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time, num_tests = time_gridsearch(grid, features, X_train, y_train)
print(f'Num features: {len(features)} Num comb hyperparam: {num_tests} Execution time: {time:.2f}')

Num features: 14 Num comb hyperparam: 24 Execution time: 5.81


In [84]:
# combinations of model to check (algorithm + parameters)
dict_datasets = {'dataset166': lst_datasets[166],
                 'dataset167': lst_datasets[167],
                 'dataset168': lst_datasets[168]
}
experiments = {
    'model': ['XGBoost'],
    'algorithms': [XGBClassifier()],
    'parameters': [{'n_estimators': [50, 100, 150],
                    'max_depth': np.arange(2, 9),
                    'eta': [0.01, 0.05, 0.1, 0.3, 0.5]}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 3)

Datasets analysis: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:58<00:00, 19.56s/it]


In [85]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchXGBoostCryoSleep.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearchXGBoostCryoSleep.pkl


In [86]:
cache_file = "resultGridSearchXGBoostCryoSleep.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearchXGBoostCryoSleep.pkl


In [87]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['feature_name','test_score'], ascending=[True,False]).groupby('feature_name').head(5)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,eta
30,dataset166,XGBoost,0.835282,0.827795,0.03303,50.0,8.0,0.01
25,dataset166,XGBoost,0.833718,0.825487,0.042222,50.0,7.0,0.01
28,dataset166,XGBoost,0.83734,0.824833,0.048502,50.0,7.0,0.3
48,dataset166,XGBoost,0.838163,0.824174,0.031075,100.0,4.0,0.3
17,dataset166,XGBoost,0.836682,0.824172,0.100046,50.0,5.0,0.1
130,dataset167,XGBoost,0.83273,0.827794,0.040976,50.0,7.0,0.01
135,dataset167,XGBoost,0.834129,0.827135,0.032551,50.0,8.0,0.01
119,dataset167,XGBoost,0.830918,0.826477,0.106384,50.0,4.0,0.5
153,dataset167,XGBoost,0.835776,0.825163,0.027507,100.0,4.0,0.3
116,dataset167,XGBoost,0.827132,0.824501,0.074843,50.0,4.0,0.05


In [88]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,n_estimators,max_depth,eta
30,dataset166,XGBoost,0.835282,0.827795,0.03303,50.0,8.0,0.01


### Support vector machine

In [56]:
# test for checking time for a rum
features = lst_datasets[166]

params = {'kernel': ['linear','rbf','sigmoid'],
          'gamma': np.logspace(-1,-3,5)}
grid = GridSearchCV(SVC(), param_grid = params, 
                    cv = 5, scoring= 'accuracy', n_jobs = 3)

time, num_tests = time_gridsearch(grid, features, X_train, y_train)
print(f'Num features: {len(features)} Num comb hyperparam: {num_tests} Execution time: {time:.2f}')

Num features: 14 Num comb hyperparam: 15 Execution time: 4.54


In [57]:
# combinations of model to check (algorithm + parameters)
dict_datasets = {'dataset166': lst_datasets[166],
                 'dataset167': lst_datasets[167],
                 'dataset168': lst_datasets[168]
}
experiments = {
    'model': ['SVCnopoly','SVCpoly'],
    'algorithms': [SVC(),SVC(kernel='poly')],
    'parameters': [{'kernel':['linear', 'rbf', 'sigmoid'],
                    'gamma': np.logspace(-1,-4,10)},
                   {'degree': np.arange(3,9)}]
}
results = run_experiments(dict_datasets, experiments, X_train, y_train, scoring = 'accuracy', n_jobs = 3)

Datasets analysis: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:47<00:00, 35.69s/it]


In [58]:
# save results in cache
cache_data = dict(results_gridsearch=results)
cache_file = "resultGridSearchSVMCryoSleep.pkl"
cache_dir = "cache"
with open(os.path.join(cache_dir, cache_file), "wb") as f:
        pickle.dump(cache_data, f)
print("Wrote preprocessed data to cache file:", cache_file)

Wrote preprocessed data to cache file: resultGridSearchSVMCryoSleep.pkl


In [59]:
cache_file = "resultGridSearchSVMCryoSleep.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass
results = cache_data['results_gridsearch']

Read preprocessed data from cache file: resultGridSearchSVMCryoSleep.pkl


In [60]:
df = pd.DataFrame(results).drop(['features'],axis=1)
df_max = df.sort_values(['feature_name','test_score'], ascending=[True,False]).groupby('feature_name').head(5)
expand_parameters(df_max)

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,kernel,gamma,degree
0,dataset166,SVCnopoly,0.817583,0.817583,0.114222,linear,0.1,
1,dataset166,SVCnopoly,0.817583,0.817583,0.165289,linear,0.046416,
3,dataset166,SVCnopoly,0.817583,0.817583,0.087748,linear,0.01,
4,dataset166,SVCnopoly,0.817583,0.817583,0.153242,linear,0.004642,
5,dataset166,SVCnopoly,0.817583,0.817583,0.218071,linear,0.002154,
36,dataset167,SVCnopoly,0.817583,0.817583,0.080103,linear,0.1,
37,dataset167,SVCnopoly,0.817583,0.817583,0.145503,linear,0.046416,
39,dataset167,SVCnopoly,0.817583,0.817583,0.080087,linear,0.01,
40,dataset167,SVCnopoly,0.817583,0.817583,0.130686,linear,0.004642,
41,dataset167,SVCnopoly,0.817583,0.817583,0.21752,linear,0.002154,


In [61]:
expand_parameters(df.sort_values(['test_score'], ascending=[False]).head(1))

Unnamed: 0,feature_name,model,train_score,test_score,fit_time,degree
103,dataset168,SVCpoly,0.818489,0.817913,0.392806,4


Best score (0.831) is achieved using a Decision Tree (max_depth 21, min_samples_leaf 6, min_samples_split 3, splitter random) 
  
Features use in this model: 'Age', 'Side_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'HomePlanet_Earth', 'HomePlanet_Europa'

### Train model

Model with the best score is trained

In [93]:
features = lst_datasets[167]
X_train_sel = X_train[features]
model = DecisionTreeClassifier(max_depth=21, min_samples_leaf=6, min_samples_split=3, splitter='random')
model.fit(X_train_sel, y_train)
y_pred = model.predict(X_train_sel)
y_train_pred = pd.Series(y_pred, name='Pred')
y_train.index = range(y_train.shape[0])
print(pd.crosstab(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))
# save the model
with open('models/DecisionTreeCryoSleepModel.pkl', 'wb') as file:
    pickle.dump(model, file)

Pred         False  True 
Transported              
False           58    496
True            39   2444
              precision    recall  f1-score   support

       False       0.60      0.10      0.18       554
        True       0.83      0.98      0.90      2483

    accuracy                           0.82      3037
   macro avg       0.71      0.54      0.54      3037
weighted avg       0.79      0.82      0.77      3037



### Test model

Model is tested after processing the data test with the same pipeline used for train data

In [None]:
# TODO load data
df_test = pd.read_csv('data/test.csv')
df_test.head(5)

In [10]:
# TODO same processing to test data
preprocessor = cache_data['preprocessor']
encoder = cache_data['encoder']
scaler_num = cache_data['scaler']
df_proc_test = process_features(df_test, preprocessor, encoder, scaler_num)

In [None]:
df_proc_test.head(4)

Prediction for data test (score = 0.80289)

In [23]:
X_test = df_proc_test[features]
y_pred = model.predict(X_test)
y_test_pred = list(map(lambda x: True if x == 1 else False, y_pred))
test_result = pd.DataFrame({'PassengerId': df_proc_test['PassengerId'], 'Transported': y_test_pred})
test_result.to_csv('test_result.csv', index=False)