## Model Training

### Environment setting

In [19]:
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

### Read processed data

In [20]:
cache_file = "procData.pkl"
cache_dir = "cache"
try:
    with open(os.path.join(cache_dir, cache_file), "rb") as f:
        cache_data = pickle.load(f)
    print("Read preprocessed data from cache file:", cache_file)
except:
    pass

Read preprocessed data from cache file: procData.pkl


In [21]:
df = cache_data['proc_data']
df.head(5).T

Unnamed: 0,0,1,2,3,4
PassengerId,0001_01,0002_01,0003_01,0003_02,0004_01
HomePlanet,Europa,Earth,Europa,Europa,Earth
CryoSleep,False,False,False,False,False
Cabin,B/0/P,F/0/S,A/0/S,A/0/S,F/1/S
Destination,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e
Age,0.711945,-0.334037,2.036857,0.293552,-0.891895
VIP,False,False,True,False,False
RoomService,-0.333105,-0.168073,-0.268001,-0.333105,0.125652
FoodCourt,-0.281027,-0.275387,1.959998,0.52301,-0.237159
ShoppingMall,-0.283579,-0.241771,-0.283579,0.336851,-0.031059


In [22]:
X_train = df.drop('Transported', axis=1)
y_train = df['Transported']

### Basic initial models

Initially, we will evaluate two basic models, each relying on a single predictor variable strongly linked to the target: 'CryoSleep' and 'log10TotalExpenses1'. Predictions are made directly from these inputs and metrics for the classification are calculated.

#### 'CryoSleep' feature

This 'model' achieves an accuracy of 0.72

In [23]:
le = LabelEncoder()
y_true = le.fit_transform(df['Transported'])  
y_pred = le.fit_transform(df['CryoSleep'])   

print(pd.crosstab(df['Transported'], df['CryoSleep']))
print(classification_report(y_true, y_pred))

CryoSleep    False  True 
Transported              
False         3761    554
True          1895   2483
              precision    recall  f1-score   support

           0       0.66      0.87      0.75      4315
           1       0.82      0.57      0.67      4378

    accuracy                           0.72      8693
   macro avg       0.74      0.72      0.71      8693
weighted avg       0.74      0.72      0.71      8693



#### 'log10_TotalExpenses1' feature

This 'model' uses a feature calculated from 'TotalExpenses' feature. Samples are classified into two groups based on whether the log10 of the TotalExpenses variable is greater than or less than 1. This 'model' achieves an accuracy of 0.74.

In [25]:
le = LabelEncoder()
y_true = le.fit_transform(df['Transported'])  
y_pred = le.fit_transform(df['log10_TotalExpenses1'])   

print(pd.crosstab(df['Transported'], df['log10_TotalExpenses1']))
print(classification_report(y_true, y_pred))

log10_TotalExpenses1   0.0   1.0
Transported                     
False                 3530   785
True                  1505  2873
              precision    recall  f1-score   support

           0       0.70      0.82      0.76      4315
           1       0.79      0.66      0.72      4378

    accuracy                           0.74      8693
   macro avg       0.74      0.74      0.74      8693
weighted avg       0.74      0.74      0.73      8693



### Decision Trees

In [11]:
# TODO Video Imputation Kaggle
# TODO Read data
# TODO Run DecisionTree
# TODO Features + Parameters: max_depth
# TODO Random Forest
# TODO Feature importance
# TODO Boosted Tree sklearn
# TODO XGBoost
# TODO HistGradienteBoostingClassifier

In [29]:
# combinations of model to check (algorithm + parameters)
experiments = {
    'model': ['KNN', 'Ridge', 'DecisionTree', 'BaggingRidge', 'RandomForest']
    'algorithms': [KNeighborsClassifier(), RidgeClassifier(), 
                    DecisionTreeClassifier(criterion='gini'),
                    BaggingClassifier(estimator=RidgeClassifier())
                    RandomForestClassifier()],
    'parameters': [{'n_neighbors': [8, 10, 12, 14], 'weights': ['uniform', 'distance']},
                    {'alpha': [1.0]},
                    {'max_depth': [5, 10, 15], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], splitter['best', 'random']},
                    {'n_estimators': [10, 30, 50], 'max_features': [1.0, 0.7, 0.5], 'max_samples': [1.0, 0.9, 0.8]}
                    {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 15], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}
                  ]
}

#experiments = {
#    'algorithms': [KNeighborsClassifier(), RidgeClassifier()],
#    'parameters': [{'n_neighbors': [14, 18], 'weights': ['uniform', 'distance']},
#                    {'alpha': [1.0]}
#                  ]
#}
#experiments = {
#    'model': ['KNN', 'Ridge'],
#    'algorithm': [KNeighborsClassifier(), RidgeClassifier()],
#    'parameters': [{'n_neighbors': [14, 18], 'weights': ['uniform', 'distance']},
#                    {'alpha': [1.0]}
#                  ]
#}
# run experiments: features + algorithm + parameters
results = []
for dp, features in dataproc.items():
    X_train_sel = X_train[features]    
    #for index, algorithm in enumerate(experiments['algorithms']):
    for index, model in enumerate(experiments['model']):
        params = experiments['parameters'][index]
        algorithm = experiments['algorithm'][index]
        grid = GridSearchCV(algorithm, param_grid = params, cv = 5, 
                            scoring= 'accuracy', return_train_score = True)
        grid.fit(X_train_sel, y_train)

        lst = list(grid.get_params()['param_grid'].values())
        #for params, mse in zip(itertools.product(*lst),grid.cv_results_['mean_test_score']):
        for params, test_score, train_score, fit_time in zip(itertools.product(*lst),
                                                   grid.cv_results_['mean_test_score'], 
                                                   grid.cv_results_['mean_train_score'],
                                                   grid.cv_results_['mean_fit_time']):
            params_dict = {param: value for param, value in zip(grid.get_params()['param_grid'].keys(), params)}
            result = {'feature_name': dp,'features': features, 'model': model, 'parameters': params_dict,
                  'train_score': train_score, 'test_score': test_score, 'fit_time': fit_time}
            results.append(result)


In [32]:
pd.DataFrame(results).drop(['features'],axis=1)
max_indices = pd.DataFrame(results).drop(['features'],axis=1).groupby('model')['test_score'].idxmax()
pd.DataFrame(results).drop(['features'],axis=1).loc[max_indices]


Unnamed: 0,feature_name,model,parameters,train_score,test_score,fit_time
2,all_expenses,KNN,"{'n_neighbors': 18, 'weights': 'uniform'}",0.808754,0.780285,0.004548
4,all_expenses,Ridge,{'alpha': 1.0},0.769642,0.767745,0.007219


In [57]:
best_model = pd.DataFrame(results).sort_values('acc_val', ascending=False).iloc[0,:].to_dict()
for key, value in best_model.items():
    print(f'{key}: {value}')

features: ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'CryoSleep_True']
algorithm: DecisionTreeClassifier(splitter='random')
parameters: {'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 4}
acc_val: 0.7839655609383592
