In [1]:
import os
import json
import logging
from notebooks.gscv_configs import gscv_cfg_dt, gscv_cfg_gb, gscv_cfg_knn, gscv_cfg_mlp, gscv_cfg_rf, gscv_cfg_sv, gscv_cfg_xgb, gscv_cfg_nb

# Grid-search configurations to compare. Each one is read for `.name` (its
# results sub-directory under gscv/) and `.estimator` (the model it refers to).
cfgs = [gscv_cfg_dt, gscv_cfg_gb, gscv_cfg_knn, gscv_cfg_mlp, gscv_cfg_rf, gscv_cfg_sv, gscv_cfg_xgb, gscv_cfg_nb]

# Best (estimator, best_params_) pair seen so far. Stays None when no
# configuration has any stored results.
best_estimator_and_params = None
# Start below every possible score so that a best score of exactly 0 is still
# selected instead of silently leaving the result as None.
best_score = float("-inf")
for cfg in cfgs:
    # Named `results_dir` rather than `dir` so the builtin `dir()` is not
    # shadowed for the rest of the notebook session.
    results_dir = f"gscv/{cfg.name}"

    # A configuration that was never run may have no directory at all; treat
    # that the same as a directory without result files instead of crashing.
    if not os.path.isdir(results_dir):
        logging.warning(f"There are no results for {cfg.name}")
        continue

    result_files = [name for name in os.listdir(results_dir) if name.endswith(".json")]
    if not result_files:
        logging.warning(f"There are no results for {cfg.name}")
        continue

    # Pick the lexicographically greatest file name (the same file a
    # descending sort would put first).
    # NOTE(review): presumably file names carry a sortable timestamp so this is
    # the most recent run -- confirm against the code that writes them.
    latest_file = max(result_files)
    with open(os.path.join(results_dir, latest_file)) as fh:
        gscv = json.load(fh)

    print("=============================================================================================================================================")
    print(f"{cfg.estimator}")
    print("---------------------------------------------------------------------------------------------------------------------------------------------")
    print(f"F1-Score: {gscv['best_score_']}")
    print(f"Parameters: {gscv['best_params_']}")

    # Strict comparison: on ties the earlier configuration in `cfgs` wins.
    if gscv['best_score_'] > best_score:
        best_estimator_and_params = (cfg.estimator, gscv['best_params_'])
        best_score = gscv['best_score_']
    

DecisionTreeClassifier()
---------------------------------------------------------------------------------------------------------------------------------------------
F1-Score: 0.75395006075033
Parameters: {'criterion': 'log_loss', 'max_depth': 25, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2}
GradientBoostingClassifier()
---------------------------------------------------------------------------------------------------------------------------------------------
F1-Score: 0.7782794565573601
Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'max_features': 'log2', 'n_estimators': 50}
KNeighborsClassifier()
---------------------------------------------------------------------------------------------------------------------------------------------
F1-Score: 0.7525102820410783
Parameters: {'algorithm': 'ball_tree', 'n_neighbors': 10, 'weights': 'distance'}
MLPClassifier()
----------------------------------------------------------------------------------------------

In [2]:
# Fail early with an explicit message when the comparison cell found no stored
# results; otherwise indexing into None raises an opaque TypeError.
if best_estimator_and_params is None:
    raise RuntimeError(
        "No grid-search results were found, so no best model can be selected. "
        "Run the grid searches (or the comparison cell) first."
    )

# Unpack the winning (estimator, parameters) pair.
# Despite its name, `param_grid` holds the single best parameter set taken from
# gscv['best_params_'], not a grid. The name is kept because the submission
# cell reads it.
estimator, param_grid = best_estimator_and_params

print("=============================================================================================================================================")
print("BEST MODEL (w.r.t. F1)")
print("---------------------------------------------------------------------------------------------------------------------------------------------")
print(f"Estimator: {estimator}")
print(f"Parameters: {param_grid}")

BEST MODEL (w.r.t. F1)
---------------------------------------------------------------------------------------------------------------------------------------------
Estimator: RandomForestClassifier()
Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 10}


In [4]:
import pandas as pd
import os

# Prepare Submission

# Training data: "Survived" is the target, every other column is a feature.
train = pd.read_csv("data/preprocessed_20221117.csv", index_col=0)
X_train = train.drop(columns=["Survived"])
y_train = train["Survived"]

# TODO: THIS IS WRONG AND NEEDS TO BE CHANGED (test data not yet available)
# Until the real test set exists, the training features stand in for it, so
# the resulting "submission" only contains predictions on training rows.
X_test = X_train # pd.read_csv(f"data/test.csv", index_col=0)

# Configure the selected estimator with its best parameters and fit it on the
# training data. Note that set_params/fit modify `estimator` in place.
estimator.set_params(**param_grid)
estimator.fit(X_train, y_train)

# Predict on X_test (not X_train) so the predictions always match the
# X_test.index used below, including once the real test data is loaded.
yhat = estimator.predict(X_test)

subm = pd.DataFrame(yhat, columns=["Survived"], index=X_test.index)
subm.index.name = "PassengerId"

# Writing the file stays disabled while X_test is only a placeholder.
# subm.to_csv("submission/submission.csv")
