In [1]:
import os
import json
import logging
from gscv_configs import gscv_cfg_dt, gscv_cfg_gb, gscv_cfg_knn, gscv_cfg_mlp, gscv_cfg_rf, gscv_cfg_sv, gscv_cfg_xgb, gscv_cfg_nb
from sklearn.metrics import f1_score, accuracy_score
from utils import retrieve_latest_train_test
from prettytable import PrettyTable


# Grid-search configurations (imported from gscv_configs) that are compared below.
cfgs = [
    gscv_cfg_dt,
    gscv_cfg_gb,
    gscv_cfg_knn,
    gscv_cfg_mlp,
    gscv_cfg_rf,
    gscv_cfg_sv,
    gscv_cfg_xgb,
    gscv_cfg_nb,
]

# Train/test frames as returned by the project helper.
train, test = retrieve_latest_train_test()

# Split both frames into the "Survived" target and the remaining feature columns.
y_train, y_test = train["Survived"], test["Survived"]
X_train = train.drop(columns=["Survived"])
X_test = test.drop(columns=["Survived"])


# Refit every configured estimator with the best parameters stored by its
# grid search, score it on the held-out test split, and print one table row
# per model.
output = PrettyTable()
output.field_names = ["Name", "Estimator", "F1", "Accuracy"]
for cfg in cfgs:
    # Named gscv_dir (not `dir`) so the builtin stays usable in later cells.
    gscv_dir = f"../gscv/{cfg.name}"

    # A missing directory is handled like an empty one: the model is skipped
    # with a warning instead of aborting the whole comparison.
    if os.path.isdir(gscv_dir):
        files = sorted(
            (f for f in os.listdir(gscv_dir) if f.endswith(".json")),
            reverse=True,
        )
    else:
        files = []

    if not files:
        logging.warning(f"There are no results for {cfg.name}")
        continue

    # Reverse lexicographic order puts the "largest" file name first.
    # NOTE(review): presumably file names carry a sortable timestamp, making
    # this the most recent result -- confirm against the code that writes them.
    file = files[0]
    with open(f"{gscv_dir}/{file}") as f:
        gscv = json.load(f)

    # `estimator` and `param_grid` are intentionally left as notebook globals;
    # the submission cell further down reads them.
    estimator = cfg.estimator
    param_grid = gscv['best_params_']

    # set_params mutates cfg.estimator in place, so the "Estimator" column
    # shows the tuned parameters.
    estimator.set_params(**param_grid)
    estimator.fit(X_train, y_train)

    yhat = estimator.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order.
    output.add_row([
        cfg.name,
        cfg.estimator,
        round(f1_score(y_test, yhat), 4),
        round(accuracy_score(y_test, yhat), 4),
    ])

# Ascending sort: the best F1 score ends up in the last row.
output.sortby = 'F1'
print(output)

+-----------------------+---------------------------------------------------------------------------------+--------+----------+
|          Name         |                                    Estimator                                    |   F1   | Accuracy |
+-----------------------+---------------------------------------------------------------------------------+--------+----------+
|     decision_tree     | DecisionTreeClassifier(criterion='log_loss', max_depth=15, max_features='sqrt') | 0.7333 |  0.791   |
|   knearest_neighbor   | KNeighborsClassifier(algorithm='ball_tree', n_neighbors=10, weights='distance') | 0.7465 |  0.7948  |
|        xgboost        |         XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,         | 0.7512 |  0.8022  |
|                       |                 colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,  |        |          |
|                       |                     early_stopping_rounds=None, enable_categorical=False,     

In [2]:
import pandas as pd
import os

# Prepare Submission
#
# Refits an estimator on the full preprocessed training data and builds the
# submission frame (index "PassengerId", single column "Survived").
#
# NOTE(review): `estimator` and `param_grid` are not defined in this cell; they
# are whatever the model-comparison loop assigned last, which is not
# necessarily the best-scoring model. Select the intended model explicitly
# before producing a real submission.

# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# this path. The comparison cell reads from "../gscv/...", so the data folder
# may live one level up as well -- confirm the location.
train = pd.read_csv("data/preprocessed_20221117.csv", index_col=0)
X_train = train.drop(["Survived"], axis=1)
y_train = train["Survived"]

# TODO: THIS IS WRONG AND NEEDS TO BE CHANGED (test data not yet available)
X_test = X_train # pd.read_csv(f"data/test.csv", index_col=0)

estimator.set_params(**param_grid)
estimator.fit(X_train, y_train)

# Predict on X_test so the predictions always line up with X_test.index below,
# also once the placeholder above is replaced by the real test set.
yhat = estimator.predict(X_test)

subm = pd.DataFrame(yhat, columns=["Survived"], index=X_test.index)
subm.index.name = "PassengerId"

# subm.to_csv("submission/submission.csv")


FileNotFoundError: [Errno 2] No such file or directory: 'data/preprocessed_20221117.csv'