In [1]:
import os
import json
import logging
from gscv_configs import gscv_cfg_dt, gscv_cfg_gb, gscv_cfg_knn, gscv_cfg_mlp, gscv_cfg_rf, gscv_cfg_sv, gscv_cfg_xgb, gscv_cfg_nb, gscv_cfg_lr
from sklearn.metrics import f1_score, accuracy_score
from utils import retrieve_latest_train_test
from prettytable import PrettyTable
import random
import pandas as pd

# Grid-search configurations to evaluate. Each one is used below through its
# `.name` (result directory / submission file name) and `.estimator` attributes.
cfgs = [gscv_cfg_dt, gscv_cfg_gb, gscv_cfg_knn, gscv_cfg_mlp, gscv_cfg_rf, gscv_cfg_xgb, gscv_cfg_nb, gscv_cfg_lr, gscv_cfg_sv]

train, test = retrieve_latest_train_test()

# TRAIN
X_train = train.drop(["Survived"], axis=1)
y_train = train["Survived"]

# TEST
X_test = test.drop(["Survived"], axis=1)
y_test = test["Survived"]

# Data used for the Kaggle submissions. None of it depends on the model, so it
# is read once here instead of once per configuration inside the loop.
KAGGLE_TRAIN = pd.concat(
    [
        pd.read_csv("../data/test_data_20221125.csv", index_col=0),
        pd.read_csv("../data/train_data_20221125.csv", index_col=0),
    ],
    axis=0,
)
KAGGLE_TRAIN_X = KAGGLE_TRAIN.drop(["Survived"], axis=1)
KAGGLE_TRAIN_y = KAGGLE_TRAIN["Survived"]

# TODO: THIS IS WRONG AND NEEDS TO BE CHANGED (test data not yet available)
org_test = pd.read_csv("../data/test.csv", index_col=0)
KAGGLE_TEST_X = pd.read_csv("../data/test_preprocessed_20221125.csv", index_col=0)

output = PrettyTable()
output.field_names = ["Name", "Estimator", "F1", "Accuracy"]
for cfg in cfgs:
    results_dir = f"../gscv/{cfg.name}"

    # Only the result lookup is guarded: a missing directory and a directory
    # without any .json file both mean "nothing to evaluate" for this config.
    # Errors raised later (fit/predict/IO) are deliberately not swallowed.
    try:
        result_files = [name for name in os.listdir(results_dir) if name.endswith(".json")]
    except FileNotFoundError:
        result_files = []
    if not result_files:
        logging.warning(f"There are no results for {cfg.name}")
        continue

    # Lexicographically greatest file name.
    # NOTE(review): presumably the names embed a timestamp so this is the most
    # recent run - confirm against the code that writes these files.
    latest_file = max(result_files)
    with open(f"{results_dir}/{latest_file}") as fh:
        gscv = json.load(fh)

    # Reseeds the stdlib `random` module before every fit.
    # NOTE(review): confirm that any estimator actually draws from it.
    random.seed(10)

    # NOTE: this is the config's own estimator object, so the parameters set
    # below are also visible through `cfg.estimator` afterwards.
    estimator = cfg.estimator
    param_grid = gscv['best_params_']

    estimator.set_params(**param_grid)
    # Fix the seed only for estimators that expose `random_state` as a
    # hyperparameter; assigning the attribute blindly would attach a
    # meaningless attribute to the others.
    if "random_state" in estimator.get_params(deep=False):
        estimator.set_params(random_state=500)
    estimator.fit(X_train, y_train)

    yhat = estimator.predict(X_test)

    # Metrics take (y_true, y_pred) in that order.
    output.add_row([cfg.name, estimator, round(f1_score(y_test, yhat), 4), round(accuracy_score(y_test, yhat), 4)])

    # Refit on the combined Kaggle training frame and write one submission
    # file per configuration.
    estimator.set_params(**param_grid)
    estimator.fit(KAGGLE_TRAIN_X, KAGGLE_TRAIN_y)

    yhat = estimator.predict(KAGGLE_TEST_X)

    subm = pd.DataFrame(yhat, columns=["Survived"], index=org_test.index)
    subm.index.name = "PassengerId"

    subm.to_csv(f"../submission/{cfg.name}_submission.csv")

output.sortby = 'F1'
print(output)

+-----------------------+---------------------------------------------------------------------------------+--------+----------+
|          Name         |                                    Estimator                                    |   F1   | Accuracy |
+-----------------------+---------------------------------------------------------------------------------+--------+----------+
|   gradient_boosting   | GradientBoostingClassifier(learning_rate=0.3, max_depth=8, max_features='log2', | 0.7349 |  0.7873  |
|                       |                                     n_estimators=23, random_state=500)          |        |          |
|        xgboost        |         XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,         | 0.7404 |  0.7985  |
|                       |                colsample_bylevel=0.2, colsample_bynode=1, colsample_bytree=0.9, |        |          |
|                       |                     early_stopping_rounds=None, enable_categorical=False,     

In [2]:
import pandas as pd
import os

# Prepare Submission
#
# NOTE: this cell depends on `estimator` and `param_grid` being left in the
# kernel by the model loop in an earlier cell, i.e. the last configuration for
# which a result file was loaded. On a fresh kernel it raises NameError unless
# that cell has been run first.
# NOTE(review): confirm that "whichever model ran last" is really the intended
# model for the final submission.

# Combine both dated split files into a single training frame. The variables
# are named after the files they are read from.
test_split = pd.read_csv("../data/test_data_20221125.csv", index_col=0)
train_split = pd.read_csv("../data/train_data_20221125.csv", index_col=0)
train = pd.concat([test_split, train_split], axis=0)

X_train = train.drop(["Survived"], axis=1)
y_train = train["Survived"]

# TODO: THIS IS WRONG AND NEEDS TO BE CHANGED (test data not yet available)
org_test = pd.read_csv("../data/test.csv", index_col=0)
X_test = pd.read_csv("../data/test_preprocessed_20221125.csv", index_col=0)

estimator.set_params(**param_grid)
estimator.fit(X_train, y_train)

yhat = estimator.predict(X_test)

# The submission is indexed by the index of the original test file and that
# index is labelled "PassengerId" in the written CSV.
subm = pd.DataFrame(yhat, columns=["Survived"], index=org_test.index)
subm.index.name = "PassengerId"

subm.to_csv("../submission/submission.csv")
