In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the local `utils` and
# `models` packages take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Optional one-time environment setup: every command below is disabled.
# Uncomment the lines you need to install the listed packages into the
# active environment, then comment them out again.
# !conda install matplotlib
# !conda install numpy
# !conda install pandas
# !conda install scikit-learn
# !conda install tensorflow
# !pip install fasteda
# !conda install Jinja2 --y

In [None]:
import numpy as np
import pandas as pd

from utils.settings import (
    KAGGLE,
    CATEGORICAL_TO_NUMERICAL,
    ENCODE_LABEL,
    NUMERICAL_SCALING,
    numerical_features,
    categorical_features,
    label_order,
)
from utils.data import (
    load_data,
    categorical_to_numerical,
    encode_label,
    NumericalScaling,
    get_numerical_and_categorical_indexes,
)
from utils.validation import grid_search_cv

from models.neural_network import SoftmaxRegressionNN, build_models

## **Data**

#### Load Data

In [None]:
# load_data hands back four objects, unpacked here in the order:
# train features, test features, train labels, test labels.
(df_X_train, df_X_test, df_y_train, df_y_test) = load_data(KAGGLE)

# Report the shape of every split, one per line.
print(
    f"[X-train]: {df_X_train.shape}",
    f"[y-train]: {df_y_train.shape}",
    f"[X-test]: {df_X_test.shape}",
    f"[y-test]: {df_y_test.shape}",
    sep="\n",
)

#### Data Encoding

In [None]:
if CATEGORICAL_TO_NUMERICAL:
    # Run the same ordinal conversion over the train subset, then the test
    # subset, keeping an independent deep copy of each result.
    df_X_train, df_X_test = [
        categorical_to_numerical(
            features, categorical_features, transformer="ordinal"
        ).copy(deep=True)
        for features in (df_X_train, df_X_test)
    ]

if ENCODE_LABEL:
    # Encoding against `label_order` keeps the label order identical to the
    # one used in sample_submission.
    encoded_label = encode_label(df_y_train.values, label_order)
    df_y_train = pd.DataFrame(data=encoded_label, columns=["Status"])

## **Model - Train, Validate, GridsearchCV**

In [None]:
# Disabled sketch (kept commented out): an adapter around a scikit-learn style
# estimator whose fit() tolerates extra keyword arguments by forwarding only
# `sample_weight` to the wrapped model, and whose predict() delegates directly.
# class ScikitLearnModel:
#     def __init__(self, model):
#         self.model = model

#     def fit(self, X, y, **kwargs):
#         # Filter for scikit-learn-specific arguments or ignore all kwargs
#         sklearn_args = {k: v for k, v in kwargs.items() if k in ["sample_weight"]}
#         self.model.fit(X, y, **sklearn_args)

#     def predict(self, X):
#         return self.model.predict(X)

## **Cross-Validation**

In [None]:
# Option to do standard procedure: train, dev, test (sequentially by ID order) (1 run)
# Option to do k-fold (stratified): train, dev, test (order via k-fold procedure) (2+ runs)

# train - fit model
# dev - hyper-parameter tuning
# test - evaluate log-loss / accuracy / other metric

In [None]:
# Name of the split strategy passed to grid_search_cv.
cross_validation_split_method = "stratified_kfold"

# The model class itself (not an instance) is handed to the search.
model = SoftmaxRegressionNN

# Values handed to grid_search_cv for each parameter name.
grid_search_parameters = {
    "sequential": build_models(),
    "learning_rate": [0.01, 0.001],
    "epochs": [30, 50],
    "batch_size": [50, 70],
    "verbose": [0],
}

# Run the search; it returns four values, unpacked in this order.
(
    opt_parameters,
    opt_log_loss,
    all_parameters,
    history,
) = grid_search_cv(
    df_X_train,
    df_y_train,
    model,
    grid_search_parameters,
    cross_validation_split_method,
    NUMERICAL_SCALING,
    numerical_features,
    shuffle=True,
    random_state=42,
    verbose=False,
)

In [None]:
# Show the selected parameters and their log-loss, one per line.
print(opt_parameters, opt_log_loss, sep="\n")

## **Predict Test Sample**

In [None]:
# Hand-picked final settings. This assignment replaces the `opt_parameters`
# value returned by grid_search_cv earlier in the notebook.

# Architecture: the first entry returned by build_models().
opt_parameters = {"sequential": build_models()[0]}

# Remaining training settings, added in the same key order as before.
opt_parameters.update(
    {
        "learning_rate": 0.001,
        "epochs": 30,
        "batch_size": 70,
        "verbose": 0,
    }
)

In [None]:
# numerical scaling - train / test sets
numerical_indexes, categorical_indexes = get_numerical_and_categorical_indexes(
    df_X_train, numerical_features
)

numerical_scaling = NumericalScaling(numerical_indexes, categorical_indexes)

X_train = numerical_scaling.run(df_X_train.values, use_saved_transformer=False)
X_test = numerical_scaling.run(df_X_test.values, use_saved_transformer=True)
y_train = df_y_train["Status"].values

In [None]:
# Instantiate the network from the chosen settings. The same keyword set is
# forwarded again to fit() and predict() below.
model = SoftmaxRegressionNN(**opt_parameters)

# Train on the X_train / y_train arrays prepared in the previous cell.
model.fit(X_train, y_train, **opt_parameters)

# predict() returns a pair, unpacked here.
# NOTE(review): the names suggest (hard predictions, per-class probabilities);
# confirm against models.neural_network.
y_test_pred, y_test_pred_proba = model.predict(X_test, **opt_parameters)

print(y_test_pred, y_test_pred_proba)

In [None]:
# Prepare submission DataFrame: label the probability columns.
# NOTE(review): assumes the columns of y_test_pred_proba follow the class
# order C, CL, D (the order the labels were encoded with) — confirm.
df_y_test_pred_proba = pd.DataFrame(
    y_test_pred_proba, columns=["Status_C", "Status_CL", "Status_D"]
)

# Every test row must have exactly one prediction row; otherwise the
# submission would silently contain NaNs.
assert len(df_y_test_pred_proba) == len(df_y_test), (
    f"prediction rows ({len(df_y_test_pred_proba)}) != "
    f"test rows ({len(df_y_test)})"
)

# pd.concat(axis=1) aligns rows on index labels, not on position. Reset the
# id column to a default 0..n-1 index so ids are paired with predictions by
# position even when df_y_test carries a non-default index.
df_y_test_pred_proba = pd.concat(
    [df_y_test["id"].reset_index(drop=True), df_y_test_pred_proba], axis=1
)

display(df_y_test_pred_proba.head())
display(df_y_test_pred_proba.tail())

## **Submission**

In [None]:
# Write the submission file only when the KAGGLE flag is set.
if KAGGLE:
    # The frame's index is omitted from the CSV output.
    df_y_test_pred_proba.to_csv(path_or_buf="submission.csv", index=False)