In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local `utils` / `models` packages take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
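# One-time environment setup: uncomment the lines you need, run the cell once,
# then comment them out again.
# `%conda` / `%pip` run the package manager within the current kernel;
# `--yes` skips conda's confirmation prompt.
# %conda install --yes matplotlib
# %conda install --yes numpy
# %conda install --yes pandas
# %conda install --yes scikit-learn
# %conda install --yes tensorflow
# %pip install fasteda
# %conda install --yes Jinja2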

In [3]:
import itertools

import numpy as np
import pandas as pd

import utils.data_manipulation as dm
import utils.validation as val
from utils.settings import *
import models.neural_network as nn

## **Data**

#### Load Data

In [4]:
# Load the four data subsets, then report the shape of each one.
df_X_train, df_X_test, df_y_train, df_y_test = dm.load_data(KAGGLE)

for subset_tag, subset in (
    ("X-train", df_X_train),
    ("y-train", df_y_train),
    ("X-test", df_X_test),
    ("y-test", df_y_test),
):
    print(f"[{subset_tag}]: {subset.shape}")

[X-train]: (7905, 18)
[y-train]: (7905,)
[X-test]: (5271, 18)
[y-test]: (5271, 4)


#### Feature and Label Encoding

In [5]:
def _to_ordinal(features):
    """Return a deep copy of `features` with its categorical columns ordinal-encoded.

    Delegates to `dm.categorical_to_numerical` with the notebook-level
    `categorical_features` list and `transformer="ordinal"`.
    """
    encoded = dm.categorical_to_numerical(
        features, categorical_features, transformer="ordinal"
    )
    return encoded.copy(deep=True)


if CATEGORICAL_TO_NUMERICAL:
    # Train and test are transformed by two independent calls (train first).
    # NOTE(review): confirm the helper produces the same category -> code mapping
    # for both subsets; no fitted encoder is shared between the calls here.
    df_X_train = _to_ordinal(df_X_train)
    df_X_test = _to_ordinal(df_X_test)

if ENCODE_LABEL:
    # Encoding with `label_order` keeps the label order identical to the one
    # used in sample_submission.
    status_encoded = dm.encode_label(df_y_train.values, label_order)
    df_y_train = pd.DataFrame(status_encoded, columns=["Status"])

## **Model - Train, Validate, GridsearchCV**

In [6]:
# class BaseModel:
#     def fit(self, X, y, **kwargs):
#         """Fit the model to the data. Specific arguments can be passed in kwargs."""
#         raise NotImplementedError("This method should be implemented by subclasses.")

#     def predict(self, X):
#         """Make predictions using the fitted model."""
#         raise NotImplementedError("This method should be implemented by subclasses.")

In [7]:
# class ScikitLearnModel:
#     def __init__(self, model):
#         self.model = model

#     def fit(self, X, y, **kwargs):
#         # Filter for scikit-learn-specific arguments or ignore all kwargs
#         sklearn_args = {k: v for k, v in kwargs.items() if k in ["sample_weight"]}
#         self.model.fit(X, y, **sklearn_args)

#     def predict(self, X):
#         return self.model.predict(X)

## **Cross-Validation: Stratified K-Fold**

In [8]:
# Option to do standard procedure: train, dev, test (sequentially by ID order) (1 run)
# Option to do k-fold (stratified): train, dev, test (order via k-fold procedure) (2+ runs)

# train - fit model
# dev - hyper-parameter tuning
# test - evaluate log-loss / accuracy / other metric

In [9]:
# Hyper-parameter grid: every combination of the values below is trained and
# validated once.
gridsearch_kwargs = {
    "learning_rate": [0.01, 0.001],
    "epochs": [30, 50],
    "batch_size": [50, 70],
    "verbose": [0],
}

# One fixed seed for the shuffled split of every combination, so the scores are
# comparable across combinations and reproducible between runs (with
# random_state=None each combination was scored on a different random split).
# NOTE(review): this only pins the value handed to cv.run; TensorFlow weight
# initialisation is not seeded here.
GRIDSEARCH_SEED = 42

# Create a list of keys and a list of lists of values
keys = list(gridsearch_kwargs.keys())
values = list(gridsearch_kwargs.values())

# Generate all combinations of the parameter values
all_combinations = itertools.product(*values)

# One record per combination: the parameters plus the validation metrics.
gridsearch_results = []

# Train and validate one model per parameter combination
for combination in all_combinations:
    kwargs = dict(zip(keys, combination))
    print(kwargs)

    # Build the first model returned by nn.build_models() and wrap it with the
    # current hyper-parameters.
    model = nn.build_models()[0]
    model = nn.SoftmaxRegressionNN(model, **kwargs)

    cv = val.CrossValidation(
        df_X_train,
        df_y_train["Status"],
        numerical_scale=NUMERICAL_SCALING,
        numerical_features=numerical_features,
    )

    history = cv.run(
        model, "standard", shuffle=True, random_state=GRIDSEARCH_SEED, **kwargs
    )

    print(
        "[Validation Set] Average Accuracy: %.2f" % (history["avg_accuracy"] * 100), "%"
    )
    print("[Validation Set] Average Log-loss: %.2f \n" % history["avg_log_loss"])

    gridsearch_results.append(
        {
            **kwargs,
            "avg_accuracy": history["avg_accuracy"],
            "avg_log_loss": history["avg_log_loss"],
        }
    )

# NOTE(review): after the loop `model` is the model of the LAST combination
# evaluated, not necessarily the best one; use the table below to choose.
df_gridsearch = pd.DataFrame(gridsearch_results).sort_values(
    "avg_log_loss", ignore_index=True
)
df_gridsearch

{'learning_rate': 0.01, 'epochs': 30, 'batch_size': 50, 'verbose': 0}


2024-05-14 18:45:04.262624: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


[Validation Set] Average Accuracy: 78.67 %
[Validation Set] Average Log-loss: 0.58 

{'learning_rate': 0.01, 'epochs': 30, 'batch_size': 70, 'verbose': 0}
[Validation Set] Average Accuracy: 79.85 %
[Validation Set] Average Log-loss: 0.55 

{'learning_rate': 0.01, 'epochs': 50, 'batch_size': 50, 'verbose': 0}
[Validation Set] Average Accuracy: 78.92 %
[Validation Set] Average Log-loss: 0.62 

{'learning_rate': 0.01, 'epochs': 50, 'batch_size': 70, 'verbose': 0}
[Validation Set] Average Accuracy: 80.35 %
[Validation Set] Average Log-loss: 0.55 

{'learning_rate': 0.001, 'epochs': 30, 'batch_size': 50, 'verbose': 0}
[Validation Set] Average Accuracy: 80.40 %
[Validation Set] Average Log-loss: 0.51 

{'learning_rate': 0.001, 'epochs': 30, 'batch_size': 70, 'verbose': 0}
[Validation Set] Average Accuracy: 80.65 %
[Validation Set] Average Log-loss: 0.51 

{'learning_rate': 0.001, 'epochs': 50, 'batch_size': 50, 'verbose': 0}
[Validation Set] Average Accuracy: 80.40 %
[Validation Set] Average

In [None]:
# Show the model object left bound by the grid-search loop above, i.e. the one
# built for the last hyper-parameter combination that was evaluated.
model

## **Predict Test Sample**

In [None]:
# Alternative (logistic regression), kept for reference:
# y_test_proba = models["log_reg"].predict_proba(X_test)

# Neural network: `model` is whatever the grid-search cell left bound.
# The test features returned by dm.load_data live in `df_X_test`; no `X_test`
# variable is defined anywhere in this notebook.
# NOTE(review): if the model was trained on scaled numerical features (see
# NUMERICAL_SCALING), the same scaling has to be applied to df_X_test before
# predicting - confirm against val.CrossValidation.
prediction = np.asarray(model.predict(df_X_test))

# Row-wise softmax over the last (class) axis, computed with NumPy because
# TensorFlow (`tf`) is not imported in this notebook. Subtracting the row
# maximum first keeps np.exp from overflowing without changing the result.
shifted_scores = prediction - prediction.max(axis=-1, keepdims=True)
exp_scores = np.exp(shifted_scores)
y_test_pred_proba = exp_scores / exp_scores.sum(axis=-1, keepdims=True)

# Prepare submission DataFrame
df_class_proba = pd.DataFrame(
    y_test_pred_proba, columns=["Status_C", "Status_CL", "Status_D"]
)

# pd.concat(axis=1) aligns on the index, so a row-count mismatch would silently
# produce NaN-padded rows instead of failing.
assert len(df_class_proba) == len(df_y_test), "prediction / id row count mismatch"

df_y_test_pred_proba = pd.concat([df_y_test["id"], df_class_proba], axis=1)

display(df_y_test_pred_proba.head())
display(df_y_test_pred_proba.tail())

## **Submission**

In [None]:
# The submission CSV is only written when the KAGGLE flag is set.
if KAGGLE:
    submission_path = "submission.csv"
    # index=False keeps the DataFrame index out of the file.
    df_y_test_pred_proba.to_csv(submission_path, index=False)

**NEXT STEPS**