In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project's .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# !conda install matplotlib
# !conda install numpy
# !conda install pandas
# !conda install scikit-learn
# !conda install tensorflow
# !pip install fasteda
# !conda install Jinja2 --y

In [3]:
import numpy as np
import pandas as pd

from utils.settings import (
    KAGGLE,
    CATEGORICAL_TO_NUMERICAL,
    ENCODE_LABEL,
    NUMERICAL_SCALING,
    numerical_features,
    categorical_features,
    label_order,
)
from utils.data import (
    load_data,
    categorical_to_numerical,
    encode_label,
    NumericalScaling,
    get_numerical_and_categorical_indexes,
)
from utils.validation import grid_search_cv

from models.neural_network import SoftmaxRegressionNN, build_models

## **Data**

#### Load Data

In [4]:
# Fetch the train/test feature frames and their targets; the KAGGLE flag is
# forwarded to load_data unchanged.
(df_X_train, df_X_test, df_y_train, df_y_test) = load_data(KAGGLE)

# Report the shape of every split, one per line.
print(
    f"[X-train]: {df_X_train.shape}",
    f"[y-train]: {df_y_train.shape}",
    f"[X-test]: {df_X_test.shape}",
    f"[y-test]: {df_y_test.shape}",
    sep="\n",
)

[X-train]: (7905, 18)
[y-train]: (7905,)
[X-test]: (5271, 18)
[y-test]: (5271, 4)


#### Data Encoding

In [5]:
# Convert the categorical feature columns with the "ordinal" transformer.
# The train frame is converted first, then the test frame, each through its
# own categorical_to_numerical call; a deep copy of each result is kept.
# NOTE(review): if categorical_to_numerical fits a new encoder on every call,
# train and test could receive different category codes - confirm in utils.data.
if CATEGORICAL_TO_NUMERICAL:
    df_X_train, df_X_test = (
        categorical_to_numerical(
            subset, categorical_features, transformer="ordinal"
        ).copy(deep=True)
        for subset in (df_X_train, df_X_test)
    )

# Encode the training labels following label_order, which keeps the class
# order identical to the one used in sample_submission.
# NOTE(review): df_y_train is overwritten with its encoded form, so running
# this cell a second time passes the already-encoded frame back to encode_label.
if ENCODE_LABEL:
    encoded_label = encode_label(df_y_train.values, label_order)
    df_y_train = pd.DataFrame(encoded_label, columns=["Status"])

## **Model - Train, Validate, GridsearchCV**

In [6]:
# class ScikitLearnModel:
#     def __init__(self, model):
#         self.model = model

#     def fit(self, X, y, **kwargs):
#         # Filter for scikit-learn-specific arguments or ignore all kwargs
#         sklearn_args = {k: v for k, v in kwargs.items() if k in ["sample_weight"]}
#         self.model.fit(X, y, **sklearn_args)

#     def predict(self, X):
#         return self.model.predict(X)

## **Cross-Validation: Stratified K-Fold**

In [7]:
# Option to do standard procedure: train, dev, test (sequentially by ID order) (1 run)
# Option to do k-fold (stratified): train, dev, test (order via k-fold procedure) (2+ runs)

# train - fit model
# dev - hyper-parameter tuning
# test - evaluate log-loss / accuracy / other metric

In [9]:
# SoftmaxRegressionNN is handed to the grid search uninstantiated.
model = SoftmaxRegressionNN

# Search space given to grid_search_cv: one entry per hyper-parameter name.
grid_search_parameters = dict(
    sequential=build_models(),
    learning_rate=[0.01, 0.001],
    epochs=[30, 50],
    batch_size=[50, 70],
    verbose=[0],
)

# Name of the split strategy forwarded to grid_search_cv.
cross_validation_split_method = "standard"

# Run the search on the training frames; NUMERICAL_SCALING and
# numerical_features are passed through so scaling is handled inside
# grid_search_cv.
(opt_parameters, opt_log_loss, all_parameters, history) = grid_search_cv(
    df_X_train,
    df_y_train,
    model,
    grid_search_parameters,
    cross_validation_split_method,
    NUMERICAL_SCALING,
    numerical_features,
    shuffle=True,
    random_state=42,
    verbose=False,
)

2024-05-22 12:54:59.358603: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [10]:
# Show the parameters returned by the grid search, then their log-loss,
# each on its own line.
print(opt_parameters, opt_log_loss, sep="\n")

{'sequential': <keras.engine.sequential.Sequential object at 0x16dbe1990>, 'learning_rate': 0.01, 'epochs': 30, 'batch_size': 50, 'verbose': 0}
0.5442625646653187


## **Predict Test Sample**

In [11]:
# Training targets as a plain array, read from the "Status" column
# (df_y_train needs to be a DataFrame with that column at this point).
y_train = df_y_train["Status"].values

# Split the columns of df_X_train into numerical and categorical groups
# (presumably positional indexes - confirm in utils.data).
numerical_indexes, categorical_indexes = get_numerical_and_categorical_indexes(
    df_X_train, numerical_features
)
numerical_scaling = NumericalScaling(numerical_indexes, categorical_indexes)

# Order matters: the train set is run first with use_saved_transformer=False,
# then the test set with use_saved_transformer=True (presumably reusing the
# transformer from the train run - confirm).
X_train = numerical_scaling.run(df_X_train.values, use_saved_transformer=False)
X_test = numerical_scaling.run(df_X_test.values, use_saved_transformer=True)

In [12]:
# Build the network from the parameters selected by the grid search.
# NOTE(review): opt_parameters["sequential"] is the object returned by the
# search; confirm it has not already been trained there before refitting.
model = SoftmaxRegressionNN(**opt_parameters)

# The same parameter dict is forwarded as keyword arguments to fit and predict.
model.fit(X_train, y_train, **opt_parameters)
test_predictions = model.predict(X_test, **opt_parameters)

# predict yields two values: the predicted labels and the class probabilities.
y_test_pred, y_test_pred_proba = test_predictions

print(y_test_pred, y_test_pred_proba)

[0 0 2 ... 0 0 2] [[6.1077958e-01 2.3893319e-02 3.6532712e-01]
 [6.9068569e-01 7.9968132e-02 2.2934611e-01]
 [4.6242765e-04 3.1308970e-01 6.8644780e-01]
 ...
 [9.2102486e-01 4.4687484e-05 7.8930393e-02]
 [9.9478430e-01 4.8694951e-03 3.4629548e-04]
 [1.4669639e-01 6.1413399e-03 8.4716231e-01]]


In [13]:
# Prepare submission DataFrame
df_y_test_pred_proba = pd.DataFrame(
    y_test_pred_proba, columns=["Status_C", "Status_CL", "Status_D"]
)

df_y_test_pred_proba = pd.concat([df_y_test["id"], df_y_test_pred_proba], axis=1)

display(df_y_test_pred_proba.head())
display(df_y_test_pred_proba.tail())

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.61078,0.02389332,0.365327
1,7906,0.690686,0.07996813,0.229346
2,7907,0.000462,0.3130897,0.686448
3,7908,0.945421,0.001997559,0.052581
4,7909,0.534442,5.169245e-08,0.465558


Unnamed: 0,id,Status_C,Status_CL,Status_D
5266,13171,0.820515,0.044658,0.134827
5267,13172,0.955916,1e-05,0.044075
5268,13173,0.921025,4.5e-05,0.07893
5269,13174,0.994784,0.004869,0.000346
5270,13175,0.146696,0.006141,0.847162


## **Submission**

In [None]:
# Write the submission file only when the KAGGLE flag is set; the DataFrame
# index is left out of the CSV.
if KAGGLE:
    df_y_test_pred_proba.to_csv(path_or_buf="submission.csv", index=False)