In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules (e.g. `src`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# !conda install matplotlib
# !conda install numpy
# !conda install pandas
# !conda install scikit-learn
# !conda install tensorflow
# !pip install fasteda
# !conda install Jinja2 --y

In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split

from src import load_data, categorical_to_numerical, numerical_scaling, encode_label

In [4]:
# Settings
# --- Pipeline switches -------------------------------------------------------
KAGGLE = False  # forwarded to load_data(); also gates writing submission.csv
CATEGORICAL_TO_NUMERICAL = True  # encode categorical feature columns as numbers
ENCODE_LABEL = True  # encode the target using `label_order`
NUMERICAL_SCALING = True  # scale the numerical feature columns

# --- Column groups -----------------------------------------------------------
# Column names must match the dataset headers exactly (including "Tryglicerides").
numerical_features = [
    "N_Days", "Age", "Bilirubin", "Cholesterol",
    "Albumin", "Copper", "Alk_Phos", "SGOT",
    "Tryglicerides", "Platelets", "Prothrombin", "Stage",
]  # fmt: skip

categorical_features = [
    "Drug",
    "Sex",
    "Ascites",
    "Hepatomegaly",
    "Edema",
    "Spiders",
]

# --- Target ------------------------------------------------------------------
# Class order handed to encode_label(); position in this list is the class index.
label_order = ["C", "CL", "D"]

## **Data**

#### Load Data

In [5]:
# Load the four data objects from the project loader; KAGGLE is passed through
# unchanged (its effect is defined inside `src.load_data`).
df_X_train, df_X_test, df_y_train, df_y_test = load_data(KAGGLE)

# Sanity check: report the shape of every object returned by the loader.
# NOTE(review): in the recorded output y-train is 1-D while y-test has 4 columns;
# presumably y-test follows the sample-submission layout — confirm in load_data.
print(f"[X-train]: {df_X_train.shape}")
print(f"[y-train]: {df_y_train.shape}")
print(f"[X-test]: {df_X_test.shape}")
print(f"[y-test]: {df_y_test.shape}")

[X-train]: (7905, 18)
[y-train]: (7905,)
[X-test]: (5271, 18)
[y-test]: (5271, 4)


#### Data Encoding

In [6]:
if CATEGORICAL_TO_NUMERICAL:
    # Encode the categorical columns of both subsets (train first, then test) with
    # the "ordinal" transformer, keeping an independent deep copy of each result.
    # NOTE(review): each subset goes through its own categorical_to_numerical call;
    # confirm that the helper produces the same category -> number mapping for both.
    df_X_train, df_X_test = (
        categorical_to_numerical(
            subset, categorical_features, transformer="ordinal"
        ).copy(deep=True)
        for subset in (df_X_train, df_X_test)
    )

if ENCODE_LABEL:
    # Encoding against `label_order` keeps the class order identical to the one
    # used in sample_submission.
    encoded_label = encode_label(df_y_train.values, label_order)
    df_y_train = pd.DataFrame(data=encoded_label, columns=["Status"])

## **Model - Train, Validate, GridsearchCV**

In [8]:
# class BaseModel:
#     def fit(self, X, y, **kwargs):
#         """Fit the model to the data. Specific arguments can be passed in kwargs."""
#         raise NotImplementedError("This method should be implemented by subclasses.")

#     def predict(self, X):
#         """Make predictions using the fitted model."""
#         raise NotImplementedError("This method should be implemented by subclasses.")

In [48]:
class SoftmaxRegressionNN:
    """Thin wrapper around a small dense Keras network that outputs 3 logits.

    The wrapper accepts one flat bag of keyword arguments (as produced by a
    hyper-parameter grid) and forwards to each Keras call only the keys that
    call understands:

    - ``compile``: ``learning_rate``
    - ``fit``: ``epochs``, ``batch_size``, ``verbose``
    - ``predict``: ``verbose``

    Any attribute not defined here is delegated to the underlying Keras model.
    """

    # Whitelists of keyword arguments forwarded to the corresponding Keras call.
    _COMPILE_KWARGS = ("learning_rate",)
    _FIT_KWARGS = (
        "epochs",
        "batch_size",
        "verbose",
        # "callbacks",
        # "validation_data",
    )
    _PREDICT_KWARGS = ("verbose",)

    def __init__(self, **kwargs):
        """Build and compile the network; see ``compile`` for the accepted kwargs."""
        self.model = self.compile(**kwargs)

    def __getattr__(self, attr):
        """
        Delegate attribute access to the underlying model if it's not found in this class.

        Callable attributes are wrapped: for ``fit`` the keyword arguments are
        filtered down to ``_FIT_KWARGS``; every other callable receives its
        arguments unchanged.

        Raises:
            AttributeError: if neither this object nor the underlying model has ``attr``.
        """
        # __getattr__ only runs after normal lookup failed. If "model" itself is
        # missing (e.g. an instance created without __init__ by copy/pickle),
        # looking it up again below would recurse forever.
        if attr == "model":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {attr!r}"
            )

        orig_attr = getattr(self.model, attr)
        if not callable(orig_attr):
            return orig_attr

        def hooked(*args, **kwargs):
            # Only 'fit' needs filtering; previously the filtered dict was left
            # undefined for every other method, which raised UnboundLocalError.
            if attr == "fit":
                kwargs = {k: v for k, v in kwargs.items() if k in self._FIT_KWARGS}

            return orig_attr(*args, **kwargs)

        return hooked

    def compile(self, **kwargs) -> "tf.keras.Model":
        """Create and compile the Keras model.

        Architecture: Dense(25, relu) -> Dense(10, relu) -> Dense(3, linear).
        The last layer emits logits, so the loss is configured with
        ``from_logits=True`` and ``predict`` applies the softmax itself.

        Parameters:
        - kwargs: only ``learning_rate`` is used (passed to the Adam optimizer);
          all other keys are ignored.

        Returns:
        - The compiled ``tf.keras`` Sequential model.
        """
        optimizer_kwargs = {
            k: v for k, v in kwargs.items() if k in self._COMPILE_KWARGS
        }

        model = tf.keras.models.Sequential(
            layers=[
                tf.keras.layers.Dense(25, "relu"),
                tf.keras.layers.Dense(10, "relu"),
                tf.keras.layers.Dense(3, "linear"),
            ],
            name="idk",
        )

        model.compile(
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            optimizer=tf.keras.optimizers.legacy.Adam(
                **optimizer_kwargs
            ),  # legacy.Adam - Adam runs slow on M1/M2 chips
            metrics=["sparse_categorical_crossentropy"],
        )

        return model

    def predict(self, X: np.array, **kwargs) -> tuple[np.array, np.array]:
        """Predict class labels and class probabilities for ``X``.

        Parameters:
        - X: feature matrix passed as-is to the underlying model's ``predict``.
        - kwargs: only ``verbose`` is forwarded; all other keys are ignored.

        Returns:
        - (y_pred, y_pred_proba): ``y_pred_proba`` is the softmax of the model's
          logits and ``y_pred`` is its argmax along axis 1.
        """
        predict_kwargs = {
            k: v for k, v in kwargs.items() if k in self._PREDICT_KWARGS
        }

        logits = self.model.predict(X, **predict_kwargs)
        y_pred_proba = tf.nn.softmax(logits).numpy()
        y_pred = np.argmax(y_pred_proba, axis=1)

        return y_pred, y_pred_proba

In [None]:
# class ScikitLearnModel:
#     def __init__(self, model):
#         self.model = model

#     def fit(self, X, y, **kwargs):
#         # Filter for scikit-learn-specific arguments or ignore all kwargs
#         sklearn_args = {k: v for k, v in kwargs.items() if k in ["sample_weight"]}
#         self.model.fit(X, y, **sklearn_args)

#     def predict(self, X):
#         return self.model.predict(X)

## **Cross-Validation: Stratified K-Fold**

In [None]:
# Option to do standard procedure: train, dev, test (sequentially by ID order) (1 run)
# Option to do k-fold (stratified): train, dev, test (order via k-fold procedure) (2+ runs)

# train - fit model
# dev - hyper-parameter tuning
# test - evaluate log-loss / accuracy / other metric

In [49]:
from sklearn.model_selection import StratifiedKFold, KFold


class CrossValidation:
    """Train and score a model on a hold-out split or on (stratified) k-fold splits.

    The features and labels are stored once as arrays. For every split the
    numerical columns can optionally be scaled, with the scaler fitted on the
    training part only and then applied to the validation part.
    """

    def __init__(
        self,
        df_X: pd.DataFrame,
        df_y: pd.DataFrame,
        numerical_scale: bool,
        numerical_features: list = None,
    ) -> None:
        """
        Parameters:
        - df_X: feature table; its ``.values`` and column positions are used.
        - df_y: labels; only ``.values`` is used.
        - numerical_scale: whether numerical columns are scaled inside each split.
        - numerical_features: names of the numerical columns of ``df_X``.
          Required when ``numerical_scale`` is True.

        Raises:
        - ValueError: if ``numerical_scale`` is True but no ``numerical_features``
          are given (previously this surfaced later as an AttributeError).
        """
        self.X = df_X.values
        self.y = df_y.values

        self.n_features = df_X.shape[1]
        self.numerical_scale = numerical_scale

        if numerical_scale and numerical_features is None:
            raise ValueError(
                "numerical_features must be provided when numerical_scale is True"
            )

        if numerical_features is None:
            # Always define both attributes so later reads cannot fail.
            self.numerical_indexes = []
            self.categorical_indexes = []
        else:
            self.numerical_indexes = self.get_numerical_indexes(
                df_X, numerical_features
            )
            self.categorical_indexes = self.get_categorical_indexes()

    def get_numerical_indexes(
        self, df_X: pd.DataFrame, num_features: list
    ) -> list[int]:
        """Return the column positions of ``num_features`` in ``df_X`` (empty if scaling is off)."""
        if not self.numerical_scale:
            return []
        return [df_X.columns.get_loc(column) for column in num_features]

    def get_categorical_indexes(self) -> list[int]:
        """Return, in ascending order, every column position that is not numerical (empty if scaling is off)."""
        if not self.numerical_scale:
            return []
        numerical = set(self.numerical_indexes)
        # Built in ascending order so the resulting column layout is deterministic.
        return [index for index in range(self.n_features) if index not in numerical]

    def check_numerical_scaling(
        self, X_train: np.array, X_validation: np.array
    ) -> tuple[np.array, np.array]:
        """Scale the numerical columns of both subsets when scaling is enabled.

        The scaler is fitted on ``X_train`` and re-used for ``X_validation``.
        NOTE: the returned arrays have a new column order — scaled numerical
        columns first, followed by the remaining columns. Any data later passed
        to the trained model must use the same layout.

        Returns:
        - (X_train, X_validation), unchanged when scaling is disabled.
        """
        if not self.numerical_scale:
            return X_train, X_validation

        # Fit on the training part only (transformer=None asks for a new one).
        train_scaled, numerical_transformer = numerical_scaling(
            X_train[:, self.numerical_indexes], None
        )
        X_train = np.concatenate(
            (train_scaled, X_train[:, self.categorical_indexes]), axis=1
        )

        # Apply the already-fitted transformer to the validation part.
        validation_scaled, _ = numerical_scaling(
            X_validation[:, self.numerical_indexes], numerical_transformer
        )
        X_validation = np.concatenate(
            (validation_scaled, X_validation[:, self.categorical_indexes]), axis=1
        )

        return X_train, X_validation

    def one_split(
        self, train_index: np.array, validation_index: np.array
    ) -> tuple[float, float]:
        """Fit ``self.model`` on one train subset and score it on the validation subset.

        Returns:
        - (accuracy, log_loss) computed on the validation subset.
        """
        # split in train and validation subsets
        X_train, X_validation = self.X[train_index], self.X[validation_index]
        y_train, y_validation = self.y[train_index], self.y[validation_index]

        X_train, X_validation = self.check_numerical_scaling(X_train, X_validation)

        # fit model — the same kwargs bag goes to fit and predict; the model
        # is expected to ignore keys it does not understand.
        self.model.fit(X_train, y_train, **self.kwargs)

        # predict label
        y_pred, y_pred_proba = self.model.predict(X_validation, **self.kwargs)

        # predict evaluation metrics
        acc = accuracy_score(y_validation, y_pred)
        ll = log_loss(y_validation, y_pred_proba)

        return acc, ll

    def run(
        self,
        model: callable,
        split_method: str = "standard",
        shuffle: bool = False,
        random_state: int = None,
        n_splits: int = 5,
        test_size: float = 0.3,
        **kwargs,
    ) -> dict:
        """
        Execute the cross-validation process with the specified model and splitting strategy.

        Parameters:
        - model (callable): The machine learning model to train and evaluate. It must
          provide ``fit(X, y, **kwargs)`` and ``predict(X, **kwargs)`` returning
          ``(y_pred, y_pred_proba)``.
        - split_method (str): The type of cross-validation split ('standard', 'kfold', 'stratified_kfold').
        - shuffle (bool): Whether to shuffle data before splitting. Honoured by all
          three strategies; with 'standard' and ``shuffle=False`` the split is sequential.
        - random_state (int, optional): Seed for random number generator. Ignored when
          ``shuffle`` is False.
        - n_splits (int): Number of folds for k-fold strategies.
        - test_size (float): Proportion of the dataset to include in the test split (for 'standard' strategy).
        - kwargs: Additional keyword arguments to pass to the model training method.

        Returns:
        - dict: Dictionary containing accuracy and log loss history and their averages.

        Raises:
        - ValueError: if ``split_method`` is not one of the supported values.
        """

        self.model = model
        self.kwargs = kwargs

        # scikit-learn's k-fold splitters reject a seed when shuffle is False,
        # so only forward the seed when it can actually have an effect.
        seed = random_state if shuffle else None

        if split_method == "standard":
            indexes = np.arange(self.X.shape[0])
            splits = [
                train_test_split(
                    indexes, test_size=test_size, shuffle=shuffle, random_state=seed
                )
            ]
        elif split_method == "kfold":
            kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
            splits = kf.split(self.X, self.y)
        elif split_method == "stratified_kfold":
            skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
            splits = skf.split(self.X, self.y)
        else:
            raise ValueError(f"Unsupported split method: {split_method}")

        # NOTE(review): the same model object is fitted on every fold. If its fit()
        # continues from the previous state instead of re-initialising, folds after
        # the first are not independent — confirm, or pass a fresh model per run.
        self.history = {"accuracy": [], "log_loss": []}
        for train_index, validation_index in splits:
            acc, ll = self.one_split(train_index, validation_index)

            self.history["accuracy"].append(acc)
            self.history["log_loss"].append(ll)

        self.history["avg_accuracy"] = np.mean(self.history["accuracy"])
        self.history["avg_log_loss"] = np.mean(self.history["log_loss"])

        return self.history

In [23]:
# param_options = {"learning_rate": [0.1, 0.01, 0.001, 0.0001], "epochs": [30, 50, 100]}
# for model in models:
#     model.fit(X_train, y_train, **kwargs)

In [52]:
import itertools

# Hyper-parameter grid. The same flat dict is handed to the model constructor
# (uses learning_rate) and to cv.run (forwards epochs / batch_size / verbose).
gridsearch_kwargs = {
    "learning_rate": [0.01, 0.001],
    "epochs": [30, 50],
    "batch_size": [50, 70],
    "verbose": [0],
}

# Every combination is scored on the SAME hold-out split; with an unseeded split
# the scores of different combinations are not comparable.
GRID_SPLIT_SEED = 42

# The data does not change between combinations, so build the evaluator once.
cv = CrossValidation(
    df_X_train,
    df_y_train["Status"],
    numerical_scale=NUMERICAL_SCALING,
    numerical_features=numerical_features,
)

# Create a list of keys and a list of lists of values
keys = list(gridsearch_kwargs.keys())
values = list(gridsearch_kwargs.values())

# Generate all combinations of the parameter values
all_combinations = itertools.product(*values)

# Train and validate one freshly built model per combination, keeping the scores
grid_results = []
for combination in all_combinations:
    params = dict(zip(keys, combination))
    print(params)

    model = SoftmaxRegressionNN(**params)

    history = cv.run(
        model, "standard", shuffle=True, random_state=GRID_SPLIT_SEED, **params
    )

    print(
        "[Validation Set] Average Accuracy: %.2f" % (history["avg_accuracy"] * 100), "%"
    )
    print("[Validation Set] Average Log-loss: %.2f \n" % history["avg_log_loss"])

    grid_results.append(
        {
            **params,
            "avg_accuracy": history["avg_accuracy"],
            "avg_log_loss": history["avg_log_loss"],
        }
    )

# Summary table, best (lowest) validation log-loss first.
# NOTE: after the loop `model` is the LAST combination trained, not the best one.
df_grid_results = pd.DataFrame(grid_results).sort_values("avg_log_loss")
df_grid_results

{'learning_rate': 0.01, 'epochs': 30, 'batch_size': 50, 'verbose': 0}
[0.01399035 0.13731262 0.848697  ]
[Validation Set] Average Accuracy: 78.92 %
[Validation Set] Average Log-loss: 0.54 

{'learning_rate': 0.01, 'epochs': 30, 'batch_size': 70, 'verbose': 0}
[0.98375064 0.01158628 0.00466314]
[Validation Set] Average Accuracy: 79.76 %
[Validation Set] Average Log-loss: 0.53 

{'learning_rate': 0.01, 'epochs': 50, 'batch_size': 50, 'verbose': 0}
[0.93076366 0.00870962 0.06052677]
[Validation Set] Average Accuracy: 77.87 %
[Validation Set] Average Log-loss: 0.61 

{'learning_rate': 0.01, 'epochs': 50, 'batch_size': 70, 'verbose': 0}
[0.06579856 0.02496652 0.90923494]
[Validation Set] Average Accuracy: 80.82 %
[Validation Set] Average Log-loss: 0.59 

{'learning_rate': 0.001, 'epochs': 30, 'batch_size': 50, 'verbose': 0}
[0.9754288  0.01470015 0.00987106]
[Validation Set] Average Accuracy: 80.44 %
[Validation Set] Average Log-loss: 0.51 

{'learning_rate': 0.001, 'epochs': 30, 'batch_siz

## **Predict Test Sample**

In [None]:
# Build the test feature matrix with the same layout the model was trained on:
# scaled numerical columns first, followed by the remaining columns.
X_test = df_X_test.values
if NUMERICAL_SCALING:
    numerical_indexes = [
        df_X_train.columns.get_loc(column) for column in numerical_features
    ]
    other_indexes = [
        index for index in range(df_X_train.shape[1]) if index not in numerical_indexes
    ]

    # Fit the scaler on training data only, then apply it to the test data.
    # NOTE(review): the model was fitted on a train split with its own scaler;
    # ideally refit the chosen model on the full training set before predicting.
    _, numerical_transformer = numerical_scaling(
        df_X_train.values[:, numerical_indexes], None
    )
    X_test_numerical, _ = numerical_scaling(
        X_test[:, numerical_indexes], numerical_transformer
    )
    X_test = np.concatenate((X_test_numerical, X_test[:, other_indexes]), axis=1)

# Neural network
# SoftmaxRegressionNN.predict already applies the softmax and returns
# (labels, probabilities); only the probabilities go into the submission.
_, y_test_pred_proba = model.predict(X_test)


# Prepare submission DataFrame
df_y_test_pred_proba = pd.DataFrame(
    y_test_pred_proba, columns=["Status_C", "Status_CL", "Status_D"]
)

# Align by position, not by index label, so ids cannot be mismatched.
df_y_test_pred_proba = pd.concat(
    [df_y_test["id"].reset_index(drop=True), df_y_test_pred_proba], axis=1
)

display(df_y_test_pred_proba.head())
display(df_y_test_pred_proba.tail())

## **Submission**

In [None]:
# Persist the predictions as the submission file, but only when KAGGLE is enabled.
if KAGGLE:
    # index=False keeps the DataFrame index out of the CSV.
    df_y_test_pred_proba.to_csv(path_or_buf="submission.csv", index=False)

**NEXT STEPS**