In [1]:
%load_ext autoreload
%autoreload 2

#### **Libraries**

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
    LabelEncoder,
)

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, accuracy_score

from catboost import CatBoostClassifier

#### **Config**

In [3]:
config = {
    "kaggle": False,
    "categorical_features": [
        "Drug",
        "Sex",
        "Ascites",
        "Hepatomegaly",
        "Edema",
        "Spiders",
    ],
    "numerical_features": [
        "N_Days",
        "Age",
        "Bilirubin",
        "Cholesterol",
        "Albumin",
        "Copper",
        "Alk_Phos",
        "SGOT",
        "Tryglicerides",
        "Platelets",
        "Prothrombin",
        "Stage",
    ],
    "target": "Status",
    "label_order": ["D", "CL", "C"],
    "random_seed": 42,
}

#### **Pre-Process Data**

In [4]:
def load_data(kaggle: bool) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    if kaggle:
        train = pd.read_csv("/kaggle/input/playground-series-s3e26/train.csv")
        test = pd.read_csv("/kaggle/input/playground-series-s3e26/test.csv")
        original = pd.read_csv(
            "/kaggle/input/cirrhosis-patient-survival-prediction/cirrhosis.csv"
        )
        sub = pd.read_csv("/kaggle/input/playground-series-s3e26/sample_submission.csv")
    else:
        train = pd.read_csv("./data/train.csv")
        test = pd.read_csv("./data/test.csv")
        original = pd.read_csv("./data/cirrhosis.csv")
        sub = pd.read_csv("./data/sample_submission.csv")

    return train, test, original, sub


def get_numerical_and_categorical_indexes(
    df: pd.DataFrame, numerical_features: list[str]
) -> tuple[list, list]:
    """ """

    n_features = df.shape[1]

    numerical_indexes = [df.columns.get_loc(column) for column in numerical_features]

    categorical_indexes = list(set(np.arange(n_features)) - set(numerical_indexes))

    return numerical_indexes, categorical_indexes


def categorical_preprocess(
    df: pd.DataFrame, features: list[str], encoder: str
) -> pd.DataFrame:
    df_ = df.copy(deep=True)

    if encoder == "ordinal":
        encoder = OrdinalEncoder(handle_unknown="error")
    elif encoder == "one-hot":
        encoder = OneHotEncoder(handle_unknown="error")

    df_[features] = encoder.fit_transform(df_[features])

    return df_


def target_preprocess(
    df: pd.DataFrame, target: str, label_order: list[str]
) -> pd.DataFrame:
    df_ = df.copy(deep=True)

    encoder = LabelEncoder()
    encoder.fit(label_order)

    df_[target] = encoder.transform(df_[target])

    return df_


def numerical_preprocess(
    x_train: np.array, 
    x_test: np.array, 
    numerical_indexes: list, 
    categorical_indexes: list
) -> tuple[np.array, np.array]:

    scaler = NumericalScaling(numerical_indexes, categorical_indexes)

    x_train = scaler.run(x_train, use_saved_transformer=False)
    x_test = scaler.run(x_test, use_saved_transformer=True)

    return x_train, x_test

class NumericalScaling:
    def __init__(self, numerical_indexes: list, categorical_indexes: list):
        self.numerical_indexes = numerical_indexes
        self.categorical_indexes = categorical_indexes

    def run(self, X_values: np.array, use_saved_transformer: bool) -> np.array:
        if not use_saved_transformer:
            # create transformer
            self.transformer = StandardScaler()

            # fit the transformer and get scaled data
            data = self.transformer.fit_transform(X_values[:, self.numerical_indexes])

        else:
            # scale data using existing transformer
            data = self.transformer.transform(X_values[:, self.numerical_indexes])

        X_values_ = np.concatenate(
            (data, X_values[:, self.categorical_indexes]), axis=1
        )

        return X_values_

In [34]:
train, test, original, sub = load_data(config["kaggle"])

config["numerical_indexes"], config["categorical_indexes"] = (
    get_numerical_and_categorical_indexes(
        train.drop(["id", "Status"], axis=1), config["numerical_features"]
    )
)

config["features"] = config["numerical_features"] + config["categorical_features"]

# train["is_generated"] = 1
# test["is_generated"] = 1
# original["is_generated"] = 0

In [36]:
# Process each DataFrame
for df_name in ["train", "test", "original"]:
    globals()[df_name] = categorical_preprocess(
        globals()[df_name], config["categorical_features"], "ordinal"
    )

    if df_name == "train":
        globals()[df_name] = target_preprocess(
            globals()[df_name], config["target"], config["label_order"]
        )

In [37]:
original.head()

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,0.0,21464,0.0,1.0,1.0,1.0,2.0,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,0.0,20617,0.0,0.0,1.0,1.0,0.0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,0.0,25594,1.0,0.0,0.0,0.0,1.0,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,0.0,19994,0.0,0.0,1.0,1.0,1.0,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,1.0,13918,0.0,0.0,1.0,1.0,0.0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [38]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    train[config["features"]].values,
    train[config["target"]].values,
    test_size=0.3,
    random_state=config["random_seed"],
)

x_train, x_test = numerical_preprocess(
    x_train, x_test, config["numerical_indexes"], config["categorical_indexes"]
)

#### **Grid Search + Cross-Validation**

In [13]:
"""
- iterations : number of boosting iterations (trees) to be used.
- learning_rate : controls the step size at each iteration while moving toward a minimum of the loss function.
- max_depth : maximum depth of individual trees.
- min_child_samples: minimum numberearly_stopping_rounds of samples required to be at a leaf node.
- early_stopping_rounds : number of rounds with no improvement after which training will be stopped.
- reg_lambda: L2 regularization term on weighs.
- subsample : Fraction of samples to use for fitting each tree, providing a trade-off between model robustness and randomness.
- bootstrap_type : Controls the method used to sample data for each tree in the ensemble.
"""

if not config["kaggle"]:
    param_grid = {
        "iterations": [50, 100, 200, 250, 500],
        "learning_rate": [0.01, 0.1, 0.25, 0.5],
        "max_depth": [5, 7, 9],
        # "min_child_samples": [10, 15, 20],
        "early_stopping_rounds": [20],
        "reg_lambda": [0.5, 0.6, 0.65, 0.7],
        # "subsample": [0.6, 0.8, 1],
        # "bootstrap_type": ["Bernoulli"]
    }

    clf = CatBoostClassifier(
        objective="MultiClass",
        random_seed=config["random_seed"],
        verbose=False,
    )
    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5)
    results = grid_search.fit(x_train, y_train, plot=False)
    results.best_estimator_.get_params(), results.best_score_

{'iterations': 250,
 'learning_rate': 0.1,
 'random_seed': 42,
 'verbose': False,
 'max_depth': 5,
 'reg_lambda': 0.65,
 'objective': 'MultiClass',
 'early_stopping_rounds': 20}

### **Predict**

In [14]:
parameters = {
    "iterations": 250,
    "learning_rate": 0.1,
    "random_seed": 42,
    "verbose": False,
    "max_depth": 5,
    "reg_lambda": 0.65,
    "objective": "MultiClass",
    "early_stopping_rounds": 20,
}

model = CatBoostClassifier(**parameters)
model.fit(x_train, y_train)

y_test_pred = model.predict(x_test)
y_test_pred_proba = model.predict_proba(x_test)

In [31]:
acc = accuracy_score(y_test, y_test_pred)
ll = log_loss(y_test, y_test_pred_proba)

print(
    "Accuracy: %.2f" % (acc * 100),
    "%",  "| Log Loss: %.4f \n" % ll
)

Accuracy: 82.72 % | Log Loss: 0.4496 



#### **Kaggle Submission**

In [43]:
if config["kaggle"]:
    # pre-process data
    x_train_sub = train[config["features"]].values
    y_train_sub = train[config["target"]].values
    x_test_sub = test[config["features"]].values

    x_train_sub, x_test_sub = numerical_preprocess(
        x_train_sub, x_test_sub, config["numerical_indexes"], config["categorical_indexes"]
    )

    # fit model
    model = CatBoostClassifier(**parameters)
    model.fit(x_train_sub, y_train_sub)

    # predict y_values
    y_test_pred_sub = model.predict(x_test_sub)
    y_test_pred_proba_sub = model.predict_proba(x_test_sub)

    # create submission dataframe
    submission = pd.DataFrame(
        y_test_pred_proba_sub, columns=["Status_D", "Status_CL", "Status_C"]
    )
    submission = pd.concat([test["id"], submission], axis=1)

    # save submission to a CSV file
    submission.to_csv("submission.csv", index=False)