In [3]:
import warnings
warnings.filterwarnings("ignore") 
import pandas as pd
import polars as pl
from catboost import CatBoostClassifier

In [4]:
# Folder that holds the train/test CSV exports.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept because other cells may read it.
dir = r"/kaggle/input/fyp-dataset2/"

# Read both splits with polars, training file first.
train_df, test_df = (
    pl.read_csv(dir + file_name)
    for file_name in ("training_full.csv", "testing_full.csv")
)

In [5]:
def preprocess(df):
    """Split a frame into model features and impairment targets.

    Every column whose name starts with ``impaired`` is treated as a target
    column. All of them are removed from the features, together with ``SEQN``
    and ``func_score``; only the first three (in column order) are returned
    as targets.

    The target columns are discovered on ``df`` itself rather than on a
    module-level ``train_df``, so the function works on any frame and does not
    depend on which notebook cells have already been run.

    Args:
        df: Frame exposing ``columns``, ``drop``, ``select`` and ``shape``
            (a polars DataFrame in this notebook).

    Returns:
        Tuple ``(feature_df, target_df)``.
    """
    impaired_list = [col for col in df.columns if col.startswith('impaired')]
    feature_df = df.drop(['SEQN', 'func_score'] + impaired_list)
    # Selecting by plain column names; only the first three targets are kept.
    target_df = df.select(impaired_list[:3])
    print(f"Feature shape = {feature_df.shape}")
    print(f"Target shape = {target_df.shape}")
    return feature_df, target_df

In [6]:
# Split each polars frame into features / targets (shapes are printed by
# preprocess; the bare print() separates the two reports).
x_train, y_train = preprocess(train_df)
print()
x_test, y_test = preprocess(test_df)

# Convert all four polars frames to pandas, in the same order as before.
x_train, x_test, y_train, y_test = (
    frame.to_pandas() for frame in (x_train, x_test, y_train, y_test)
)

Feature shape = (2221, 23)
Target shape = (2221, 3)

Feature shape = (556, 23)
Target shape = (556, 3)


# Modeling: CatBoost baseline

In [7]:
from sklearn.metrics import roc_auc_score

def result(predict, y_test):
    """Print the ROC AUC of the second probability column against the labels.

    Args:
        predict: 2-D array-like indexed as ``predict[:, 1]``; column 1 is
            used as the score.
        y_test: True labels passed to ``roc_auc_score``.
    """
    print("")
    positive_scores = predict[:, 1]
    auc_value = roc_auc_score(y_test, positive_scores)
    rounded_auc = round(auc_value, 4)
    print(f"Testing AUC: {rounded_auc}")

In [8]:
# Shared modelling configuration used by every training cell below.
random_state = 975  # seed passed to each CatBoostClassifier
device = 'CPU'      # forwarded as CatBoost's task_type
# Target column names: impaired_1 .. impaired_3
targets = ['impaired_' + str(number) for number in (1, 2, 3)]

In [9]:
# Baseline: one CatBoost model per target with library-default
# hyper-parameters, scored on the test frame.
for target in targets:
    model = CatBoostClassifier(
        task_type=device,
        silent=True,
        random_state=random_state,
    )
    model.fit(x_train, y_train[target])
    y_pred = model.predict_proba(x_test)

    print(f"Target Variable: {target}")
    print("")
    # Show the class counts of the test labels next to the score.
    display(y_test[target].value_counts())
    result(y_pred, y_test[target])
    print("=" * 50)

Target Variable: impaired_1



impaired_1
0    520
1     36
Name: count, dtype: int64


Testing AUC: 0.6514
Target Variable: impaired_2



impaired_2
0    545
1     11
Name: count, dtype: int64


Testing AUC: 0.6324
Target Variable: impaired_3



impaired_3
0    553
1      3
Name: count, dtype: int64


Testing AUC: 0.7511


# Modeling: CatBoost with Optuna

In [10]:
import optuna
# Set Optuna's logging threshold to WARNING for the rest of the notebook.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [11]:
# Optuna tune
def objective(trial, n_splits=3):
    """Optuna objective: mean cross-validated ROC AUC of a CatBoost model.

    The hyper-parameters are sampled from ``trial`` and scored with
    stratified k-fold cross-validation on the training data only. The test
    frame is deliberately not touched here: scoring trials on ``x_test``
    would select hyper-parameters on the same rows that the final cell
    reports on, which makes the reported AUC optimistically biased.

    Reads the module-level ``target``, ``x_train``, ``y_train``, ``device``
    and ``random_state``; ``target`` must be set by the caller (the tuning
    loop) before the study is optimised.

    Args:
        trial: ``optuna.trial.Trial`` used to sample the hyper-parameters.
        n_splits: Number of stratified folds. Each trial fits ``n_splits``
            models, so the runtime grows linearly with this value.

    Returns:
        Mean validation ROC AUC across the folds (to be maximised).
    """
    # Local import so this cell does not depend on an extra import cell.
    from sklearn.model_selection import StratifiedKFold

    params = {
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        'iterations': trial.suggest_int('iterations', 800, 1200),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.10),
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        # suggest_loguniform is deprecated; suggest_float(log=True) samples
        # from the same log-uniform range.
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.01, 10.0, log=True),
        'colsample_bylevel': trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli"]
        )
    }

    # Each bootstrap type gets its own extra parameter.
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.3, 1)

    labels = y_train[target]
    # Stratified folds keep positives in every validation fold as long as
    # the positive class has at least n_splits members.
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_aucs = []
    for fit_idx, valid_idx in splitter.split(x_train, labels):
        optuna_model = CatBoostClassifier(
            **params, task_type=device, silent=True, random_state=random_state
        )
        optuna_model.fit(x_train.iloc[fit_idx], labels.iloc[fit_idx])
        valid_scores = optuna_model.predict_proba(x_train.iloc[valid_idx])[:, 1]
        fold_aucs.append(roc_auc_score(labels.iloc[valid_idx], valid_scores))

    return sum(fold_aucs) / len(fold_aucs)

In [12]:
# # CODE MANNIX
# def objective(trial):
#     params = {
#         'eval_metric': 'AUC',
#         'loss_function': 'Logloss',
#         'iterations':  trial.suggest_int('iterations', 800, 1200),
#         'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.10),
#         'max_depth': trial.suggest_int('max_depth', 4, 10),
#         'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.01, 10.0),
#         'subsample': trial.suggest_float('subsample', 0.3, 1.0)
#     }
    

#     optuna_model = CatBoostClassifier(**params, task_type=device, silent = True, random_state=random_state)
#     optuna_model.fit(x_train, y_train[target])
#     y_pred = optuna_model.predict_proba(x_test)
#     test_auc = roc_auc_score(y_test[target], y_pred[:, 1])
#     return test_auc

In [13]:
def optuna_plot(study):
    """Build and show two Optuna figures for ``study``.

    The parallel-coordinate plot is created and shown first, the
    parameter-importance plot second. Both figures are created before
    either one is shown.
    """
    plot_builders = (
        optuna.visualization.plot_parallel_coordinate,
        optuna.visualization.plot_param_importances,
    )
    figures = [build(study) for build in plot_builders]
    for figure in figures:
        figure.show()

In [14]:
def best_params(study):
    """Print a short report of the study's best trial and return that trial.

    The report lists the number of trials, the best trial's value and each
    of its hyper-parameters on its own line.

    Args:
        study: Object exposing ``trials`` and ``best_trial``; the best trial
            must expose ``value`` and ``params``.

    Returns:
        ``study.best_trial``.
    """
    print()
    print(f'Trials = {len(study.trials)}')

    winner = study.best_trial
    report_lines = [
        f'Best trial result = {winner.value}',
        '  Hyper-Params: ',
    ]
    report_lines.extend(
        f'    {name}: {setting}' for name, setting in winner.params.items()
    )
    print("\n".join(report_lines))

    # Plots are left off here; call optuna_plot(study) to draw them.
    return winner

In [15]:
# Number of Optuna trials run for each target.
n_trials = 1_000
# Collects the best trial of each target, in the order of `targets`.
best_trial = list()

In [None]:
%%time

# Run one Optuna study per target and keep each study's best trial.
# `objective` reads the module-level `target` set by this loop, so the loop
# variable must keep exactly this name.

# Start from an empty list so the cell can be re-run: the final cell indexes
# best_trial[i] by position in `targets`, which breaks if results from an
# earlier run are still in the list.
best_trial.clear()

for target in targets:
    study = optuna.create_study(
        direction='maximize',
        # Seed the (default) TPE sampler so the search is reproducible.
        sampler=optuna.samplers.TPESampler(seed=random_state),
    )
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True)
    print("")
    print(f"Target Variable: {target}")
    print("")
    trial = best_params(study)
    best_trial.append(trial)
    print("="*200)

# Final Result

In [None]:
# Refit one model per target with the hyper-parameters of its best trial
# and report the AUC on the test frame.
for i, target in enumerate(targets):
    params = best_trial[i].params
    model = CatBoostClassifier(
        **params,
        task_type=device,
        silent=True,
        random_state=random_state,
    )
    model.fit(x_train, y_train[target])
    y_pred = model.predict_proba(x_test)

    print(f"Target Variable: {target}")
    result(y_pred, y_test[target])
    print("=" * 50)