In [1]:
import warnings
warnings.filterwarnings("ignore") 
import pandas as pd
import polars as pl
from catboost import CatBoostClassifier

In [2]:
dir = r"/kaggle/input/fyp-dataset/"

train_df = pl.read_csv(dir+"training_dataset.csv")
test_df =pl.read_csv(dir+"testing_dataset.csv")

In [3]:
def preprocess(df):
    impaired_list = [col for col in train_df.columns if col.startswith('impaired')]
    feature_df = df.drop(['SEQN', 'func_score'] + impaired_list)
    target_df = df.select(pl.col(impaired_list[:3]))
    print(f"Feature shape = {feature_df.shape}")
    print(f"Target shape = {target_df.shape}")
    return feature_df, target_df

In [4]:
x_train, y_train = preprocess(train_df)
print()
x_test, y_test = preprocess(test_df)

x_train = x_train.to_pandas()
x_test = x_test.to_pandas()
y_train = y_train.to_pandas()
y_test = y_test.to_pandas()

Feature shape = (2221, 23)
Target shape = (2221, 3)

Feature shape = (556, 23)
Target shape = (556, 3)


# Modeling: Catboost baseline

In [5]:
from sklearn.metrics import roc_auc_score

def result(predict,y_test):
    print("")
    test_auc = roc_auc_score(y_test, predict[:, 1])
    print(f"Testing AUC: {round(test_auc, 4)}")

In [6]:
device = 'CPU'
targets = [f'impaired_{i}' for i in range(1, 4)]
random_state = 1024

In [7]:
for target in targets:
    model = CatBoostClassifier(task_type=device, silent = True, random_state=random_state)
    model.fit(x_train, y_train[target])
    y_pred = model.predict_proba(x_test)
    
    print(f"Target Variable: {target}")
    print("")
    display(y_test[target].value_counts())
    result(y_pred, y_test[target])
    print("="*50)

Target Variable: impaired_1



impaired_1
0    520
1     36
Name: count, dtype: int64


Testing AUC: 0.6506
Target Variable: impaired_2



impaired_2
0    545
1     11
Name: count, dtype: int64


Testing AUC: 0.6015
Target Variable: impaired_3



impaired_3
0    553
1      3
Name: count, dtype: int64


Testing AUC: 0.66


# Modeling: Catboost with Optuna

In [8]:
import optuna

def objective(trial):
    params = {
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        'iterations':  trial.suggest_int('iterations', 800, 1200),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.10),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.01, 10.0),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0)
    }
    

    optuna_model = CatBoostClassifier(**params, task_type=device, silent = True, random_state=random_state)
    optuna_model.fit(x_train, y_train[target])
    y_pred = optuna_model.predict_proba(x_test)
    test_auc = roc_auc_score(y_test[target], y_pred[:, 1])
    return test_auc

In [9]:
def optuna_plot(study):
    fig = optuna.visualization.plot_parallel_coordinate(study)
    fig1 = optuna.visualization.plot_param_importances(study)
    fig.show()
    fig1.show()

In [10]:
def best_params(study):
    print()
    print(f'Trials = {len(study.trials)}')
    trial = study.best_trial
    print(f'Best trial result = {trial.value}')
    print('  Hyper-Params: ')
    for key, value in trial.params.items():
        print(f'    {key}: {value}')

    optuna_plot(study)
    return trial

In [11]:
best_trial = []
n_trials = 2

In [12]:
%%time

for target in targets:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True)
    print("")
    print(f"Target Variable: {target}")
    print("")
    trial = best_params(study)
    best_trial.append(trial)
    print("="*200)

[I 2024-02-06 04:09:00,232] A new study created in memory with name: no-name-e5688716-3aff-4dbd-8a79-ad904c5196d2
[I 2024-02-06 04:09:05,003] Trial 0 finished with value: 0.6113247863247864 and parameters: {'iterations': 1199, 'learning_rate': 0.0033495306962680648, 'max_depth': 6, 'l2_leaf_reg': 0.06205023508446743, 'subsample': 0.892484853469802}. Best is trial 0 with value: 0.6113247863247864.
[I 2024-02-06 04:09:26,202] Trial 1 finished with value: 0.6408119658119658 and parameters: {'iterations': 1179, 'learning_rate': 0.014618775492362152, 'max_depth': 9, 'l2_leaf_reg': 5.234568078598188, 'subsample': 0.9736398285709145}. Best is trial 1 with value: 0.6408119658119658.



Target Variable: impaired_1


Trials = 2
Best trial result = 0.6408119658119658
  Hyper-Params: 
    iterations: 1179
    learning_rate: 0.014618775492362152
    max_depth: 9
    l2_leaf_reg: 5.234568078598188
    subsample: 0.9736398285709145


[I 2024-02-06 04:09:27,475] A new study created in memory with name: no-name-a99ba121-565e-40c1-b329-d7b735adc460




[I 2024-02-06 04:09:29,925] Trial 0 finished with value: 0.6363636363636364 and parameters: {'iterations': 1090, 'learning_rate': 0.03315076257796889, 'max_depth': 4, 'l2_leaf_reg': 1.2291040352072848, 'subsample': 0.8684830335969882}. Best is trial 0 with value: 0.6363636363636364.
[I 2024-02-06 04:09:36,558] Trial 1 finished with value: 0.6165137614678899 and parameters: {'iterations': 1102, 'learning_rate': 0.03289998710010241, 'max_depth': 7, 'l2_leaf_reg': 0.3626151141927054, 'subsample': 0.5991099456330999}. Best is trial 0 with value: 0.6363636363636364.



Target Variable: impaired_2


Trials = 2
Best trial result = 0.6363636363636364
  Hyper-Params: 
    iterations: 1090
    learning_rate: 0.03315076257796889
    max_depth: 4
    l2_leaf_reg: 1.2291040352072848
    subsample: 0.8684830335969882


[I 2024-02-06 04:09:36,837] A new study created in memory with name: no-name-2cd391d6-d5f1-4d10-bffb-85ce88192dd4




[I 2024-02-06 04:09:39,466] Trial 0 finished with value: 0.7203134418324292 and parameters: {'iterations': 932, 'learning_rate': 0.02877899974197153, 'max_depth': 5, 'l2_leaf_reg': 0.56069343916265, 'subsample': 0.6028699923215467}. Best is trial 0 with value: 0.7203134418324292.
[I 2024-02-06 04:09:45,901] Trial 1 finished with value: 0.6612417118746232 and parameters: {'iterations': 1097, 'learning_rate': 0.022274828540088765, 'max_depth': 7, 'l2_leaf_reg': 0.014356061876363933, 'subsample': 0.9100548054366722}. Best is trial 0 with value: 0.7203134418324292.



Target Variable: impaired_3


Trials = 2
Best trial result = 0.7203134418324292
  Hyper-Params: 
    iterations: 932
    learning_rate: 0.02877899974197153
    max_depth: 5
    l2_leaf_reg: 0.56069343916265
    subsample: 0.6028699923215467


CPU times: user 2min 34s, sys: 10.8 s, total: 2min 45s
Wall time: 45.9 s


# Final Result

In [13]:
for i, target in enumerate(targets):
    params = best_trial[i].params
    model = CatBoostClassifier(**params, task_type=device, silent = True, random_state=1024)
    model.fit(x_train, y_train[target])
    y_pred = model.predict_proba(x_test)
    
    print(f"Target Variable: {target}")
    result(y_pred, y_test[target])
    print("="*50)

Target Variable: impaired_1

Testing AUC: 0.6408
Target Variable: impaired_2

Testing AUC: 0.6364
Target Variable: impaired_3

Testing AUC: 0.7203
