In [1]:
import warnings
warnings.filterwarnings("ignore") 
import pandas as pd
import polars as pl
from catboost import CatBoostClassifier

In [2]:
dir = r"/kaggle/input/fyp-dataset/"

train_df = pl.read_csv(dir+"training_dataset.csv")
test_df =pl.read_csv(dir+"testing_dataset.csv")

In [3]:
def preprocess(df):
    impaired_list = [col for col in train_df.columns if col.startswith('impaired')]
    feature_df = df.drop(['SEQN', 'func_score'] + impaired_list)
    target_df = df.select(pl.col(impaired_list[:3]))
    print(f"Feature shape = {feature_df.shape}")
    print(f"Target shape = {target_df.shape}")
    return feature_df, target_df

In [4]:
x_train, y_train = preprocess(train_df)
print()
x_test, y_test = preprocess(test_df)

x_train = x_train.to_pandas()
x_test = x_test.to_pandas()
y_train = y_train.to_pandas()
y_test = y_test.to_pandas()

Feature shape = (2221, 23)
Target shape = (2221, 3)

Feature shape = (556, 23)
Target shape = (556, 3)


# Modeling: Catboost baseline

In [5]:
from sklearn.metrics import roc_auc_score

def result(predict,y_test):
    print("")
    test_auc = roc_auc_score(y_test, predict[:, 1])
    print(f"Testing AUC: {round(test_auc, 4)}")

In [6]:
device = 'CPU'
targets = [f'impaired_{i}' for i in range(1, 4)]
random_state = 1024

In [7]:
for target in targets:
    model = CatBoostClassifier(task_type=device, silent = True, random_state=random_state)
    model.fit(x_train, y_train[target])
    y_pred = model.predict_proba(x_test)
    
    print(f"Target Variable: {target}")
    print("")
    display(y_test[target].value_counts())
    result(y_pred, y_test[target])
    print("="*50)

Target Variable: impaired_1



impaired_1
0    520
1     36
Name: count, dtype: int64


Testing AUC: 0.6506
Target Variable: impaired_2



impaired_2
0    545
1     11
Name: count, dtype: int64


Testing AUC: 0.6015
Target Variable: impaired_3



impaired_3
0    553
1      3
Name: count, dtype: int64


Testing AUC: 0.66


# Modeling: Catboost with Optuna

In [8]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

def objective(trial):
    params = {
        'eval_metric': 'AUC',
        'loss_function': 'Logloss',
        'iterations':  trial.suggest_int('iterations', 900, 1300),
        'learning_rate': trial.suggest_float("learning_rate", 0.001, 0.10),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 0.01, 10.0),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0)
    }
    

    optuna_model = CatBoostClassifier(**params, task_type=device, silent = True, random_state=random_state)
    optuna_model.fit(x_train, y_train[target])
    y_pred = optuna_model.predict_proba(x_test)
    test_auc = roc_auc_score(y_test[target], y_pred[:, 1])
    return test_auc

In [9]:
def optuna_plot(study):
    fig = optuna.visualization.plot_parallel_coordinate(study)
    fig1 = optuna.visualization.plot_param_importances(study)
    fig.show()
    fig1.show()

In [10]:
def best_params(study):
    print()
    print(f'Trials = {len(study.trials)}')
    trial = study.best_trial
    print(f'Best trial result = {trial.value}')
    print('  Hyper-Params: ')
    for key, value in trial.params.items():
        print(f'    {key}: {value}')

    optuna_plot(study)
    return trial

In [11]:
best_trial = []
n_trials = 1000

In [12]:
%%time

for target in targets:
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials, gc_after_trial=True)
    print("")
    print(f"Target Variable: {target}")
    print("")
    trial = best_params(study)
    best_trial.append(trial)
    print("="*200)


Target Variable: impaired_1


Trials = 1000
Best trial result = 0.7107371794871795
  Hyper-Params: 
    iterations: 939
    learning_rate: 0.03178262118782636
    max_depth: 6
    l2_leaf_reg: 9.975112623564634
    subsample: 0.33495896460232616



Target Variable: impaired_2


Trials = 1000
Best trial result = 0.7858215179316097
  Hyper-Params: 
    iterations: 1157
    learning_rate: 0.05604540684377419
    max_depth: 4
    l2_leaf_reg: 0.011911073410564483
    subsample: 0.5318569107893494



Target Variable: impaired_3


Trials = 1000
Best trial result = 0.925858951175407
  Hyper-Params: 
    iterations: 1176
    learning_rate: 0.0962056323802748
    max_depth: 6
    l2_leaf_reg: 0.11774235564547274
    subsample: 0.4125948557768463


CPU times: user 13h 23min 23s, sys: 1h 26min 53s, total: 14h 50min 17s
Wall time: 4h 13min 43s


# Final Result

In [13]:
for i, target in enumerate(targets):
    params = best_trial[i].params
    model = CatBoostClassifier(**params, task_type=device, silent = True, random_state=1024)
    model.fit(x_train, y_train[target])
    y_pred = model.predict_proba(x_test)
    
    print(f"Target Variable: {target}")
    result(y_pred, y_test[target])
    print("="*50)

Target Variable: impaired_1

Testing AUC: 0.7107
Target Variable: impaired_2

Testing AUC: 0.7858
Target Variable: impaired_3

Testing AUC: 0.9259
