# Use hyperparameters from: https://www.kaggle.com/code/tauilabdelilah/icr-hyperparameter-tuning-optuna version 12 Trial 1790

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import random
import optuna

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training table (which carries a `kfold` column used further down)
# and the competition test table, in that order.
df, Test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/icr-create-folds/train_folds.csv',
        '/kaggle/input/icr-identify-age-related-conditions/test.csv',
    )
)

In [None]:
# Encode the two-level column EJ as integers (A -> 0, B -> 1) in both tables.
_ej_codes = {'A': 0, 'B': 1}
for _frame in (df, Test):
    _frame['EJ'] = _frame['EJ'].replace(_ej_codes)

# Four column names end with a space; rename them to the unpadded form.
_clean_names = {'BD ': 'BD', 'CD ': 'CD', 'CW ': 'CW', 'FD ': 'FD'}
df = df.rename(columns=_clean_names)
Test = Test.rename(columns=_clean_names)

In [None]:
# Input columns fed to the model, one source row per leading letter.
# 'BC' and 'CL' are intentionally left out; the encoded 'EJ' comes last.
features = (
    'AB AF AH AM AR AX AY AZ '
    'BD BN BP BQ BR BZ '
    'CB CC CD CF CH '
    'CR CS CU CW '
    'DA DE DF DH DI DL DN DU DV DY '
    'EB EE EG EH EL EP EU '
    'FC FD FE FI FL FR FS '
    'GB GE GF GH GI GL EJ'
).split()

# Target column name, picked positionally: the second-to-last column of df.
label = df.columns[-2]

In [None]:
def balance_logloss(y_true, y_pred):
    """Return the class-weighted ("balanced") log loss of binary predictions.

    Each row of ``y_pred`` is first rescaled to sum to one and then clipped to
    ``[1e-15, 1 - 1e-15]`` so the logarithm stays finite. The per-class mean
    log losses are combined with inverse-class-frequency weights.

    Args:
        y_true: 1-D array-like of integer labels (0 or 1). Both classes must
            be present, otherwise indexing the class counts raises IndexError.
        y_pred: 2-D array-like of shape (n_samples, 2); column 0 is the score
            for class 0 and column 1 the score for class 1. Rows do not need
            to sum to one.

    Returns:
        The weighted log loss as a float.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred, dtype=float)

    # Rescale every row to sum to one. The original computed this quotient
    # but threw the result away, so unnormalised inputs were scored as-is.
    row_sums = y_pred.sum(axis=1, keepdims=True)
    # An all-zero row cannot be rescaled; divide by 1 and let the clip below
    # turn it into the smallest allowed probability instead of NaN.
    y_pred = y_pred / np.where(row_sums == 0, 1.0, row_sums)
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)

    nc = np.bincount(y_true)
    n_samples = y_true.shape[0]
    # Inverse class frequency: the rarer class gets the larger weight.
    w0, w1 = 1 / (nc[0] / n_samples), 1 / (nc[1] / n_samples)

    is_class0 = (y_true == 0)
    log_sum0 = np.sum(is_class0 * np.log(y_pred[:, 0]))
    log_sum1 = np.sum((~is_class0) * np.log(y_pred[:, 1]))

    logloss = (-w0 / nc[0] * log_sum0 - w1 / nc[1] * log_sum1) / (w0 + w1)

    return logloss

In [None]:
def seed_everything(seed):
    """Seed Python's `random` module and NumPy's global RNG with `seed`."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix both generators once for the whole run.
seed_everything(42)

In [None]:
def calc_log_loss_weight(y_true):
    """Return the inverse class frequencies ``(w0, w1)`` of a 0/1 label vector.

    Each weight is ``1 / (count_of_class / number_of_samples)``, so the rarer
    class receives the larger weight.
    """
    class_counts = np.bincount(y_true)
    n_samples = y_true.shape[0]
    inverse_freq = [1 / (class_counts[cls] / n_samples) for cls in (0, 1)]
    return inverse_freq[0], inverse_freq[1]

# LGB

In [None]:
# Accumulators filled by the fold loop below:
#   final_valid_predictions: {sample Id -> [p(class 0), p(class 1)]} out-of-fold
#   final_test_predictions:  one (n_test, 2) prediction array per fold
#   bs:                      balanced log loss of each fold
final_valid_predictions = {}
final_test_predictions = []
bs = []

# Hyperparameters are the same for every fold, so they are defined once.
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'goss',
    'learning_rate': 0.09110460114828077,
    'num_leaves': 8,
    'feature_fraction': 0.4989639912997521,
    'bagging_fraction': 0.54872439795985,
    'lambda_l1': 1.4522184914523175,
    'lambda_l2': 1.7873553090132748e-08,
    'n_jobs': -1,
    'is_unbalance': True,
    'verbose': -1,
    'seed': 42,
}

for k in range(5):
    print('------------------ Fold: '+str(k))
    # Rows whose `kfold` equals k are held out; the rest are used for fitting.
    train     = df[df['kfold'] != k].reset_index(drop=True)
    val       = df[df['kfold'] == k].reset_index(drop=True)
    valid_ids = val.Id.values.tolist()

    # Inverse-class-frequency weights, computed separately for each split.
    train_w0, train_w1 = calc_log_loss_weight(train[label])
    valid_w0, valid_w1 = calc_log_loss_weight(val[label])

    train_dataset = lgb.Dataset(
        train[features],
        train[label],
        weight=train[label].map({0: train_w0, 1: train_w1}),
        categorical_feature=["EJ"],
    )
    eval_dataset = lgb.Dataset(
        val[features],
        val[label],
        weight=val[label].map({0: valid_w0, 1: valid_w1}),
        categorical_feature=["EJ"],
    )

    # Early stopping and logging are passed as callbacks: the former
    # `early_stopping_rounds` / `verbose_eval` keyword arguments of
    # `lgb.train` were removed in LightGBM 4.0 and raise TypeError there.
    model = lgb.train(
        params=lgb_params,
        train_set=train_dataset,
        num_boost_round=50000,
        valid_sets=[train_dataset, eval_dataset],
        callbacks=[
            lgb.early_stopping(stopping_rounds=20),
            lgb.log_evaluation(period=10000),
        ],
    )

    # The model returns a single score per row; expand it to two columns
    # [1 - p, p] so it matches the (class_0, class_1) layout used downstream.
    preds_valid = model.predict(val[features])
    preds_test  = model.predict(Test[features])
    preds_valid = np.vstack([1 - preds_valid, preds_valid]).T
    preds_test  = np.vstack([1 - preds_test, preds_test]).T

    final_test_predictions.append(preds_test)
    final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))
    blogloss = balance_logloss(val[label], preds_valid)

    bs.append(blogloss)
    print(k, blogloss)

# Per-fold scores followed by their mean and standard deviation.
print('Balance Log loss:')
print(bs)
print(np.mean(bs), np.std(bs))

In [None]:
# Turn the {Id: [p0, p1]} out-of-fold mapping into a table and save it.
final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ['Id', 'class_0', 'class_1']
final_valid_predictions.to_csv(r"oof.csv", index=False)

# Average the per-fold test predictions over however many folds were collected
# (previously hard-coded to exactly five list entries). Summing left to right
# and dividing once keeps the arithmetic identical to the five-fold original.
final_test_predictions = sum(final_test_predictions) / len(final_test_predictions)

# One row per test Id with its averaged class probabilities.
test_dict = dict(zip(Test.Id.values.tolist(), final_test_predictions))
submission = pd.DataFrame.from_dict(test_dict, orient="index").reset_index()
submission.columns = ['Id', 'class_0', 'class_1']

submission.to_csv(r"submission.csv", index=False)

In [None]:
# Before scoring, make sure labels and out-of-fold predictions are in the same
# row order: sort both tables by Id.
final_valid_predictions   = final_valid_predictions.sort_values('Id')
Train                     = df.sort_values('Id')

# Score against the Id-sorted labels in `Train`. Using the unsorted `df` here
# would pair each prediction with another row's label.
print('balanced logarithmic loss for the baseline: '+str(balance_logloss(Train['Class'].values, final_valid_predictions[['class_0', 'class_1']].values)))

In [None]:
# Bare expression: in a notebook this renders the submission table as the cell output.
submission