In [1]:
import time
import numpy as np
import pandas as pd
import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Swap the module-level hook so every later ``warnings.warn(...)`` call is discarded.
warnings.warn = ignore_warn
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import f1_score
import xgboost as xgboost
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import StackingClassifier, VotingClassifier

In [2]:
# Load the three competition files in one pass: the training table, the test table,
# and the sample submission that is later filled in and written out as the submission.
train, test, final = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/icr-identify-age-related-conditions/train.csv',
        '/kaggle/input/icr-identify-age-related-conditions/test.csv',
        '/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv',
    )
)

In [3]:
# Encode the categorical 'EJ' column as integers.
# The mapping is learned from the training data only and then reused for the test
# data, so a given category always receives the same code in both frames.
# (Re-fitting on the test set could assign different codes, e.g. when the test set
# contains only a subset of the categories.)
# `encoder.transform` raises ValueError if the test set holds a category that was
# never seen in training.
encoder = LabelEncoder()
# Plain column assignment replaces the column, so it takes the integer dtype of the
# encoded values; `df.loc[:, 'EJ'] = ...` writes in place on recent pandas and
# would leave the column with `object` dtype.
train['EJ'] = encoder.fit_transform(train['EJ'])
test['EJ'] = encoder.transform(test['EJ'])

In [4]:
# Class re-balancing, taken directly from
# https://www.kaggle.com/code/chensilin/icr-eda-lightgbm-xgboost-optuna/input
seed = 617

# Row counts per class; unpacking into two names only works when 'Class' holds
# exactly the labels 0 and 1.
zero, one = np.bincount(train.loc[:, 'Class'])

# Positional boolean selectors for each class.
is_one = (train.loc[:, 'Class'] == 1).tolist()
is_zero = (train.loc[:, 'Class'] == 0).tolist()

one_df = train.iloc[is_one, :]
# Random subset of class-0 rows, as many as there are class-1 rows.
# It is not used by the concatenation below, which takes every class-0 row.
zero_df = train.iloc[is_zero, :].sample(n=one, random_state=seed)

# All class-0 rows plus four copies of the class-1 rows, then shuffled.
# NOTE: the duplicated class-1 rows mean that identical rows can land in both the
# training and the validation part of any later split of this frame.
oversampled_df = pd.concat([train.iloc[is_zero, :]] + [one_df] * 4)
oversampled_df = oversampled_df.sample(frac=1, random_state=seed)

In [5]:
# Adapted from https://www.kaggle.com/code/chensilin/icr-eda-lightgbm-xgboost-optuna/input
def balanced_log_loss(y_true, y_pred):
    """Binary log loss in which both classes carry the same weight.

    The mean negative log-likelihood is computed separately for the rows of
    class 0 and the rows of class 1, and the two per-class means are averaged.

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Ground-truth labels, each 0 or 1 (list, ndarray or pandas Series).
    y_pred : array-like of shape (n, 2) or (n,)
        Predicted probabilities. For 2-D input, column 1 is used as the
        probability of class 1 (the layout returned by ``predict_proba``).
        1-D input is taken to be the probability of class 1 directly.

    Returns
    -------
    numpy.float64
        The balanced log loss. If only one class occurs in ``y_true``, the loss
        of that class alone is returned instead of NaN.

    Raises
    ------
    ValueError
        If ``y_true`` is empty, if ``y_pred`` is neither 1-D nor 2-D, or if the
        two inputs do not describe the same number of rows.
    """
    labels = np.asarray(y_true, dtype=float).ravel()
    probs = np.asarray(y_pred, dtype=float)
    if probs.ndim == 2:
        probs = probs[:, 1]
    elif probs.ndim != 1:
        raise ValueError("y_pred must be 1-D or 2-D")
    if labels.shape[0] != probs.shape[0]:
        raise ValueError("y_true and y_pred must have the same number of rows")

    # Clip so that log() never sees exactly 0 or 1.
    p_1 = np.clip(probs, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1

    n_1 = labels.sum()
    n_0 = labels.size - n_1

    # One mean log loss per class that is actually present; an absent class is
    # skipped, which avoids the 0 * inf = NaN a plain 1 / N weight would produce.
    per_class = []
    if n_0 > 0:
        per_class.append(-np.sum((1 - labels) * np.log(p_0)) / n_0)
    if n_1 > 0:
        per_class.append(-np.sum(labels * np.log(p_1)) / n_1)
    if not per_class:
        raise ValueError("y_true is empty")
    return np.mean(per_class)

In [6]:
# Adapted from https://www.kaggle.com/code/chensilin/icr-eda-lightgbm-xgboost-optuna/input
n_folds = 10


def CV(model, data, loss_function, n_splits=None):
    """Return the mean validation loss of ``model`` over a stratified K-fold split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``. It is re-fitted
        in place on every fold.
    data : pandas.DataFrame
        The first column is skipped, the last column is used as the target and
        everything in between is used as features.
    loss_function : callable
        Called as ``loss_function(y_val, predicted_probabilities)`` on each fold.
    n_splits : int, optional
        Number of folds. Defaults to the module-level ``n_folds``.

    Returns
    -------
    numpy.float64
        Average of the per-fold losses.

    Notes
    -----
    Folds are stratified on the target so that every validation fold contains
    both classes in roughly the overall proportion. The objectives in this
    notebook pass ``oversampled_df``, which contains duplicated class-1 rows,
    so copies of one row can appear on both sides of a split and the reported
    loss is therefore optimistic.
    """
    if n_splits is None:
        n_splits = n_folds

    features = data.iloc[:, 1:-1]
    target = data.iloc[:, -1]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    losses = []
    for train_id, val_id in skf.split(features, target):
        model.fit(features.iloc[train_id], target.iloc[train_id])
        pred_val = model.predict_proba(features.iloc[val_id])
        losses.append(loss_function(target.iloc[val_id], pred_val))
    return np.mean(losses)

In [7]:
# Target and feature matrix for the final model fits, both taken from the
# oversampled frame: the label column on one side, everything except the label
# and the 'Id' column on the other.
y_train = oversampled_df.loc[:, 'Class']
X_train = oversampled_df.drop(['Class', 'Id'], axis=1)

In [8]:
def xgb(trial):
    """Optuna objective: cross-validated balanced log loss of an XGBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyperparameters are sampled.

    Returns
    -------
    float
        Mean balanced log loss returned by ``CV`` on ``oversampled_df`` (lower is better).

    Notes
    -----
    A later cell rebinds the name ``xgb`` to an array of predictions, so this
    objective has to be handed to Optuna before that cell is executed.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=10),
        # The trial names stay 'lambda' / 'alpha'; the values are passed under the
        # scikit-learn style keywords that the final model cell also uses.
        'reg_lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'reg_alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
    }
    # Inside this function `xgb` is the function itself, not the library; the
    # library is imported under the name `xgboost`.
    model = xgboost.XGBClassifier(**param, random_state=seed)
    score = CV(model, oversampled_df, balanced_log_loss)
    return score

In [9]:
def lgbm(trial):
    """Optuna objective: cross-validated balanced log loss of a LightGBM classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyperparameters are sampled.

    Returns
    -------
    float
        Mean balanced log loss returned by ``CV`` on ``oversampled_df`` (lower is better).

    Notes
    -----
    A later cell rebinds the name ``lgbm`` to an array of predictions, so this
    objective has to be handed to Optuna before that cell is executed.
    """
    param = {
        # `step` is passed by keyword and log-scale sampling goes through
        # `suggest_float(..., log=True)`; the positional step and
        # `suggest_loguniform` are deprecated in current Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
        # which is not set here - confirm whether row bagging is intended.
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'num_leaves': trial.suggest_int('num_leaves', 10, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 50),
    }
    model = lgb.LGBMClassifier(**param, random_state=seed)
    score = CV(model, oversampled_df, balanced_log_loss)
    return score

In [10]:
def catboost(trial):
    """Optuna objective: cross-validated balanced log loss of a CatBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyperparameters are sampled.

    Returns
    -------
    float
        Mean balanced log loss returned by ``CV`` on ``oversampled_df`` (lower is better).
    """
    param = {
        # `step` is passed by keyword and log-scale sampling goes through
        # `suggest_float(..., log=True)`; the positional step and
        # `suggest_loguniform` are deprecated in current Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 50, 150, step=10),
        # The trial parameter is recorded as 'lambda' but passed on as `reg_lambda`.
        'reg_lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 10),
    }
    model = CatBoostClassifier(**param, random_seed=seed, verbose=False)
    score = CV(model, oversampled_df, balanced_log_loss)
    return score

In [11]:
def hgbc(trial):
    """Optuna objective: cross-validated balanced log loss of a HistGradientBoostingClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the hyperparameters are sampled.

    Returns
    -------
    float
        Mean balanced log loss returned by ``CV`` on ``oversampled_df`` (lower is better).

    Notes
    -----
    A later cell rebinds the name ``hgbc`` to an array of predictions, so this
    objective has to be handed to Optuna before that cell is executed.
    """
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        # `step` is passed by keyword; the positional form is deprecated in current Optuna.
        'max_iter': trial.suggest_int('max_iter', 50, 200, step=10),
    }
    model = HistGradientBoostingClassifier(**param, random_state=seed)
    score = CV(model, oversampled_df, balanced_log_loss)
    return score

In [12]:
#start = time.time()

#study1 = optuna.create_study(direction='minimize', study_name="CatBoost Stacking")
#n_trials = 25
#study1.optimize(catboost_meta, n_trials=n_trials)
#print('Best trial:', study1.best_trial.params)
#print('Best values:', study1.best_value)

#end = time.time()
#print('It has taken {:.5f} seconds to search for the best Hyperparameter'.format(end-start))

In [13]:
# XGBoost model with fixed hyperparameters, fitted on the oversampled training set.
xgb_model = xgboost.XGBClassifier(
    n_estimators=200,
    max_depth=11,
    min_child_weight=2,
    learning_rate=0.09975673376458177,
    subsample=1.0,
    colsample_bytree=0.3,
    reg_lambda=0.15325900166549988,
    reg_alpha=0.0044700650580230235,
    random_state=seed,
)
xgb_model.fit(X_train, y_train)

# Class probabilities for the test rows; the first test column is left out.
# NOTE: this rebinds `xgb`, which an earlier cell defined as the Optuna objective function.
xgb = xgb_model.predict_proba(test.iloc[:, 1:])

In [14]:
# LightGBM model with fixed hyperparameters, fitted on the oversampled training set.
lgbm_model = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=11,
    num_leaves=50,
    min_child_samples=26,
    learning_rate=0.09367295744238123,
    subsample=0.4,
    colsample_bytree=0.3,
    reg_alpha=0.0016725623110267532,
    reg_lambda=0.0038043774323061946,
    random_state=seed,
)
lgbm_model.fit(X_train, y_train)

# Class probabilities for the test rows; the first test column is left out.
# NOTE: this rebinds `lgbm`, which an earlier cell defined as the Optuna objective function.
lgbm = lgbm_model.predict_proba(test.iloc[:, 1:])

In [15]:
# CatBoost model with fixed hyperparameters, fitted on the oversampled training set.
cat_model = CatBoostClassifier(
    n_estimators=70,
    max_depth=9,
    learning_rate=0.0865881098465479,
    subsample=0.7,
    colsample_bylevel=0.3,
    reg_lambda=0.01606738047167,
    one_hot_max_size=10,
    random_state=seed,
    verbose=False,
)
cat_model.fit(X_train, y_train)

# Class probabilities for the test rows; the first test column is left out.
cat = cat_model.predict_proba(test.iloc[:, 1:])

In [16]:
# Histogram gradient boosting model with fixed hyperparameters, fitted on the
# oversampled training set.
hgbc_model = HistGradientBoostingClassifier(
    learning_rate=0.17193627413211837,
    max_iter=170,
    max_depth=4,
    min_samples_leaf=9,
    random_state=seed,
)
hgbc_model.fit(X_train, y_train)

# Class probabilities for the test rows; the first test column is left out.
# NOTE: this rebinds `hgbc`, which an earlier cell defined as the Optuna objective function.
hgbc = hgbc_model.predict_proba(test.iloc[:, 1:])

In [17]:
def catboost_meta(trial):
    """Optuna objective: stacking ensemble with a CatBoost final estimator.

    The four base models (CatBoost, HistGradientBoosting, LightGBM, XGBoost) are
    stacked and only the final estimator's hyperparameters are sampled.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the final estimator's hyperparameters are sampled.

    Returns
    -------
    float
        Mean balanced log loss returned by ``CV`` on ``oversampled_df`` (lower is better).
    """
    param = {
        # `step` is passed by keyword and log-scale sampling goes through
        # `suggest_float(..., log=True)`; the positional step and
        # `suggest_loguniform` are deprecated in current Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 50, 150, step=10),
        # The trial parameter is recorded as 'lambda' but passed on as `reg_lambda`.
        'reg_lambda': trial.suggest_float('lambda', 1e-3, 0.1, log=True),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 10),
    }
    model = StackingClassifier(
        estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                    ("lgbm", lgbm_model), ("xgboost", xgb_model)],
        final_estimator=CatBoostClassifier(**param, random_state=seed, verbose=False)
    )
    score = CV(model, oversampled_df, balanced_log_loss)
    return score

In [18]:
def xgb_meta(trial):
    """Optuna objective: stacking ensemble with an XGBoost final estimator.

    The four base models (CatBoost, HistGradientBoosting, LightGBM, XGBoost) are
    stacked and only the final estimator's hyperparameters are sampled.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the final estimator's hyperparameters are sampled.

    Returns
    -------
    float
        Mean balanced log loss returned by ``CV`` on ``oversampled_df`` (lower is better).
    """
    param = {
        # `step` is passed by keyword and log-scale sampling goes through
        # `suggest_float(..., log=True)`; the positional step and
        # `suggest_loguniform` are deprecated in current Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=10),
        # The trial names stay 'lambda' / 'alpha'; the values are passed under the
        # scikit-learn style keywords that the base XGBoost model cell also uses.
        'reg_lambda': trial.suggest_float('lambda', 1e-3, 0.1, log=True),
        'reg_alpha': trial.suggest_float('alpha', 1e-3, 0.1, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
    }
    model = StackingClassifier(
        estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                    ("lgbm", lgbm_model), ("xgboost", xgb_model)],
        # XGBoost's logging switch is `verbosity` (0 = silent); `verbose` is not
        # one of its parameters.
        final_estimator=xgboost.XGBClassifier(**param, random_state=seed, verbosity=0)
    )
    score = CV(model, oversampled_df, balanced_log_loss)
    return score

In [19]:
def hgbc_meta(trial):
    """Optuna objective: stacking ensemble with a HistGradientBoosting final estimator.

    The four base models (CatBoost, HistGradientBoosting, LightGBM, XGBoost) are
    stacked and only the final estimator's hyperparameters are sampled.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the final estimator's hyperparameters are sampled.

    Returns
    -------
    float
        Mean balanced log loss returned by ``CV`` on ``oversampled_df`` (lower is better).
    """
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        # `step` is passed by keyword; the positional form is deprecated in current Optuna.
        'max_iter': trial.suggest_int('max_iter', 50, 200, step=10),
    }
    model = StackingClassifier(
        estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                    ("lgbm", lgbm_model), ("xgboost", xgb_model)],
        final_estimator=HistGradientBoostingClassifier(**param, random_state=seed, verbose=False)
    )
    score = CV(model, oversampled_df, balanced_log_loss)
    return score

In [20]:
def lgbm_meta(trial):
    """Optuna objective: stacking ensemble with a LightGBM final estimator.

    The four base models (CatBoost, HistGradientBoosting, LightGBM, XGBoost) are
    stacked and only the final estimator's hyperparameters are sampled.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial from which the final estimator's hyperparameters are sampled.

    Returns
    -------
    float
        Mean balanced log loss returned by ``CV`` on ``oversampled_df`` (lower is better).
    """
    param = {
        # `step` is passed by keyword and log-scale sampling goes through
        # `suggest_float(..., log=True)`; the positional step and
        # `suggest_loguniform` are deprecated in current Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 0.1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 0.1, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
        # which is not set here - confirm whether row bagging is intended.
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'num_leaves': trial.suggest_int('num_leaves', 10, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 50),
    }
    model = StackingClassifier(
        estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                    ("lgbm", lgbm_model), ("xgboost", xgb_model)],
        # LightGBM's `verbose` is an integer level; -1 silences info and warning output.
        final_estimator=lgb.LGBMClassifier(**param, random_state=seed, verbose=-1)
    )
    score = CV(model, oversampled_df, balanced_log_loss)
    return score

In [21]:
# Blend the four models with an unweighted average of their predicted probabilities.
blend = (xgb + lgbm + hgbc + cat) / 4

# Probability column 0 goes into the submission column at position 1, probability
# column 1 into the last submission column (class_0 / class_1 in the output below).
final.iloc[:, 1] = blend[:, 0]
final.iloc[:, -1] = blend[:, 1]
final.to_csv('submission.csv', index=False)

# Read the file back so the frame displayed is exactly what was written to disk.
submission = pd.read_csv('submission.csv')
submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.964072,0.035928
1,010ebe33f668,0.964072,0.035928
2,02fa521e1838,0.964072,0.035928
3,040e15f562a2,0.964072,0.035928
4,046e85c7cc7f,0.964072,0.035928
