## Importing Libraries

In [None]:
# Offline install of TabPFN: --no-index disables the package index and
# --find-links points pip at the wheel directory under /kaggle/input.
!pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
# Create tabpfn's models_diff directory and copy the bundled checkpoint into it
# (presumably where TabPFN looks for its weights at load time - confirm).
!mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
!cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [8]:
import time
import numpy as np
import pandas as pd
import warnings
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; used as a no-op stand-in for warnings.warn."""
    return None


# Replace warnings.warn globally so every library imported afterwards stays silent.
warnings.warn = ignore_warn
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import f1_score
import xgboost as xgboost
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier, BaggingClassifier, StackingClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.model_selection import cross_val_predict
from sklearn.impute import KNNImputer
from tabpfn import TabPFNClassifier

## Data Preprocessing

In [9]:
# Load the competition files: labelled training rows, unlabelled test rows, and
# the sample submission that is later overwritten with predictions.
train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
final = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')

In [10]:
# Integer-encode the categorical 'EJ' column.
# The encoder is fitted on the training data ONLY and then re-used for test, so
# a given category always maps to the same integer in both frames. Re-fitting on
# test (as before) can silently produce a different mapping, e.g. when test
# contains only one of the categories.
# Whole-column assignment (df[col] = ...) is used instead of df.loc[:, col] = ...
# because the latter may write in place and keep the original object dtype on
# recent pandas versions.
encoder = LabelEncoder()
train['EJ'] = encoder.fit_transform(train['EJ'])
# Raises ValueError if test contains a category never seen in train.
test['EJ'] = encoder.transform(test['EJ'])

In [11]:
# Fill missing feature values with a 10-nearest-neighbour imputer.
# Only real feature columns are given to the imputer: 'Id' is an identifier and
# 'Class' is the target. Including the target (as before) leaks the label into
# the imputed values and turns 'Class' into float64.
imputer = KNNImputer(n_neighbors=10)
feature_cols = [col for col in train.columns if col not in ('Id', 'Class')]
imputed_features = pd.DataFrame(
    imputer.fit_transform(train[feature_cols]),
    columns=feature_cols,
    index=train.index,
)
# train_no_id keeps its previous layout: every feature followed by 'Class'.
train_no_id = pd.concat([imputed_features, train['Class'].astype(int)], axis=1)
# Column order stays Id, features..., Class; CV() relies on it via iloc[:, 1:-1] / iloc[:, -1].
train = pd.concat([train['Id'], train_no_id], axis=1)
# Apply the imputer fitted on train to test so both frames share one preprocessing.
test[feature_cols] = imputer.transform(test[feature_cols])

In [12]:
# taken directly from https://www.kaggle.com/code/chensilin/icr-eda-lightgbm-xgboost-optuna/input
seed = 617
# 'Class' may be float64 after imputation; np.bincount only accepts integers,
# so cast explicitly before counting the two classes.
class_labels = train['Class'].astype(int)
zero, one = np.bincount(class_labels)
one_df = train.loc[class_labels == 1]
all_zero_df = train.loc[class_labels == 0]
# Under-sampled negatives (as many rows as there are positives). Kept for
# compatibility; oversampled_df below uses ALL negatives, not this sample.
zero_df = all_zero_df.sample(n=one, random_state=seed)
# Balance the classes by repeating every positive row four times.
# NOTE: because rows are duplicated BEFORE cross-validation, copies of the same
# positive row can land in both the training and validation part of a fold in
# CV(), so CV losses computed on oversampled_df are optimistic.
oversampled_df = pd.concat([all_zero_df, one_df, one_df, one_df, one_df])
# Shuffle the rows reproducibly.
oversampled_df = oversampled_df.sample(frac=1, random_state=seed)

In [13]:
# taken directly from https://www.kaggle.com/code/chensilin/icr-eda-lightgbm-xgboost-optuna/input
def balanced_log_loss(y_true, y_pred):
    """Class-balanced binary log loss.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : 2-D array of class probabilities; column 1 is read as P(class == 1).

    Returns
    -------
    The mean of the two per-class average negative log-likelihoods, so each
    class contributes equally regardless of how many samples it has.
    """
    is_negative = 1 - y_true
    count_negative = np.sum(is_negative)
    count_positive = np.sum(y_true)

    # Clip to keep log() finite for predictions of exactly 0 or 1.
    prob_positive = np.clip(y_pred[:, 1], 1e-15, 1-1e-15)
    prob_negative = 1 - prob_positive

    nll_negative = -np.sum(is_negative * np.log(prob_negative))
    nll_positive = -np.sum(y_true * np.log(prob_positive))

    weight_negative = 1 / count_negative
    weight_positive = 1 / count_positive
    return (weight_negative * nll_negative + weight_positive * nll_positive) / 2

In [14]:
# taken directly from https://www.kaggle.com/code/chensilin/icr-eda-lightgbm-xgboost-optuna/input
n_folds = 10
def CV(model, data, loss_function):
    """Cross-validate `model` on `data` and return the average fold loss.

    `data` is read positionally: column 0 is skipped, the last column is the
    target and everything in between is used as features. The model is re-fitted
    on each of the `n_folds` shuffled KFold splits (seeded with `seed`) and
    scored with `loss_function(y_val, predict_proba(x_val))`.
    """
    features = data.iloc[:, 1:-1]
    target = data.iloc[:, -1]
    splitter = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    fold_losses = []
    for fit_rows, holdout_rows in splitter.split(features, target):
        model.fit(features.iloc[fit_rows], target.iloc[fit_rows])
        holdout_proba = model.predict_proba(features.iloc[holdout_rows])
        fold_losses.append(loss_function(target.iloc[holdout_rows], holdout_proba))
    return np.sum(fold_losses) / n_folds

In [15]:
# Target vector and feature matrix used to fit the final models
# (the identifier and the label are not features).
y_train = oversampled_df['Class']
X_train = oversampled_df.drop(columns=['Class', 'Id'])

## EDA

## Optuna Trial Exploration

In [16]:
def xgb(trial):
    """Optuna objective for XGBoost.

    Samples a hyper-parameter set from `trial`, cross-validates an XGBClassifier
    on `oversampled_df` and returns the balanced log loss (lower is better).
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
    }
    # The library is imported as `xgboost`; the name `xgb` is this function itself,
    # so `xgb.XGBClassifier` would raise AttributeError.
    model = xgboost.XGBClassifier(**param, random_state=seed)
    return CV(model, oversampled_df, balanced_log_loss)

In [17]:
def lgbm(trial):
    """Optuna objective for LightGBM.

    Samples a hyper-parameter set from `trial`, cross-validates an LGBMClassifier
    on `oversampled_df` and returns the balanced log loss (lower is better).
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        # NOTE(review): LightGBM appears to apply `subsample` only when
        # `subsample_freq` > 0, which is not set here - confirm this has any effect.
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'num_leaves': trial.suggest_int('num_leaves', 10, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 50),
    }
    model = lgb.LGBMClassifier(**param, random_state=seed)
    return CV(model, oversampled_df, balanced_log_loss)

In [18]:
def catboost(trial):
    """Optuna objective for CatBoost.

    Samples a hyper-parameter set from `trial`, cross-validates a
    CatBoostClassifier on `oversampled_df` and returns the balanced log loss
    (lower is better).
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 150, step=10),
        # Recorded in the study as 'lambda' but passed to CatBoost as reg_lambda.
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 10),
    }
    model = CatBoostClassifier(**param, random_seed=seed, verbose=False)
    return CV(model, oversampled_df, balanced_log_loss)

In [19]:
def hgbc(trial):
    """Optuna objective for HistGradientBoostingClassifier.

    Samples a hyper-parameter set from `trial`, cross-validates the model on
    `oversampled_df` and returns the balanced log loss (lower is better).
    """
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        # `step` is passed by keyword; it is keyword-only in current Optuna.
        'max_iter': trial.suggest_int('max_iter', 50, 200, step=10),
    }
    model = HistGradientBoostingClassifier(**param, random_state=seed)
    return CV(model, oversampled_df, balanced_log_loss)

In [20]:
def bagged_dt(trial):
    """Optuna objective for a bagging ensemble of decision trees.

    Samples a hyper-parameter set from `trial`, cross-validates a
    BaggingClassifier on `oversampled_df` and returns the balanced log loss
    (lower is better).
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 600, step=20),
        'max_samples': trial.suggest_float('max_samples', 1e-2, 1.0),
        'max_features': trial.suggest_float('max_features', 1e-2, 1.0),
        'bootstrap': trial.suggest_categorical('bootstrap', [False, True]),
        'bootstrap_features': trial.suggest_categorical('bootstrap_features', [False, True]),
    }
    # The base learner is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
    # 1.4), while the first positional slot works on every version.
    model = BaggingClassifier(DecisionTreeClassifier(), **param, random_state=seed)
    return CV(model, oversampled_df, balanced_log_loss)

In [21]:
def rf(trial):
    """Optuna objective for RandomForestClassifier.

    Samples a hyper-parameter set from `trial`, cross-validates the model on
    `oversampled_df` and returns the balanced log loss (lower is better).
    """
    param = {
        # `step` is passed by keyword; it is keyword-only in current Optuna.
        'n_estimators': trial.suggest_int('n_estimators', 300, 500, step=20),
        'max_depth': trial.suggest_int('max_depth', 5, 25),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        # Upper bound 56 is hard-coded; presumably the number of feature
        # columns - verify if the feature set changes.
        'max_features': trial.suggest_int('max_features', 1, 56),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }
    model = RandomForestClassifier(**param, random_state=seed)
    return CV(model, oversampled_df, balanced_log_loss)

In [22]:
def adaboost(trial):
    """Optuna objective for AdaBoost over decision trees.

    Samples a hyper-parameter set from `trial` (including the depth of the base
    tree), cross-validates an AdaBoostClassifier on `oversampled_df` and returns
    the balanced log loss (lower is better).
    """
    # Parameters are suggested in the same order as before.
    n_estimators = trial.suggest_int('n_estimators', 400, 500, step=20)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float('learning_rate', 0.0001, 1, log=True)
    base_tree = DecisionTreeClassifier(max_depth=trial.suggest_int('max_depth', 1, 20))
    # The base learner is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
    # 1.4), while the first positional slot works on every version.
    model = AdaBoostClassifier(
        base_tree,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=seed,
    )
    return CV(model, oversampled_df, balanced_log_loss)

In [23]:
def gradient(trial):
    """Optuna objective for GradientBoostingClassifier.

    Samples a hyper-parameter set from `trial`, cross-validates the model on
    `oversampled_df` and returns the balanced log loss (lower is better).
    """
    param = {
        # NOTE(review): (500 - 10) is not a multiple of the step 20; Optuna is
        # expected to shrink the upper bound to 490 - confirm.
        'n_estimators': trial.suggest_int('n_estimators', 10, 500, step=20),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
    }
    model = GradientBoostingClassifier(**param, random_state=seed)
    return CV(model, oversampled_df, balanced_log_loss)

In [24]:
def tabpfn(trial):
    """Optuna objective for TabPFN.

    Tunes only the number of ensemble configurations, cross-validates the
    classifier on `oversampled_df` and returns the balanced log loss.
    """
    ensemble_size = trial.suggest_int('N_ensemble_configurations', 20, 70)
    classifier = TabPFNClassifier(N_ensemble_configurations=ensemble_size)
    return CV(classifier, oversampled_df, balanced_log_loss)

In [None]:
def catboost_meta(trial):
    """Optuna objective for a stacking ensemble with a CatBoost meta-learner.

    The base estimators are the module-level `cat_model`, `hgbc_model`,
    `lgbm_model` and `xgb_model`, which must be defined before this objective is
    called. Only the final estimator's hyper-parameters are tuned. Returns the
    cross-validated balanced log loss on `oversampled_df`.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 150, step=10),
        # Recorded in the study as 'lambda' but passed to CatBoost as reg_lambda.
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_lambda': trial.suggest_float('lambda', 1e-3, 0.1, log=True),
        'colsample_bylevel': trial.suggest_categorical('colsample_bylevel', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'one_hot_max_size': trial.suggest_int('one_hot_max_size', 2, 10),
    }
    model = StackingClassifier(
        estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                    ("lgbm", lgbm_model), ("xgboost", xgb_model)],
        final_estimator=CatBoostClassifier(**param, random_state=seed, verbose=False)
    )
    return CV(model, oversampled_df, balanced_log_loss)

In [None]:
def xgb_meta(trial):
    """Optuna objective for a stacking ensemble with an XGBoost meta-learner.

    The base estimators are the module-level `cat_model`, `hgbc_model`,
    `lgbm_model` and `xgb_model`, which must be defined before this objective is
    called. Only the final estimator's hyper-parameters are tuned. Returns the
    cross-validated balanced log loss on `oversampled_df`.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'lambda': trial.suggest_float('lambda', 1e-3, 0.1, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 0.1, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 50),
    }
    model = StackingClassifier(
        estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                    ("lgbm", lgbm_model), ("xgboost", xgb_model)],
        final_estimator=xgboost.XGBClassifier(**param, random_state=seed)
    )
    return CV(model, oversampled_df, balanced_log_loss)

In [None]:
def hgbc_meta(trial):
    """Optuna objective for a stacking ensemble with a HistGradientBoosting meta-learner.

    The base estimators are the module-level `cat_model`, `hgbc_model`,
    `lgbm_model` and `xgb_model`, which must be defined before this objective is
    called. Only the final estimator's hyper-parameters are tuned. Returns the
    cross-validated balanced log loss on `oversampled_df`.
    """
    param = {
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
        # `step` is passed by keyword; it is keyword-only in current Optuna.
        'max_iter': trial.suggest_int('max_iter', 50, 200, step=10),
    }
    model = StackingClassifier(
        estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                    ("lgbm", lgbm_model), ("xgboost", xgb_model)],
        final_estimator=HistGradientBoostingClassifier(**param, random_state=seed)
    )
    return CV(model, oversampled_df, balanced_log_loss)

In [None]:
def lgbm_meta(trial):
    """Optuna objective for a stacking ensemble with a LightGBM meta-learner.

    The base estimators are the module-level `cat_model`, `hgbc_model`,
    `lgbm_model` and `xgb_model`, which must be defined before this objective is
    called. Only the final estimator's hyper-parameters are tuned. Returns the
    cross-validated balanced log loss on `oversampled_df`.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=10),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 0.1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 0.1, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'num_leaves': trial.suggest_int('num_leaves', 10, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 50),
    }
    model = StackingClassifier(
        estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                    ("lgbm", lgbm_model), ("xgboost", xgb_model)],
        final_estimator=lgb.LGBMClassifier(**param, random_state=seed)
    )
    return CV(model, oversampled_df, balanced_log_loss)

In [None]:
#cv_scores = []
#for i in range(1, 70):
#    cv_scores.append(CV(TabPFNClassifier(N_ensemble_configurations=i).fit(X_train, y_train), oversampled_df, balanced_log_loss))
#    print("Finished scoring iteration: ", i)
#print(cv_scores)
# tabpfn_cv_scores = [0.09816634530404814, 0.06528742315606546, 0.07164710694330415, 0.0638588867076252, 0.06678059459477922, 
#                     0.06248073905402122, 0.06022244222278583, 0.05686026255684793, 0.05689105574304978, 0.055132598406416614, 
#                     0.05647036841993689, 0.05590923914786228, 0.05717749825124045, 0.056481389014045424, 0.05585544466795472, 
#                     0.05429705547774719, 0.05411289464998228, 0.053594943435855504, 0.05330166286267148, 0.05248949135011284, 
#                     0.05360971138750451, 0.053185855277044544, 0.05442991634568051, 0.054041848842559824, 0.05442855850281584, 
#                     0.054167697496943834, 0.05464900294529094, 0.05452444890518441, 0.05537642870820436, 0.05530201264890257, 
#                     0.05508454012654403, 0.05454951688798497, 0.055158959487388484, 0.05487638328640315, 0.05475942057273066, 
#                     0.054363820012922115, 0.05444429556509187, 0.05417317343226857, 0.05412096729914397, 0.05351279626269752, 
#                     0.05340841118461925, 0.053096957471938544, 0.053109073061545865, 0.052836328953108344, 0.05318516573142357, 
#                     0.053126580472636464, 0.05360923926329571, 0.053408860579671494, 0.05393021365358299, 0.05382474507832467, 
#                     0.05371423691544534, 0.05321754224273796, 0.053625478068834165, 0.05351067456999581, 0.053520923141210576, 
#                     0.05313957104926818, 0.053584265552239566, 0.05343326935461631, 0.05377785617124854, 0.05370929324862032, 
#                     0.053595077061113806, 0.053302678221229506, 0.053333914849297705, 0.053023531220070495, 0.0529506407276686, 
#                     0.052713977186699276, 0.05300799494093289, 0.05295979717175845, 0.05318406292878591]

In [None]:
# start = time.time()

# study1 = optuna.create_study(direction='minimize')
# n_trials = 75
# study1.optimize(tabpfn, n_trials=n_trials)
# print('Best trial:', study1.best_trial.params)
# print('Best values:', study1.best_value)

# end = time.time()
# print('It has taken {:.5f} seconds to search for the best Hyperparameter'.format(end-start))

## Creating Models

In [None]:
# CV(TabPFNClassifier(N_ensemble_configurations=24).fit(X_train, y_train), oversampled_df, balanced_log_loss)
# # for each test performance on 1s and 0s

In [None]:
# TODO: add a way to do rule-based and adaptive ensembling.
# TODO: review how other notebooks approached ensembling.

In [None]:
# XGBoost classifier with fixed hyper-parameters, fitted on the oversampled training set.
xgb_model = xgboost.XGBClassifier(
    n_estimators=200,
    learning_rate=0.09975673376458177,
    max_depth=11,
    min_child_weight=2,
    subsample=1.0,
    colsample_bytree=0.3,
    reg_lambda=0.15325900166549988,
    reg_alpha=0.0044700650580230235,
    random_state=seed,
).fit(X_train, y_train)

In [None]:
# LightGBM classifier with fixed hyper-parameters, fitted on the oversampled training set.
lgbm_model = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.09367295744238123,
    max_depth=11,
    num_leaves=50,
    min_child_samples=26,
    subsample=0.4,
    colsample_bytree=0.3,
    reg_alpha=0.0016725623110267532,
    reg_lambda=0.0038043774323061946,
    random_state=seed,
).fit(X_train, y_train)

In [None]:
# CatBoost classifier with fixed hyper-parameters, fitted on the oversampled training set.
cat_model = CatBoostClassifier(
    n_estimators=70,
    learning_rate=0.0865881098465479,
    max_depth=9,
    subsample=0.7,
    colsample_bylevel=0.3,
    reg_lambda=0.01606738047167,
    one_hot_max_size=10,
    random_state=seed,
    verbose=False,
).fit(X_train, y_train)

In [None]:
# Histogram-based gradient boosting classifier with fixed hyper-parameters,
# fitted on the oversampled training set.
hgbc_model = HistGradientBoostingClassifier(
    learning_rate=0.17193627413211837,
    max_iter=170,
    max_depth=4,
    min_samples_leaf=9,
    random_state=seed,
).fit(X_train, y_train)

In [None]:
# Bagging ensemble of decision trees with fixed hyper-parameters, fitted on the
# oversampled training set.
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# while the first positional slot works on every version.
dt_model = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.994711990802652,
    max_features=0.7844039008030275,
    bootstrap=False,
    bootstrap_features=True,
    random_state=seed,
).fit(X_train, y_train)

In [None]:
# Random forest with fixed hyper-parameters, fitted on the oversampled training set.
rf_model = RandomForestClassifier(
    n_estimators=420,
    max_depth=13,
    max_features=9,
    min_samples_split=3,
    min_samples_leaf=1,
    bootstrap=False,
    random_state=seed,
).fit(X_train, y_train)

In [None]:
# AdaBoost over depth-4 decision trees with fixed hyper-parameters, fitted on
# the oversampled training set.
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4),
# while the first positional slot works on every version.
ada_model = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=4),
    n_estimators=420,
    learning_rate=0.6794771047521856,
    random_state=seed,
).fit(X_train, y_train)

In [None]:
# Gradient boosting classifier with fixed hyper-parameters, fitted on the
# oversampled training set.
gradient_model = GradientBoostingClassifier(
    n_estimators=170,
    learning_rate=0.05893342084840253,
    max_depth=6,
    min_samples_split=16,
    min_samples_leaf=11,
    subsample=0.9112739951878389,
    random_state=seed,
).fit(X_train, y_train)

In [None]:
# Stacking ensemble with a CatBoost meta-learner.
# These hyper-parameters fall inside the ranges searched by catboost_meta(),
# which tunes the FINAL estimator of a StackingClassifier. Previously they were
# given to a plain CatBoostClassifier trained on the raw features, so no
# stacking took place. The ensemble is built exactly as in catboost_meta().
stacking_cat_model = StackingClassifier(
    estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                ("lgbm", lgbm_model), ("xgboost", xgb_model)],
    final_estimator=CatBoostClassifier(
        n_estimators=150,
        reg_lambda=0.05055956136270572,
        colsample_bylevel=0.6,
        subsample=0.5,
        learning_rate=0.08699165501504001,
        max_depth=7,
        one_hot_max_size=8,
        random_state=seed,
        verbose=False,
    ),
).fit(X_train, y_train)

In [None]:
# Stacking ensemble with an XGBoost meta-learner.
# These hyper-parameters fall inside the ranges searched by xgb_meta(), which
# tunes the FINAL estimator of a StackingClassifier. Previously they were given
# to a plain XGBClassifier trained on the raw features, so no stacking took
# place. The ensemble is built exactly as in xgb_meta().
# `alpha` is spelled `reg_alpha` here to match the other XGBoost model cell.
stacking_xgb_model = StackingClassifier(
    estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                ("lgbm", lgbm_model), ("xgboost", xgb_model)],
    final_estimator=xgboost.XGBClassifier(
        n_estimators=160,
        reg_lambda=0.030554982480056614,
        reg_alpha=0.022993963306149747,
        colsample_bytree=0.4,
        subsample=0.6,
        learning_rate=0.08378145372235492,
        max_depth=17,
        min_child_weight=1,
        random_state=seed,
    ),
).fit(X_train, y_train)

In [None]:
# Stacking ensemble with a LightGBM meta-learner.
# These hyper-parameters fall inside the ranges searched by lgbm_meta(), which
# tunes the FINAL estimator of a StackingClassifier. Previously they were given
# to a plain LGBMClassifier trained on the raw features, so no stacking took
# place. The ensemble is built exactly as in lgbm_meta().
stacking_lgbm_model = StackingClassifier(
    estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                ("lgbm", lgbm_model), ("xgboost", xgb_model)],
    final_estimator=lgb.LGBMClassifier(
        n_estimators=130,
        reg_alpha=0.017987440901161444,
        reg_lambda=0.0010110144342120994,
        colsample_bytree=0.8,
        subsample=0.5,
        learning_rate=0.08786840365732179,
        max_depth=5,
        num_leaves=10,
        min_child_samples=48,
        random_state=seed,
    ),
).fit(X_train, y_train)

In [None]:
# Stacking ensemble with a HistGradientBoosting meta-learner.
# These hyper-parameters fall inside the ranges searched by hgbc_meta(), which
# tunes the FINAL estimator of a StackingClassifier. Previously they were given
# to a plain HistGradientBoostingClassifier trained on the raw features, so no
# stacking took place. The ensemble is built exactly as in hgbc_meta().
stacking_hgbc_model = StackingClassifier(
    estimators=[("catboost", cat_model), ("hist_gradient_boosting", hgbc_model),
                ("lgbm", lgbm_model), ("xgboost", xgb_model)],
    final_estimator=HistGradientBoostingClassifier(
        learning_rate=0.9112141545526848,
        max_depth=5,
        min_samples_leaf=18,
        max_iter=160,
        random_state=seed,
    ),
).fit(X_train, y_train)

## Submission

In [None]:
def round_up_down(num):
    """Threshold a probability at 0.5: return 1 when `num` >= 0.5, otherwise 0."""
    if num >= 0.5:
        return 1
    return 0

In [None]:
#final.iloc[:, 1], final.iloc[:, -1] = stacking_cat_model.predict_proba(test.iloc[:, 1:])[:, 0], stacking_cat_model.predict_proba(test.iloc[:, 1:])[:, 1]
# Blend the fitted LightGBM and XGBoost models by averaging their predicted
# class probabilities. `lgbm` and `xgb` are the Optuna objective FUNCTIONS
# defined earlier, so indexing them (lgbm[:, 0]) raises TypeError; predictions
# must come from the fitted `lgbm_model` / `xgb_model`.
# Select test columns by the training feature names so the column order matches
# what the models were fitted on, and cast to float like the training features.
test_features = test[X_train.columns].astype(float)
lgbm_proba = lgbm_model.predict_proba(test_features)
xgb_proba = xgb_model.predict_proba(test_features)
# Column 1 of `final` receives P(class 0); the last column receives P(class 1).
final.iloc[:, 1] = (lgbm_proba[:, 0] + xgb_proba[:, 0]) / 2
final.iloc[:, -1] = (lgbm_proba[:, 1] + xgb_proba[:, 1]) / 2
final.to_csv('submission.csv', index=False)
# Read the file back to display exactly what was written.
submission = pd.read_csv('submission.csv')
submission