In [1]:
def objective(trial, model_type, X, features, score_col, index_col, cv, sample_weights=False):
    """Optuna objective: draw hyperparameters for one model family and score them.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters.
    model_type : str
        One of ``'xgboost'``, ``'lightgbm'`` or ``'catboost'``.
    X, features, score_col, index_col, cv
        Forwarded unchanged to ``n_cross_validate``.
    sample_weights : bool, default False
        Forwarded to ``n_cross_validate``.

    Returns
    -------
    The first element of the tuple returned by ``n_cross_validate``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported families.
    """
    # Parameter space to explore if model is xgboost
    if model_type == 'xgboost':
        params = {
            'objective': trial.suggest_categorical('objective', ['reg:tweedie', 'reg:pseudohubererror']),
            'random_state': SEED,
            'num_parallel_tree': trial.suggest_int('num_parallel_tree', 2, 30),
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 2, 4),
            # suggest_float(..., log=True) is the non-deprecated spelling of suggest_loguniform
            'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.05, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 0.8),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.8),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1e-1, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 1e-1, log=True),
        }
        # The variance power is only sampled when the tweedie objective was drawn
        if params['objective'] == 'reg:tweedie':
            params['tweedie_variance_power'] = trial.suggest_float('tweedie_variance_power', 1, 2)
        # `use_label_encoder` is a classifier-only option, so it is not passed to the regressor
        model = XGBRegressor(**params)

    # Parameter space to explore if model is lightgbm
    elif model_type == 'lightgbm':
        params = {
            'objective': trial.suggest_categorical('objective', ['poisson', 'tweedie', 'regression']),
            'random_state': SEED,
            'verbosity': -1,
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'max_depth': trial.suggest_int('max_depth', 2, 4),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 0.8),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.8),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 100)
        }
        if params['objective'] == 'tweedie':
            params['tweedie_variance_power'] = trial.suggest_float('tweedie_variance_power', 1, 2)
        model = LGBMRegressor(**params)

    # Parameter space to explore if model is catboost
    elif model_type == 'catboost':
        params = {
            # Recorded under the trial name 'objective', so that is the key that
            # shows up in the study's best params for this family.
            'loss_function': trial.suggest_categorical('objective', ['Tweedie:variance_power=1.5',
                                                                     'Poisson', 'RMSE']),
            'random_state': SEED,
            'iterations': trial.suggest_int('iterations', 100, 300),
            'depth': trial.suggest_int('depth', 2, 4),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.05, log=True),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 1e-1, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 0.7),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
            'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 60),
        }
        model = CatBoostRegressor(**params, verbose=0)

    else:
        raise ValueError(f"Unsupported model_type: {model_type}")

    # 20 seeds drawn from the global `random` state, so they differ between
    # trials unless that state is seeded by the caller.
    seeds = [random.randint(1, 10000) for _ in range(20)]

    # Honour the caller's `sample_weights` flag (it used to be forced to True here)
    score, _ = n_cross_validate(model, X, features, score_col, index_col, cv, seeds,
                                sample_weights=sample_weights, verbose=True)

    return score

def run_optimization(X, features, score_col, index_col, model_type, n_trials=30, cv=None, sample_weights=False):
    """Run an Optuna study for one model family and return its best parameters.

    A maximizing study is created and `objective` is evaluated `n_trials`
    times with the given data arguments. The best parameters and the best
    value are printed, and the best parameters are returned.
    """
    def _evaluate(trial):
        # Bind the fixed data/config arguments; Optuna only supplies the trial.
        return objective(trial, model_type, X, features, score_col, index_col, cv, sample_weights)

    study = optuna.create_study(direction="maximize")
    study.optimize(_evaluate, n_trials=n_trials)

    best = study.best_params
    print(f"Best params for {model_type}: {best}")
    print(f"Best score: {study.best_value}")
    return best

In [None]:
# Feature names removed from the candidate set before modelling
exclude = [
    "PC_9",
    "PC_12",
    "Fitness_Endurance-Max_Stage",
    "Basic_Demos-Sex",
    'BMI_mean_norm',
    "PC_11",
    "PC_8",
    "FGC_Zones_min",
    'Physical-Systolic_BP',
    "PC_4",
    "BIA-BIA_FMI",
    "BIA-BIA_LST",
    "Physical-Diastolic_BP",
    'BIA-BIA_ECW',
    'Fitness_Endurance-Time_Mins',
    'PAQ_C-PAQ_C_Total',
    'PC_10',
    'BIA-BIA_Fat',
    'FFM_norm',
    'PC_14',
    'PC_7',
]

# Keep the original ordering of `features`, dropping anything listed above
reduced_features = list(filter(lambda name: name not in exclude, features))

# All three boosting families share the very same list object
lgb_features = xgb_features = cat_features = reduced_features
print(len(reduced_features))

In [2]:
# Fixed hyperparameters for LightGBM, the two XGBoost variants, CatBoost and ExtraTrees
lgb_params = dict(
    objective='poisson',
    n_estimators=295,
    max_depth=4,
    learning_rate=0.04505693066482616,
    subsample=0.6042489155604022,
    colsample_bytree=0.5021876720502726,
    min_data_in_leaf=100,
)

xgb_params = dict(
    objective='reg:tweedie',
    num_parallel_tree=12,
    n_estimators=236,
    max_depth=3,
    learning_rate=0.04223740904479563,
    subsample=0.7157264603586825,
    colsample_bytree=0.7897918901977528,
    reg_alpha=0.005335705058190553,
    reg_lambda=0.0001897435318347022,
    tweedie_variance_power=1.1393958601390142,
)

xgb_params_2 = dict(
    objective='reg:tweedie',
    num_parallel_tree=18,
    n_estimators=175,
    max_depth=3,
    learning_rate=0.032620453423049305,
    subsample=0.6155579670568023,
    colsample_bytree=0.5988773292417443,
    reg_alpha=0.0028895066837627205,
    reg_lambda=0.002232531512636924,
    tweedie_variance_power=1.1708678482038286,
)

cat_params = dict(
    objective='RMSE',
    iterations=238,
    depth=4,
    learning_rate=0.044523361750173816,
    l2_leaf_reg=0.09301285673435761,
    subsample=0.6902492783438681,
    bagging_temperature=0.3007304771330199,
    random_strength=3.562201626987314,
    min_data_in_leaf=60,
)

xtrees_params = dict(
    n_estimators=500,
    max_depth=15,
    min_samples_leaf=20,
    bootstrap=False,
)

In [None]:
# Stratified, shuffled K-fold splitter seeded with SEED
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# When tuning is switched on, overwrite lgb_params / xgb_params / cat_params
# with the best parameters found by an Optuna study for each family.
if optimize_params:
    # LightGBM
    lgb_params = run_optimization(
        X=train,
        features=lgb_features,
        score_col='PCIAT-PCIAT_Total',
        index_col='sii',
        model_type='lightgbm',
        n_trials=n_trials,
        cv=kf,
        sample_weights=True,
    )

    # XGBoost
    xgb_params = run_optimization(
        X=train,
        features=xgb_features,
        score_col='PCIAT-PCIAT_Total',
        index_col='sii',
        model_type='xgboost',
        n_trials=n_trials,
        cv=kf,
        sample_weights=True,
    )

    # CatBoost
    cat_params = run_optimization(
        X=train,
        features=cat_features,
        score_col='PCIAT-PCIAT_Total',
        index_col='sii',
        model_type='catboost',
        n_trials=n_trials,
        cv=kf,
        sample_weights=True,
    )

In [None]:
# Define models
lgb_model = LGBMRegressor(**lgb_params, random_state=SEED, verbosity=-1)
xgb_model = XGBRegressor(**xgb_params, random_state=SEED, verbosity=0)
xgb_model_2 = XGBRegressor(**xgb_params_2, random_state=SEED, verbosity=0)
cat_model = CatBoostRegressor(**cat_params, random_state=SEED, verbose=0)
xtrees_model = ExtraTreesRegressor(**xtrees_params, random_state=SEED)

# Per-row sample weights derived from the regression target, used for the final fits
weights = calculate_weights(train['PCIAT-PCIAT_Total'])


def _cv_fit_predict(model, feature_cols, target_col='PCIAT-PCIAT_Total', index_col='sii'):
    """Cross-validate `model`, refit it on all of `train`, then predict `test`.

    Uses the notebook-level `train`, `test`, `kf` and `weights` objects.

    Returns
    -------
    tuple
        ``(score, oof, thresholds, test_predictions)`` where the first three
        items are exactly what ``cross_validate`` returned.
    """
    score, oof, thresholds = cross_validate(
        model, train, feature_cols, target_col, index_col, kf, verbose=True, sample_weights=True
    )
    # Final fit on the full training frame with the precomputed sample weights
    model.fit(train[feature_cols], train[target_col], sample_weight=weights)
    return score, oof, thresholds, model.predict(test[feature_cols])


# Cross-validate, refit and predict with the LGBM model
score_lgb, oof_lgb, lgb_thresholds, test_lgb = _cv_fit_predict(lgb_model, lgb_features)

# Cross-validate, refit and predict with the XGBoost model
score_xgb, oof_xgb, xgb_thresholds, test_xgb = _cv_fit_predict(xgb_model, xgb_features)

# Cross-validate, refit and predict with XGBoost model 2
score_xgb_2, oof_xgb_2, xgb_2_thresholds, test_xgb_2 = _cv_fit_predict(xgb_model_2, xgb_features)

# Cross-validate, refit and predict with the CatBoost model
score_cat, oof_cat, cat_thresholds, test_cat = _cv_fit_predict(cat_model, cat_features)

# Cross-validate, refit and predict with the ExtraTreesRegressor model
score_xtrees, oof_xtrees, xtrees_thresholds, test_xtrees = _cv_fit_predict(xtrees_model, reduced_features)

# Print overall mean Kappa score for all five models (the second XGBoost model was previously left out)
print(f'Overall Mean Kappa: {np.mean([score_lgb, score_xgb, score_xgb_2, score_cat, score_xtrees])}') # Ensemble score likely higher