In [14]:
import pandas as pd
import lightgbm as lgb
from catboost import CatBoostRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import numpy as np
import optuna
from sklearn.model_selection import KFold

In [15]:
data=pd.read_csv('datafinal.csv')
data=data.drop(['t2mshan', 'sh850ct', 'sh700ct', 'u700ct', 'vo850ct', 'sktshan', 't2msib', 'u850ct', 'stct', 'rh700ct', 'sktsib', 'dz850sib', 'sstct', 'mslpsib', 'rh925ct', 'rh850ct', 'z850sib', 'sst', 'vo925ct', 'rh850', 'spsib', 'st', 'u925ct', 'u925', 'sh925ct', 'dmslpsib', 'dspsib', 'sh850', 'dspct', 'dz925sib', 'sh700', 'dmslpct', 'z925sib'],axis=1)
data.set_index('date', inplace = True)

In [16]:
train_size = int(0.8 * len(data))

y_train = data['cens'][:train_size]
X_train = data.drop('cens',axis=1)[:train_size]
y_test = data['cens'][train_size:]
X_test = data.drop('cens',axis=1)[train_size:]

target = data['cens']
feature = data.drop('cens',axis=1)

print("Ukuran data pelatihan yang telah diacak:", X_train.shape)

Ukuran data pelatihan yang telah diacak: (38185, 41)


Define HyperParameter Search and define optuna objective function

In [17]:
xgb_params = {
    'n_estimators': (100, 5000),
    'max_depth': (1, 10),
    'learning_rate': (0.0001, 0.1),
    'subsample': (0.5, 1),
    'reg_alpha': (0, 1),
    'reg_lambda': (0, 1),
    'gamma': (0, 1),
    'min_child_weight': (1, 10)
}

catboost_params = {
    'n_estimators': (100, 5000),
    'max_depth': (1, 10),
    'learning_rate': (0.0001, 0.1),
    'subsample': (0.5, 1),
    'rsm': (0.5, 1),
    'reg_lambda': (0, 1),
    'random_strength': (1, 10)
}

lgb_params = {
    'n_estimators': (100, 5000),
    'max_depth': (1, 10),
    'learning_rate': (0.0001, 0.1),
    'subsample': (0.5, 1),
    'reg_alpha': (0, 1),
    'reg_lambda': (0, 1),
    'colsample_bytree': (0.5, 1),
    'min_child_samples': (1, 20)
}


def objective_xgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', *xgb_params['n_estimators']),
        'max_depth': trial.suggest_int('max_depth', *xgb_params['max_depth']),
        'learning_rate': trial.suggest_float('learning_rate', *xgb_params['learning_rate']),
        'subsample': trial.suggest_float('subsample', *xgb_params['subsample']),
        'reg_alpha': trial.suggest_float('reg_alpha', *xgb_params['reg_alpha']),
        'reg_lambda': trial.suggest_float('reg_lambda', *xgb_params['reg_lambda']),
        'gamma': trial.suggest_float('gamma', *xgb_params['gamma']),
        'min_child_weight': trial.suggest_int('min_child_weight', *xgb_params['min_child_weight'])
    }
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_test)
    mse = r2_score(y_test, y_pred)
    return mse

def objective_catboost(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', *catboost_params['n_estimators']),
        'max_depth': trial.suggest_int('max_depth', *catboost_params['max_depth']),
        'learning_rate': trial.suggest_float('learning_rate', *catboost_params['learning_rate']),
        'subsample': trial.suggest_float('subsample', *catboost_params['subsample']),
        'rsm': trial.suggest_float('rsm', *catboost_params['rsm']),
        'reg_lambda': trial.suggest_float('reg_lambda', *catboost_params['reg_lambda']),
        'random_strength': trial.suggest_int('random_strength', *catboost_params['random_strength'])
    }
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_test)
    mse = r2_score(y_test, y_pred)
    return mse

def objective_lgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', *lgb_params['n_estimators']),
        'max_depth': trial.suggest_int('max_depth', *lgb_params['max_depth']),
        'learning_rate': trial.suggest_float('learning_rate', *lgb_params['learning_rate']),
        'subsample': trial.suggest_float('subsample', *lgb_params['subsample']),
        'reg_alpha': trial.suggest_float('reg_alpha', *lgb_params['reg_alpha']),
        'reg_lambda': trial.suggest_float('reg_lambda', *lgb_params['reg_lambda']),
        'colsample_bytree': trial.suggest_float('colsample_bytree', *lgb_params['colsample_bytree']),
        'min_child_samples': trial.suggest_int('min_child_samples', *lgb_params['min_child_samples'])
    }
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_test)
    mse = r2_score(y_test, y_pred)
    return mse

def optimize_model(objective, n_trials):
    study = optuna.create_study(direction='maximize')  # Menggunakan 'minimize' untuk mencari MSE terkecil
    study.optimize(objective, n_trials=n_trials)
    best_params = study.best_params
    return best_params

best_lgb_params = optimize_model(objective_lgb, n_trials=50)
print('LightGBM - Best Hyperparameters:', best_lgb_params)

best_catboost_params = optimize_model(objective_catboost, n_trials=50)
print('CatBoost - Best Hyperparameters:', best_catboost_params)

best_xgb_params = optimize_model(objective_xgb, n_trials=50)
print('XGBoost - Best Hyperparameters:', best_xgb_params)


[32m[I 2023-07-29 00:05:13,011][0m A new study created in memory with name: no-name-59978b39-5afa-40d0-9116-46b99d983610[0m
[32m[I 2023-07-29 00:05:36,572][0m Trial 0 finished with value: 0.796433863360532 and parameters: {'n_estimators': 1345, 'max_depth': 10, 'learning_rate': 0.0467324007884916, 'subsample': 0.9068239285519033, 'reg_alpha': 0.8358103729425416, 'reg_lambda': 0.975093971737965, 'colsample_bytree': 0.727271935933674, 'min_child_samples': 12}. Best is trial 0 with value: 0.796433863360532.[0m
[32m[I 2023-07-29 00:05:52,628][0m Trial 1 finished with value: 0.7565913976452551 and parameters: {'n_estimators': 913, 'max_depth': 8, 'learning_rate': 0.003032445725667352, 'subsample': 0.8815970333565266, 'reg_alpha': 0.2666566089691801, 'reg_lambda': 0.49332809644525977, 'colsample_bytree': 0.6326154194550753, 'min_child_samples': 9}. Best is trial 0 with value: 0.796433863360532.[0m
[32m[I 2023-07-29 00:06:49,575][0m Trial 2 finished with value: 0.7997028239360231 a

[32m[I 2023-07-29 00:10:19,247][0m Trial 11 finished with value: 0.7935747492843803 and parameters: {'n_estimators': 2197, 'max_depth': 1, 'learning_rate': 0.09975652834964593, 'subsample': 0.6346003503246039, 'reg_alpha': 0.6746451575120375, 'reg_lambda': 0.21491773624886656, 'colsample_bytree': 0.9922994597964369, 'min_child_samples': 17}. Best is trial 7 with value: 0.8067059414740994.[0m
[32m[I 2023-07-29 00:10:39,467][0m Trial 12 finished with value: 0.7991457447107888 and parameters: {'n_estimators': 2743, 'max_depth': 3, 'learning_rate': 0.0931077230183351, 'subsample': 0.7962409357930242, 'reg_alpha': 0.6472770280514404, 'reg_lambda': 0.6717247633425857, 'colsample_bytree': 0.7904875143412788, 'min_child_samples': 15}. Best is trial 7 with value: 0.8067059414740994.[0m
[32m[I 2023-07-29 00:10:41,538][0m Trial 13 finished with value: 0.7889003963272944 and parameters: {'n_estimators': 116, 'max_depth': 7, 'learning_rate': 0.07472406631821697, 'subsample': 0.6289816509371

[32m[I 2023-07-29 00:12:49,126][0m Trial 22 finished with value: 0.8017754074620296 and parameters: {'n_estimators': 2404, 'max_depth': 4, 'learning_rate': 0.06213830596977289, 'subsample': 0.687923896923231, 'reg_alpha': 0.5495292971431415, 'reg_lambda': 0.9075264351324628, 'colsample_bytree': 0.5003800679497191, 'min_child_samples': 20}. Best is trial 16 with value: 0.8082354163850255.[0m
[32m[I 2023-07-29 00:13:05,405][0m Trial 23 finished with value: 0.8006594865142975 and parameters: {'n_estimators': 4250, 'max_depth': 2, 'learning_rate': 0.07460153064799481, 'subsample': 0.5778964419927335, 'reg_alpha': 0.7293234340668736, 'reg_lambda': 0.8777745795410683, 'colsample_bytree': 0.6082944910074988, 'min_child_samples': 14}. Best is trial 16 with value: 0.8082354163850255.[0m
[32m[I 2023-07-29 00:13:19,737][0m Trial 24 finished with value: 0.8038694806403415 and parameters: {'n_estimators': 3099, 'max_depth': 3, 'learning_rate': 0.06834361763675792, 'subsample': 0.50067234176

[32m[I 2023-07-29 00:15:44,027][0m Trial 34 finished with value: 0.7909170503827265 and parameters: {'n_estimators': 589, 'max_depth': 2, 'learning_rate': 0.05668593097302552, 'subsample': 0.6437558773320577, 'reg_alpha': 0.4907417932845489, 'reg_lambda': 0.8533724360327108, 'colsample_bytree': 0.6035885940774733, 'min_child_samples': 13}. Best is trial 16 with value: 0.8082354163850255.[0m
[32m[I 2023-07-29 00:16:12,405][0m Trial 35 finished with value: 0.7967453884732878 and parameters: {'n_estimators': 2227, 'max_depth': 5, 'learning_rate': 0.06921449549978187, 'subsample': 0.7219525995327588, 'reg_alpha': 0.3050969750578967, 'reg_lambda': 0.8059381509393364, 'colsample_bytree': 0.6511305529510102, 'min_child_samples': 16}. Best is trial 16 with value: 0.8082354163850255.[0m
[32m[I 2023-07-29 00:16:34,311][0m Trial 36 finished with value: 0.8007309364786153 and parameters: {'n_estimators': 3383, 'max_depth': 4, 'learning_rate': 0.05044736335206944, 'subsample': 0.55856715918

[32m[I 2023-07-29 00:18:10,892][0m Trial 45 finished with value: 0.8021719979074163 and parameters: {'n_estimators': 1656, 'max_depth': 4, 'learning_rate': 0.0599878964659873, 'subsample': 0.8266384150605808, 'reg_alpha': 0.6070547098569186, 'reg_lambda': 0.8952713198601993, 'colsample_bytree': 0.6481452396477221, 'min_child_samples': 14}. Best is trial 16 with value: 0.8082354163850255.[0m
[32m[I 2023-07-29 00:18:21,701][0m Trial 46 finished with value: 0.8064980166883251 and parameters: {'n_estimators': 2070, 'max_depth': 2, 'learning_rate': 0.07256187027444631, 'subsample': 0.7616761307388931, 'reg_alpha': 0.6253262226109154, 'reg_lambda': 0.8467858747072465, 'colsample_bytree': 0.537729936119458, 'min_child_samples': 11}. Best is trial 16 with value: 0.8082354163850255.[0m
[32m[I 2023-07-29 00:18:35,641][0m Trial 47 finished with value: 0.8010838872227163 and parameters: {'n_estimators': 3288, 'max_depth': 3, 'learning_rate': 0.07184785389772193, 'subsample': 0.771738600942

LightGBM - Best Hyperparameters: {'n_estimators': 2822, 'max_depth': 2, 'learning_rate': 0.07289267457871532, 'subsample': 0.8362101731189066, 'reg_alpha': 0.5294708841147577, 'reg_lambda': 0.8180626444557617, 'colsample_bytree': 0.572137249155041, 'min_child_samples': 20}


[32m[I 2023-07-29 00:21:39,969][0m Trial 0 finished with value: 0.7918607812795821 and parameters: {'n_estimators': 758, 'max_depth': 9, 'learning_rate': 0.06394352409361831, 'subsample': 0.7974306436006662, 'rsm': 0.8053398332099841, 'reg_lambda': 0.004108800361967702, 'random_strength': 6}. Best is trial 0 with value: 0.7918607812795821.[0m
[32m[I 2023-07-29 00:28:14,758][0m Trial 1 finished with value: 0.79215273040817 and parameters: {'n_estimators': 4608, 'max_depth': 8, 'learning_rate': 0.05217007795976401, 'subsample': 0.6723470448363513, 'rsm': 0.5108948286430401, 'reg_lambda': 0.4837234840018284, 'random_strength': 5}. Best is trial 1 with value: 0.79215273040817.[0m
[32m[I 2023-07-29 00:36:32,387][0m Trial 2 finished with value: 0.7943666155712614 and parameters: {'n_estimators': 3787, 'max_depth': 9, 'learning_rate': 0.014760094086431759, 'subsample': 0.671894374852553, 'rsm': 0.5320676758023126, 'reg_lambda': 0.7031557085839208, 'random_strength': 8}. Best is trial 

[32m[I 2023-07-29 00:59:48,838][0m Trial 24 finished with value: 0.8062425186512319 and parameters: {'n_estimators': 3217, 'max_depth': 2, 'learning_rate': 0.0514543843889448, 'subsample': 0.6260589466167392, 'rsm': 0.9748034205553712, 'reg_lambda': 0.6654058371221098, 'random_strength': 7}. Best is trial 6 with value: 0.8091990600422825.[0m
[32m[I 2023-07-29 01:00:35,769][0m Trial 25 finished with value: 0.8035780446936549 and parameters: {'n_estimators': 4240, 'max_depth': 2, 'learning_rate': 0.07191281558277843, 'subsample': 0.7191953094721653, 'rsm': 0.9361897757500047, 'reg_lambda': 0.5107166309149778, 'random_strength': 9}. Best is trial 6 with value: 0.8091990600422825.[0m
[32m[I 2023-07-29 01:01:00,100][0m Trial 26 finished with value: 0.7908448726998029 and parameters: {'n_estimators': 2991, 'max_depth': 1, 'learning_rate': 0.05547387645819496, 'subsample': 0.6351397051055037, 'rsm': 0.9169187216616614, 'reg_lambda': 0.6189611931045294, 'random_strength': 4}. Best is t

[32m[I 2023-07-29 01:24:58,898][0m Trial 48 finished with value: 0.8076737660040602 and parameters: {'n_estimators': 3373, 'max_depth': 6, 'learning_rate': 0.017583024393364684, 'subsample': 0.6093604736758588, 'rsm': 0.8201897680024379, 'reg_lambda': 0.5592624665180438, 'random_strength': 5}. Best is trial 39 with value: 0.8093989426275512.[0m
[32m[I 2023-07-29 01:25:33,013][0m Trial 49 finished with value: 0.7826012390982756 and parameters: {'n_estimators': 4407, 'max_depth': 1, 'learning_rate': 0.029965187444945196, 'subsample': 0.5576833342579371, 'rsm': 0.8699258690605345, 'reg_lambda': 0.717446575120248, 'random_strength': 3}. Best is trial 39 with value: 0.8093989426275512.[0m
[32m[I 2023-07-29 01:25:33,013][0m A new study created in memory with name: no-name-21c69c5e-6616-41d3-8db2-1724288d9214[0m


CatBoost - Best Hyperparameters: {'n_estimators': 4901, 'max_depth': 4, 'learning_rate': 0.018812193862372664, 'subsample': 0.5701092869059072, 'rsm': 0.7857242286786702, 'reg_lambda': 0.773020178734695, 'random_strength': 3}


[32m[I 2023-07-29 01:37:24,814][0m Trial 0 finished with value: 0.8002251402564684 and parameters: {'n_estimators': 4299, 'max_depth': 4, 'learning_rate': 0.04317902306492365, 'subsample': 0.6648683873318781, 'reg_alpha': 0.2164527111382445, 'reg_lambda': 0.5239512937736286, 'gamma': 0.44386097124128576, 'min_child_weight': 8}. Best is trial 0 with value: 0.8002251402564684.[0m
[32m[I 2023-07-29 01:56:16,323][0m Trial 1 finished with value: 0.7946823087742512 and parameters: {'n_estimators': 4826, 'max_depth': 6, 'learning_rate': 0.0965733048024456, 'subsample': 0.594378118405551, 'reg_alpha': 0.4769904690017993, 'reg_lambda': 0.355800610652114, 'gamma': 0.7850590364439346, 'min_child_weight': 1}. Best is trial 0 with value: 0.8002251402564684.[0m
[32m[I 2023-07-29 02:06:21,383][0m Trial 2 finished with value: 0.7970455763511575 and parameters: {'n_estimators': 4029, 'max_depth': 3, 'learning_rate': 0.00759635795398872, 'subsample': 0.8902946132548588, 'reg_alpha': 0.2325148031

[33m[W 2023-07-29 05:00:58,631][0m Trial 19 failed with value None.[0m


KeyboardInterrupt: 