# Data Preparation

In [None]:
import numpy as np
import pandas as pd


# Kaggle input directory of the competition; the three CSVs below live in it.
path_input = '/kaggle/input/uou-ie-g-03784-02-spring-2024-term-project/'

# Read each CSV with its first column as the index, in the order
# x_train, x_test, y_train.
x_train, x_test, y_train = (
    pd.read_csv(f'{path_input}{stem}.csv', index_col=0)
    for stem in ('x_train', 'x_test', 'y_train')
)

In [None]:
# Preview the training features loaded from x_train.csv.
x_train

In [None]:
# Preview the training target loaded from y_train.csv.
y_train

In [None]:
# Preview the test features loaded from x_test.csv.
x_test

In [None]:
# Summary statistics of the test features.
x_test.describe()

In [None]:
# Summary statistics of the training features.
x_train.describe()

# Modeling

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation split; the fixed
# random_state makes the split the same on every run.
x_trn, x_val, y_trn, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [None]:
#use Optuna
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import optuna

def objective(trial):
    """Optuna objective: validation RMSE of a LightGBM regressor.

    Samples one hyperparameter set from ``trial``, fits an ``LGBMRegressor``
    on the notebook-level training split (``x_trn``, ``y_trn``) and scores it
    on the hold-out split (``x_val``, ``y_val``).

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The RMSE on the validation split (lower is better).
    """
    params = {
        "objective": "regression",
        'verbose': -1,
        'metric': 'rmse',
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'learning_rate': trial.suggest_float("learning_rate", 1e-6, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 4, 20),
        # NOTE(review): LightGBM documents that row subsampling is only
        # applied when `subsample_freq` > 0; it is not set here, so confirm
        # whether this search dimension has any effect.
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        # LightGBM requires num_leaves > 1, so the search starts at 2;
        # sampling 1 would make the fit fail and abort the study.
        'num_leaves': trial.suggest_int('num_leaves', 2, 150),
    }

    lgbr_model = lgb.LGBMRegressor(**params)
    lgbr_model.fit(x_trn, y_trn)
    predictions = lgbr_model.predict(x_val)
    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(y_val, predictions)))
    return rmse

# Source: https://forecastegy.com/posts/xgboost-hyperparameter-tuning-with-optuna/

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100)

In [None]:
# print('Best hyperparameters:', study.best_params)
# print('Best RMSE:', study.best_value)

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor with fixed hyperparameters (presumably the best values
# from the Optuna search above -- confirm) and a fixed random_state.
lgbr_model = lgb.LGBMRegressor(
    verbose=-1,
    n_estimators=264,
    max_depth=10,
    learning_rate=0.04416244268363925,
    num_leaves=49,
    min_child_samples=18,
    subsample=0.5023255318283741,
    random_state=20,
)

# Fit on the training split, passing the validation split as eval_set with
# RMSE as the evaluation metric.
lgbr_model.fit(
    x_trn,
    y_trn,
    eval_set=[(x_val, y_val)],
    eval_metric='rmse',
)

In [None]:
# LightGBM predictions on the training and validation splits.
pred_trn, pred_val = lgbr_model.predict(x_trn), lgbr_model.predict(x_val)

In [None]:
# RMSE of the LightGBM predictions on both splits. The square root is taken
# explicitly because the `squared=False` shortcut of mean_squared_error is
# deprecated and removed in newer scikit-learn releases.
rmse_trn = np.sqrt(mean_squared_error(y_trn, pred_trn))
rmse_val = np.sqrt(mean_squared_error(y_val, pred_val))
print('RMSE Train:', rmse_trn)
print('RMSE Validation:', rmse_val)

In [None]:
import optuna
import catboost as cbt

def objective(trial):
    """Optuna objective: validation RMSE of a CatBoost regressor.

    Samples one hyperparameter set from ``trial``, fits a
    ``CatBoostRegressor`` on the notebook-level training split (``x_trn``,
    ``y_trn``) and scores it on the hold-out split (``x_val``, ``y_val``).
    Note that this rebinds the name ``objective`` already used by the
    LightGBM search earlier in the notebook.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The RMSE on the validation split (lower is better).
    """
    params = {
        'iterations': trial.suggest_int("iterations", 1000, 3000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.01),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
    }

    cbt_model = cbt.CatBoostRegressor(**params)
    cbt_model.fit(x_trn, y_trn, verbose_eval=False)
    predictions = cbt_model.predict(x_val)
    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(y_val, predictions)))
    return rmse

# Source: https://forecastegy.com/posts/xgboost-hyperparameter-tuning-with-optuna/

In [None]:
# study_cat = optuna.create_study(direction='minimize')
# study_cat.optimize(objective, n_trials=50)

Trial 2 finished with value: 227.47004548671632 and parameters: {'iterations': 9868, 'depth': 11, 'learning_rate': 0.007680863081894013, 'min_child_samples': 95, 'random_strength': 24.390474800522284}. Best is trial 2 with value: 227.47004548671632

In [None]:
# print('Best hyperparameters:', study_cat.best_params)
# print('Best RMSE:', study_cat.best_value)

Best hyperparameters: {'iterations': 11413, 'depth': 12, 'learning_rate': 0.0073642714802098636, 'min_child_samples': 7, 'random_strength': 10.642296074805834}
Best RMSE: 225.61992004301044

In [None]:
# Source: https://catboost.ai/en/docs/concepts/python-usages-examples
#CatBoostRegressor
from catboost import CatBoostRegressor

# CatBoost regressor with fixed hyperparameters, trained and evaluated on RMSE.
cbt_model = CatBoostRegressor(
    iterations=2899,
    learning_rate=0.009987417311301691,
    depth=10,
    min_child_samples=92,
    random_strength=31.324688331990615,
    eval_metric='RMSE',
    loss_function='RMSE',
)

# Fit on the training split with per-iteration logging turned off.
cbt_model.fit(x_trn, y_trn, verbose_eval=False)

In [None]:
# CatBoost predictions on the training and validation splits.
pred_trn, pred_val = cbt_model.predict(x_trn), cbt_model.predict(x_val)

In [None]:
# RMSE of the CatBoost predictions on both splits. The square root is taken
# explicitly because the `squared=False` shortcut of mean_squared_error is
# deprecated and removed in newer scikit-learn releases.
rmse_trn = np.sqrt(mean_squared_error(y_trn, pred_trn))
rmse_val = np.sqrt(mean_squared_error(y_val, pred_val))
print('RMSE Train:', rmse_trn)
print('RMSE Validation:', rmse_val)

In [None]:
#use Optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import optuna

def objective(trial):
    """Optuna objective: validation RMSE of an XGBoost regressor.

    Samples one hyperparameter set from ``trial``, fits an ``XGBRegressor``
    on the notebook-level training split (``x_trn``, ``y_trn``) and scores it
    on the hold-out split (``x_val``, ``y_val``). Note that this rebinds the
    name ``objective`` already used by the earlier searches in the notebook.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The RMSE on the validation split (lower is better).
    """
    params = {
        "objective": "reg:squarederror",
        "n_estimators": trial.suggest_int("n_estimators", 10, 500),
        "verbosity": 0,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }

    model = xgb.XGBRegressor(**params)
    model.fit(x_trn, y_trn, verbose=False)
    predictions = model.predict(x_val)
    # Take the square root explicitly: the `squared=False` shortcut of
    # mean_squared_error is deprecated and removed in newer scikit-learn.
    rmse = float(np.sqrt(mean_squared_error(y_val, predictions)))
    return rmse

# Source: https://forecastegy.com/posts/xgboost-hyperparameter-tuning-with-optuna/

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

In [None]:
# print('Best hyperparameters:', study.best_params)
# print('Best RMSE:', study.best_value)

In [None]:
import xgboost

# XGBoost regressor with fixed hyperparameters, fitted on the training split.
xgb_model = xgboost.XGBRegressor(
    n_estimators=424,
    learning_rate=0.029676880697556064,
    gamma=0,
    subsample=0.9043584436880185,
    colsample_bytree=0.806410126547652,
    max_depth=7,
    eval_metric='rmse',
    min_child_weight=2,
)
xgb_model.fit(x_trn, y_trn)

In [None]:
# Plot the feature importances of the fitted XGBoost model.
xgboost.plot_importance(xgb_model)

In [None]:
# XGBoost predictions on the training and validation splits.
pred_trn, pred_val = xgb_model.predict(x_trn), xgb_model.predict(x_val)

In [None]:
# RMSE of the XGBoost predictions on both splits. The square root is taken
# explicitly because the `squared=False` shortcut of mean_squared_error is
# deprecated and removed in newer scikit-learn releases.
rmse_trn = np.sqrt(mean_squared_error(y_trn, pred_trn))
rmse_val = np.sqrt(mean_squared_error(y_val, pred_val))
print('RMSE Train:', rmse_trn)
print('RMSE Validation:', rmse_val)

In [None]:
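# Disabled feature-importance bar plot for the LightGBM model. To re-enable it,
# first import seaborn as sns and matplotlib.pyplot as plt (not imported in the
# visible cells).
#feature_names = x_train.columns
#importances = lgbr_model.feature_importances_
#sns.barplot(y=feature_names, x=importances, estimator=np.mean)
#plt.title("Feature importances")
#plt.show()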

# Model Ensemble

In [None]:
# XGBoost inputs to the ensemble: predictions on both splits.
xgb_pred_trn, xgb_pred_val = xgb_model.predict(x_trn), xgb_model.predict(x_val)

In [None]:
# LightGBM inputs to the ensemble: predictions on both splits.
lgbr_pred_trn, lgbr_pred_val = lgbr_model.predict(x_trn), lgbr_model.predict(x_val)

In [None]:
# CatBoost inputs to the ensemble: predictions on both splits.
cbt_pred_trn, cbt_pred_val = cbt_model.predict(x_trn), cbt_model.predict(x_val)

In [None]:
# Weighted blend of the three models on each split:
# 0.3 * XGBoost + 0.3 * LightGBM + 0.4 * CatBoost.
ensenble_pred_trn = (
    xgb_pred_trn * 0.3
    + lgbr_pred_trn * 0.3
    + cbt_pred_trn * 0.4
)
ensenble_pred_val = (
    xgb_pred_val * 0.3
    + lgbr_pred_val * 0.3
    + cbt_pred_val * 0.4
)

In [None]:
# RMSE of the blended predictions on both splits. The square root is taken
# explicitly because the `squared=False` shortcut of mean_squared_error is
# deprecated and removed in newer scikit-learn releases.
rmse_trn = np.sqrt(mean_squared_error(y_trn, ensenble_pred_trn))
rmse_val = np.sqrt(mean_squared_error(y_val, ensenble_pred_val))
print('RMSE Train:', rmse_trn)
print('RMSE Validation:', rmse_val)

# Submission

In [None]:
# Refit XGBoost on the full training data (x_train/y_train, i.e. including the
# rows that were held out as the validation split).
xgb_model.fit(x_train, y_train)

In [None]:
# Refit LightGBM on the full training data (x_train/y_train, i.e. including the
# rows that were held out as the validation split); no eval_set is passed here.
lgbr_model.fit(x_train, y_train)

In [None]:
# Refit CatBoost on the full training data (x_train/y_train, i.e. including the
# rows that were held out as the validation split), with logging turned off.
cbt_model.fit(x_train, y_train, verbose_eval = False)

In [None]:
# Predict the test set with each model, in the order XGBoost, LightGBM, CatBoost.
xgb_pred_test, lgbr_pred_test, cbt_pred_test = (
    fitted.predict(x_test) for fitted in (xgb_model, lgbr_model, cbt_model)
)

In [None]:
# Weighted blend of the test predictions:
# 0.3 * XGBoost + 0.3 * LightGBM + 0.4 * CatBoost.
ensenble_pred_test_mi = (
    xgb_pred_test * 0.3
    + lgbr_pred_test * 0.3
    + cbt_pred_test * 0.4
)

In [None]:
# Floor the blended predictions at 0 (no upper bound).
ensenble_pred_test = np.clip(ensenble_pred_test_mi, a_min=0, a_max=None)

In [None]:
# Build the submission from the clipped ensemble predictions. Rows are labelled
# with the IDs held in x_test's index (x_test.csv was read with index_col=0)
# instead of a fresh 0..n-1 range, so each prediction stays attached to the ID
# of the row it was computed from. rename() returns a new Index, so x_test's
# own index is left untouched.
result = pd.DataFrame(
    {'Rented Bike Count': ensenble_pred_test},
    index=x_test.index.rename('ID'),
)
result.to_csv('submission.csv', index_label='ID')