In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error, r2_score
from tqdm import tqdm
import joblib

In [3]:
trainData_file = './trainData_lightgbm.csv'
train_data = pd.read_csv(trainData_file, index_col=None)

In [4]:
train_data.head()

Unnamed: 0,y,x1,x2,x3,x4
0,0.0,0.001009,0.010692,0.4173,0.375397
1,-10.075567,0.003032,0.007614,0.385476,0.40819
2,-10.055304,0.00607,0.010671,0.377639,0.380974
3,0.0,0.0,0.009659,0.409563,0.377826
4,-10.060362,0.003534,0.006073,0.396567,0.416824


In [5]:
#train_data = train_data.dropna(subset=['x5'])

In [6]:
x_train, x_test, y_train, y_test = train_test_split(train_data.iloc[:, 1:].values, train_data.iloc[:, 0].values, test_size=0.2, \
                                                    shuffle = True, random_state = 2023)

In [7]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((832260, 4), (208065, 4), (832260,), (208065,))

LGBM

In [8]:
#model = xgb.XGBClassifier()
#model = RandomForestClassifier() Overfits
model = lgb.LGBMRegressor(metric='rmse')
model.fit(x_train, y_train)

In [9]:
print("RMSE on train:")
mean_squared_error(y_train, model.predict(x_train), squared=False)

RMSE on train:


28.205929858882982

In [10]:
print("RMSE on test:")
mean_squared_error(y_test, model.predict(x_test), squared=False)

RMSE on test:


28.133466309723772

In [11]:
print("R2 score on train:")
r2_score(y_train, model.predict(x_train))

R2 score on train:


0.029936270332639903

In [12]:
print("R2 score on test:")
r2_score(y_test, model.predict(x_test))

R2 score on test:


0.02015176689741427

LGBM Regressor Grid Search

In [14]:
# Define the model
model = lgb.LGBMRegressor(metric='rmse')

# Define the hyperparameter grid
param_grid = {
    'learning_rate': [0.05, 0.1, 0.2, 0.5, 0.9],
    'max_depth': [-1, 1, 2, 3, 4, 5],
    'num_leaves': [15, 31, 63, 127, 255],
    'n_estimators': [300, 500, 700, 900]
}

# Define the grid search
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='r2', verbose=5)

# Train the grid search
grid_search.fit(x_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model on train and test sets
y_train_pred = best_model.predict(x_train)
train_rmse = mean_squared_error(y_train, y_train_pred, squared=False)

y_test_pred = best_model.predict(x_test)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)

print(f'Train RMSE: {train_rmse:.2f}')
print(f'Test RMSE: {test_rmse:.2f}')


Fitting 3 folds for each of 600 candidates, totalling 1800 fits
[CV 1/3] END learning_rate=0.05, max_depth=-1, n_estimators=300, num_leaves=15;, score=0.018 total time=   5.8s
[CV 2/3] END learning_rate=0.05, max_depth=-1, n_estimators=300, num_leaves=15;, score=0.018 total time=   8.6s
[CV 3/3] END learning_rate=0.05, max_depth=-1, n_estimators=300, num_leaves=15;, score=0.019 total time=   5.6s


KeyboardInterrupt: ignored

In [15]:
final_model1 = lgb.LGBMRegressor(metric='r2', learning_rate=0.05, max_depth=-1, n_estimators=700, num_leaves=255)
final_model2 = lgb.LGBMRegressor(metric='r2', learning_rate=0.05, max_depth=-1, n_estimators=900, num_leaves=255)

In [16]:
final_model1.fit(x_train, y_train)
final_model2.fit(x_train, y_train)

In [17]:
print("Final Model 1 R2 score on train:")
r2_score(y_train, final_model1.predict(x_train))

Final Model 1 R2 score on train:


0.1442914509847707

In [18]:
print("Final Model 1 R2 score on test:")
r2_score(y_test, final_model1.predict(x_test))

Final Model 1 R2 score on test:


0.011890949262863382

In [19]:
print("Final Model 2 R2 score on train:")
r2_score(y_train, final_model2.predict(x_train))

Final Model 2 R2 score on train:


0.1663670516410004

In [20]:
print("Final Model 2 R2 score on test:")
r2_score(y_test, final_model2.predict(x_test))

Final Model 2 R2 score on test:


0.008693667177893993

In [21]:
print("RMSE on test:")
mean_squared_error(y_test, final_model2.predict(x_test), squared=False)

RMSE on test:


28.29748106930482

In [None]:
joblib.dump(final_model2, './nnc_lightgbm_final.pkl')