In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import time

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor

import joblib

In [2]:
# Path of the training table, given relative to the working directory.
trainData_file = "trainData_lightgbm.csv"
# index_col=None: no CSV column is used as the index, so pandas assigns a default one.
train_data = pd.read_csv(
    filepath_or_buffer=trainData_file,
    index_col=None,
)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
train_data.describe()

Unnamed: 0,y,x1,x2,x3,x4
count,1040325.0,1040325.0,1040325.0,1040325.0,1040325.0
mean,0.6860699,5.63015e-05,8.676045e-05,50787.88,49784.01
std,28.59468,0.002917369,0.004005275,1562648.0,1551355.0
min,-544.4126,-0.05444126,-0.06036446,4.559682e-08,4.596638e-08
25%,-14.40922,-0.001481481,-0.002006689,0.3336418,0.3353439
50%,0.0,0.0,0.0,0.4627064,0.4644322
75%,14.08451,0.001416431,0.001858736,0.6379629,0.6390443
max,773.3333,0.07733333,0.08602151,164013900.0,164013900.0


In [4]:
def model_test(list_of_models, list_of_model_names, train_data):
    """Fit each model on one shared train/test split, pickle it, and tabulate RMSE.

    The first column of ``train_data`` is used as the target and all remaining
    columns as features. Rows are shuffled and split 80/20 with a fixed seed
    (2023), so every model is fitted and scored on the same partitions.

    Parameters
    ----------
    list_of_models : iterable of estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``. They are fitted in
        place.
    list_of_model_names : sequence of str
        One name per model, in the same order. Each fitted model is written
        with ``joblib.dump`` to ``"<name>_Model.pkl"`` (a relative path).
    train_data : pandas.DataFrame
        Table with the target in column 0 and the features in columns 1 onward.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``"Model"`` (the pickle file name),
        ``"Train RMSE"`` and ``"Test RMSE"``.

    Raises
    ------
    ValueError
        If fewer names than models are supplied. This is checked before any
        model is fitted so a bad call does not waste training time.
    """
    models = list(list_of_models)
    names = list(list_of_model_names)
    if len(names) < len(models):
        raise ValueError(
            "list_of_model_names has %d entries but list_of_models has %d"
            % (len(names), len(models))
        )

    x_train, x_test, y_train, y_test = train_test_split(
        train_data.iloc[:, 1:].values,
        train_data.iloc[:, 0].values,
        test_size=0.2,
        shuffle=True,
        random_state=2023,
    )

    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    print("")

    # Collect rows in a list and build the frame once, instead of growing a
    # DataFrame one row at a time.
    rows = []

    for name, model in zip(names, models):

        # The timed span covers fitting, both predictions and the pickle dump.
        start = time.time()
        model.fit(x_train, y_train)
        # Take the square root of the plain MSE rather than passing
        # squared=False: that keyword was deprecated in scikit-learn 1.4 and
        # removed in 1.6, while this form works on every version.
        train_rmse = mean_squared_error(y_train, model.predict(x_train)) ** 0.5
        test_rmse = mean_squared_error(y_test, model.predict(x_test)) ** 0.5

        model_name = name + "_Model.pkl"
        joblib.dump(model, model_name)

        rows.append([model_name, train_rmse, test_rmse])
        end = time.time()
        print("Time elapsed for", name, "model:", (end - start), "seconds")
        print("Train RMSE:", train_rmse)
        print("Test RMSE:", test_rmse)
        print("")

    return pd.DataFrame(rows, columns=["Model", "Train RMSE", "Test RMSE"])


# Candidate regressors to compare. Every estimator that accepts a seed is given
# the same one (2023) so a re-run reproduces the same fitted models;
# LinearRegression takes no seed.
list_of_models = [
    lgb.LGBMRegressor(metric='rmse', random_state = 2023),
    LinearRegression(),
    Ridge(random_state = 2023),
    linear_model.Lasso(random_state = 2023),
    RandomForestRegressor(random_state = 2023),
    xgb.XGBRegressor(random_state = 2023),  # seeded like the other estimators
    AdaBoostRegressor(random_state = 2023),
]

# Names in the same order as list_of_models. model_test appends "_Model.pkl" to
# each one to form the pickle file name, so the spelling "AdaBoost_Regresor" is
# kept unchanged to avoid renaming the saved artifact.
list_of_model_names = [
    "LGBRegressor",
    "Linear_Regression",
    "Ridge_Regression",
    "Lasso_Regression",
    "Random_Forest_Regressor",
    "XGB_Regressor",
    "AdaBoost_Regresor",
]

# Time the whole comparison (fitting, scoring and pickling of every model).
start_time = time.time()
performance_df = model_test(list_of_models, list_of_model_names, train_data)
end_time = time.time()
print("Total time elapsed for testing all models:", (end_time - start_time), "seconds")

(832260, 4) (208065, 4) (832260,) (208065,)

Time elapsed for LGBRegressor model: 0.9360294342041016 seconds
Train RMSE: 28.201760963721842
Test RMSE: 28.131181910268356

Time elapsed for Linear_Regression model: 0.20128750801086426 seconds
Train RMSE: 28.527392475482177
Test RMSE: 28.302619429475676



  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Time elapsed for Ridge_Regression model: 0.08464360237121582 seconds
Train RMSE: 28.52965395103355
Test RMSE: 28.30445828700264

Time elapsed for Lasso_Regression model: 0.22301030158996582 seconds
Train RMSE: 28.637736014722716
Test RMSE: 28.421159612185843

Time elapsed for Random_Forest_Regressor model: 938.7581896781921 seconds
Train RMSE: 10.846970800845439
Test RMSE: 28.780286707009452

Time elapsed for XGB_Regressor model: 25.669902563095093 seconds
Train RMSE: 27.752621871752336
Test RMSE: 28.205185743593788

Time elapsed for AdaBoost_Regresor model: 25.737364053726196 seconds
Train RMSE: 32.563520337321165
Test RMSE: 32.364925588142825

Total time elapsed for testing all models: 991.7409420013428 seconds


In [5]:
# Display the per-model train/test RMSE table returned by model_test.
performance_df

Unnamed: 0,Model,Train RMSE,Test RMSE
0,LGBRegressor_Model.pkl,28.201761,28.131182
1,Linear_Regression_Model.pkl,28.527392,28.302619
2,Ridge_Regression_Model.pkl,28.529654,28.304458
3,Lasso_Regression_Model.pkl,28.637736,28.42116
4,Random_Forest_Regressor_Model.pkl,10.846971,28.780287
5,XGB_Regressor_Model.pkl,27.752622,28.205186
6,AdaBoost_Regresor_Model.pkl,32.56352,32.364926
