In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import time

import lightgbm as lgb
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.ensemble import AdaBoostRegressor

import joblib

In [2]:
trainData_file = "Data/trainData.csv"
train_data = pd.read_csv(trainData_file, index_col=None)

In [3]:
train_data.describe()

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9
count,1040325.0,1040325.0,1040325.0,1040325.0,1040325.0,1040325.0,1040325.0,1040325.0,1040325.0,1040325.0
mean,0.6860699,5.63015e-05,8.676045e-05,51538.23,50787.88,49784.01,303.7189,413.9831,502.9313,657.4186
std,28.59468,0.002917369,0.004005275,1566450.0,1562648.0,1551355.0,825.0121,1117.701,1374.042,1860.017
min,-544.4126,-0.05444126,-0.06036446,4.52831e-08,4.559682e-08,4.596638e-08,0.0,0.0,0.0,0.0
25%,-14.40922,-0.001481481,-0.002006689,0.3321165,0.3336418,0.3353439,54.68089,65.05382,72.56032,88.30062
50%,0.0,0.0,0.0,0.4612297,0.4627064,0.4644322,97.28823,133.2104,161.5271,209.5018
75%,14.08451,0.001416431,0.001858736,0.6371897,0.6379629,0.6390443,251.8849,353.8757,432.6049,565.947
max,773.3333,0.07733333,0.08602151,164013900.0,164013900.0,164013900.0,37708.55,54891.76,117411.3,180609.9


In [4]:
def model_test(list_of_models, list_of_model_names, train_data):
    
    x_train, x_test, y_train, y_test = train_test_split(train_data.iloc[:, 1:].values, 
                                                        train_data.iloc[:, 0].values, 
                                                        test_size = 0.2,
                                                        shuffle = True, 
                                                        random_state = 2023)
    
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    print("")
    
    performance_df = pd.DataFrame(columns = ["Model", "Train RMSE", "Test RMSE", "Train R-squared", "Test R-squared"])
    counter = 0
    
    for model in list_of_models:
        
        start = time.time()
        model.fit(x_train, y_train)
        
        train_rmse = mean_squared_error(y_train, model.predict(x_train), squared = False)
        test_rmse = mean_squared_error(y_test, model.predict(x_test), squared = False)
        
        train_r_squared = r2_score(y_train, model.predict(x_train))
        test_r_squared = r2_score(y_test, model.predict(x_test))
        
        model_name = "pickle_models_2/" + list_of_model_names[counter] + "_Model.pkl"
        joblib.dump(model, model_name)
        
        performance_df.loc[len(performance_df)] = [list_of_model_names[counter], train_rmse, test_rmse, 
                                                   train_r_squared, test_r_squared]
        end = time.time()
        print("Time elapsed for", list_of_model_names[counter], "model:", (end - start), "seconds")
        print("Train RMSE:", train_rmse)
        print("Test RMSE:", test_rmse)
        print("Train R-squared:", train_r_squared)
        print("Test R-squared:", test_r_squared)
        print("")
        counter = counter + 1

    return performance_df


list_of_models = [lgb.LGBMRegressor(metric='rmse', random_state = 2023), LinearRegression(), 
                  Ridge(random_state = 2023), linear_model.Lasso(random_state = 2023), 
                  RandomForestRegressor(random_state = 2023), xgb.XGBRegressor(),
                  AdaBoostRegressor(random_state = 2023)]

list_of_model_names = ["LGBRegressor", "Linear_Regression", "Ridge_Regression", 
                       "Lasso_Regression", "Random_Forest_Regressor", "XGB_Regressor", 
                       "AdaBoost_Regresor"]

start_time = time.time()
performance_df = model_test(list_of_models, list_of_model_names, train_data)
end_time = time.time()
print("Total time elapsed for testing all models:", (end_time - start_time), "seconds")

(832260, 9) (208065, 9) (832260,) (208065,)

Time elapsed for LGBRegressor model: 2.3622519969940186 seconds
Train RMSE: 27.816368708636187
Test RMSE: 28.051372436569345
Train R-squared: 0.05391424058666039
Test R-squared: 0.03673629970660075

Time elapsed for Linear_Regression model: 0.39229750633239746 seconds
Train RMSE: 28.476081735867716
Test RMSE: 28.4650389330582
Train R-squared: 0.008505981567576537
Test R-squared: 0.008116809620553056

Time elapsed for Ridge_Regression model: 0.17392849922180176 seconds
Train RMSE: 28.478169982365962
Test RMSE: 28.468826009464596
Train R-squared: 0.0083605570777191
Test R-squared: 0.007852865677643206



  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


Time elapsed for Lasso_Regression model: 0.7714340686798096 seconds
Train RMSE: 28.58956188884649
Test RMSE: 28.57737906087276
Train R-squared: 0.0005878216380479362
Test R-squared: 0.0002722276341671215

Time elapsed for Random_Forest_Regressor model: 2954.35759806633 seconds
Train RMSE: 10.378740729203054
Test RMSE: 27.726902005762867
Train R-squared: 0.8682899353809729
Test R-squared: 0.0588915757841455

Time elapsed for XGB_Regressor model: 30.34407615661621 seconds
Train RMSE: 27.182158510548884
Test RMSE: 27.99955982701632
Train R-squared: 0.09656373801353191
Test R-squared: 0.04029142795757301

Time elapsed for AdaBoost_Regresor model: 96.27082347869873 seconds
Train RMSE: 35.69729354616443
Test RMSE: 35.77112866956931
Train R-squared: -0.558117545040042
Test R-squared: -0.5663984793847701

Total time elapsed for testing all models: 3084.8509225845337 seconds


In [5]:
performance_df

Unnamed: 0,Model,Train RMSE,Test RMSE,Train R-squared,Test R-squared
0,LGBRegressor,27.816369,28.051372,0.053914,0.036736
1,Linear_Regression,28.476082,28.465039,0.008506,0.008117
2,Ridge_Regression,28.47817,28.468826,0.008361,0.007853
3,Lasso_Regression,28.589562,28.577379,0.000588,0.000272
4,Random_Forest_Regressor,10.378741,27.726902,0.86829,0.058892
5,XGB_Regressor,27.182159,27.99956,0.096564,0.040291
6,AdaBoost_Regresor,35.697294,35.771129,-0.558118,-0.566398
