In [None]:
#Importing libraries
import pandas as pd
import numpy as np
from fbprophet import Prophet
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Load the feature-engineered training set and discard the leftover 'index'
# column that was stored in the feather file.
data = (
    pd.read_feather('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/train_feature_engineering.feather')
    .drop(columns=['index'])
)

### Drop the features that are not important

In [None]:
# Columns excluded from modelling. 'meter_reading' is removed as well, leaving
# 'log_meter_reading' as the only target-related column in the frame.
unused_train_columns = [
    'site_id',
    'timestamp',
    'wind_speed',
    'wind_direction',
    'is_summer_month',
    'dew_temperature',
    'relative_humidity',
    'meter_reading',
    'sea_level_pressure',
    'cloud_coverage',
    'precip_depth_1_hr',
    'busy_hours',
    'Sensible_Heat',
    'discomfort_index',
    'wind_chill',
    'month',
]
data = data.drop(columns=unused_train_columns)

In [None]:
# Separate the regression target from the feature matrix.
y = data['log_meter_reading']
x = data.drop(columns=['log_meter_reading'])

### Splitting the data for training and validation

In [None]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible.
# NOTE(review): this is a random row-wise split. The frame originally carried a
# 'timestamp' column, so a time-ordered split may give a more honest estimate —
# confirm whether temporal leakage matters here.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

### Hyperparameter Tuning

In [None]:
# Tuning step: tree depth, with every other LightGBM setting left at its default.
params = {'max_depth': [3, 5, 7, 9, 11]}

lgb_reg = LGBMRegressor()
# The search space is one finite list, so n_iter is tied to its length and every
# candidate is evaluated exactly once. A hard-coded n_iter larger than the list
# is clamped by scikit-learn with a warning, which the notebook's global
# warnings.filterwarnings('ignore') hides.
random_lgb = RandomizedSearchCV(
    lgb_reg,
    params,
    n_iter=len(params['max_depth']),
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_lgb.fit(x_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 12.0min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           importance_type='split',
                                           learning_rate=0.1, max_depth=-1,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=-1, num_leaves=31,
                                           objective=None, random_state=None,
                                           reg_alpha=0.0, reg_lambda=0.0,
                                           silent=True, subsample=1.0,
                                           subsample_for_bin=200000,
                                           subsa

In [None]:
# Show the max_depth value selected by the search fitted in the previous cell.
random_lgb.best_params_

{'max_depth': 11}

In [None]:
# Cross-validated score of the best candidate. The scorer is
# 'neg_root_mean_squared_error', so this is the negated RMSE (closer to 0 is better).
random_lgb.best_score_

-1.287161843913947

In [None]:
# Tuning step: learning rate, with max_depth fixed at the value chosen earlier.
params = {'learning_rate': [0.1, 0.01, 0.03, 0.05]}

lgb_reg = LGBMRegressor(max_depth=11)
# The search space is one finite list, so n_iter is tied to its length and every
# candidate is evaluated exactly once. A hard-coded n_iter larger than the list
# is clamped by scikit-learn with a warning, which the notebook's global
# warnings.filterwarnings('ignore') hides.
random_lgb = RandomizedSearchCV(
    lgb_reg,
    params,
    n_iter=len(params['learning_rate']),
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_lgb.fit(x_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 11.9min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           importance_type='split',
                                           learning_rate=0.1, max_depth=11,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=-1, num_leaves=31,
                                           objective=None, random_state=None,
                                           reg_alpha=0.0, reg_lambda=0.0,
                                           silent=True, subsample=1.0,
                                           subsample_for_bin=200000,
                                           subsa

In [None]:
# Show the learning_rate value selected by the search fitted in the previous cell.
random_lgb.best_params_

{'learning_rate': 0.1}

In [None]:
# Negated cross-validated RMSE of the best learning_rate candidate
# (scoring='neg_root_mean_squared_error'; closer to 0 is better).
random_lgb.best_score_

-1.287161843913947

In [None]:
# Tuning step: per-tree column subsampling, with max_depth and learning_rate
# fixed at the values chosen earlier.
params = {'colsample_bytree': [0.7, 0.8, 0.9, 1.0]}

lgb_reg = LGBMRegressor(max_depth=11, learning_rate=0.1)
# The search space is one finite list, so n_iter is tied to its length and every
# candidate is evaluated exactly once. A hard-coded n_iter larger than the list
# is clamped by scikit-learn with a warning, which the notebook's global
# warnings.filterwarnings('ignore') hides.
random_lgb = RandomizedSearchCV(
    lgb_reg,
    params,
    n_iter=len(params['colsample_bytree']),
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_lgb.fit(x_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 10.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           importance_type='split',
                                           learning_rate=0.1, max_depth=11,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=-1, num_leaves=31,
                                           objective=None, random_state=None,
                                           reg_alpha=0.0, reg_lambda=0.0,
                                           silent=True, subsample=1.0,
                                           subsample_for_bin=200000,
                                           subsa

In [None]:
# Show the colsample_bytree value selected by the search fitted in the previous cell.
random_lgb.best_params_

{'colsample_bytree': 0.9}

In [None]:
# Negated cross-validated RMSE of the best colsample_bytree candidate
# (scoring='neg_root_mean_squared_error'; closer to 0 is better).
random_lgb.best_score_

-1.2831192482126825

In [None]:
# Tuning step: number of boosting rounds, with the previously chosen settings fixed.
# NOTE(review): if the largest value in this list wins, the optimum may lie
# beyond the list — consider widening it or using early stopping.
params = {'n_estimators': [300, 500, 800, 1200]}

lgb_reg = LGBMRegressor(max_depth=11, learning_rate=0.1, colsample_bytree=0.9)
# The search space is one finite list, so n_iter is tied to its length and every
# candidate is evaluated exactly once. A hard-coded n_iter larger than the list
# is clamped by scikit-learn with a warning, which the notebook's global
# warnings.filterwarnings('ignore') hides.
random_lgb = RandomizedSearchCV(
    lgb_reg,
    params,
    n_iter=len(params['n_estimators']),
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_lgb.fit(x_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 51.1min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=0.9,
                                           importance_type='split',
                                           learning_rate=0.1, max_depth=11,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=-1, num_leaves=31,
                                           objective=None, random_state=None,
                                           reg_alpha=0.0, reg_lambda=0.0,
                                           silent=True, subsample=1.0,
                                           subsample_for_bin=200000,
                                           subsa

In [None]:
# Show the n_estimators value selected by the search fitted in the previous cell.
random_lgb.best_params_

{'n_estimators': 1200}

In [None]:
# Negated cross-validated RMSE of the best n_estimators candidate
# (scoring='neg_root_mean_squared_error'; closer to 0 is better).
random_lgb.best_score_

-0.8271364523279928

In [None]:
# Tuning step: minimum samples per leaf, with the previously chosen settings fixed.
params = {'min_child_samples': [50, 100, 150, 200]}

lgb_reg = LGBMRegressor(
    max_depth=11,
    learning_rate=0.1,
    colsample_bytree=0.9,
    n_estimators=1200,
)
# The search space is one finite list, so n_iter is tied to its length and every
# candidate is evaluated exactly once. A hard-coded n_iter larger than the list
# is clamped by scikit-learn with a warning, which the notebook's global
# warnings.filterwarnings('ignore') hides.
random_lgb = RandomizedSearchCV(
    lgb_reg,
    params,
    n_iter=len(params['min_child_samples']),
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
random_lgb.fit(x_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


### Fitting the model on the training data with the best parameters

In [None]:
 # Model configured with the values picked during tuning.
 # NOTE(review): the min_child_samples search above has no recorded result in
 # this notebook, so the origin of min_child_samples=100 should be confirmed.
 lgb_model = LGBMRegressor(
     n_estimators=1200,
     min_child_samples=100,
     max_depth=11,
     learning_rate=0.1,
     colsample_bytree=0.9,
     n_jobs=-1,
 )

In [None]:
# Train on the 75% training split only, so the validation rows stay unseen.
lgb_model.fit(x_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
              importance_type='split', learning_rate=0.1, max_depth=11,
              min_child_samples=100, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1200, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

### Calculating the RMSLE on the validation data using the best parameters

In [None]:
# Predict the (log-scale) target for the held-out validation rows.
preds = lgb_model.predict(x_val)

In [None]:
# RMSE computed on the log-scale target 'log_meter_reading'; it is reported as
# RMSLE with respect to the raw meter reading.
val_mse = mean_squared_error(y_val, preds)
val_rmsle = np.sqrt(val_mse)
print('Validation RMSLE = ', val_rmsle)

Validation RMSLE =  0.8253499710180906


### Fitting the model on the whole training data using the best parameters

In [None]:
# Refit the same configuration on all labelled rows (training + validation)
# before predicting on the test set.
final_params = dict(
    n_estimators=1200,
    min_child_samples=100,
    max_depth=11,
    learning_rate=0.1,
    colsample_bytree=0.9,
    n_jobs=-1,
)
lgb_final = LGBMRegressor(**final_params)
lgb_final.fit(x, y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
              importance_type='split', learning_rate=0.1, max_depth=11,
              min_child_samples=100, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1200, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [None]:
# Load the prepared test set and discard the leftover 'index' column that was
# stored in the feather file.
df = (
    pd.read_feather('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/test_data_final.feather')
    .drop(columns=['index'])
)

### Drop the unimportant features

In [None]:
# Remove 'row_id' plus the columns that were also dropped from the training
# frame (the test file has no 'meter_reading' entry in this list).
unused_test_columns = [
    'row_id',
    'site_id',
    'timestamp',
    'wind_speed',
    'wind_direction',
    'is_summer_month',
    'dew_temperature',
    'relative_humidity',
    'sea_level_pressure',
    'cloud_coverage',
    'precip_depth_1_hr',
    'busy_hours',
    'Sensible_Heat',
    'discomfort_index',
    'wind_chill',
    'month',
]
df = df.drop(columns=unused_test_columns)

In [None]:
# Additional columns removed from the test frame only.
# NOTE(review): presumably these columns do not exist in the training features —
# verify that df's remaining columns match x's columns and order.
extra_test_columns = [
    'hour_x',
    'hour_y',
    'day_x',
    'day_y',
    'month_x',
    'month_y',
    'days_in_month',
]
df = df.drop(columns=extra_test_columns)

### Predicting on test data

In [None]:
# Log-scale predictions for every test row from the model fitted on all
# training data.
y_test = lgb_final.predict(df)

In [None]:
# Map predictions back to the raw scale with expm1 (exp(x) - 1).
# NOTE(review): presumably the target was built with log1p — confirm in the
# feature-engineering step.
# Caution: y_test is overwritten in place, so re-running this cell without
# re-running the prediction cell applies expm1 a second time.
y_test=np.expm1(y_test)

In [None]:
# Keep four decimal places in the submitted readings.
test = np.around(y_test, decimals=4)

In [None]:
# Assemble the submission frame. Column labels are passed as a list: a set is
# unordered, and recent pandas releases refuse a set for `columns`.
test_df = pd.DataFrame(data=test, columns=['meter_reading'])
# NOTE(review): row_id is taken from the positional index, which assumes the
# test file's rows are ordered by row_id starting at 0 — confirm against the
# 'row_id' column that was dropped from the test frame.
test_df['row_id'] = test_df.index
test_df = test_df[['row_id', 'meter_reading']]
test_df.head()

Unnamed: 0,row_id,meter_reading
0,0,143.8624
1,1,79.7317
2,2,7.4596
3,3,279.7759
4,4,1269.0989


### Meter readings that are less than zero are set to zero, because a meter reading cannot be negative

In [None]:
# Clamp negative predictions to zero with a single label-based assignment.
# Row-by-row chained indexing (test_df['meter_reading'][i] = 0) can write to a
# temporary copy instead of the frame, and is very slow on a large frame.
test_df.loc[test_df['meter_reading'] < 0, 'meter_reading'] = 0

In [None]:
# Write the submission file with a header row and without the DataFrame index.
test_df.to_csv(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/lgbm.csv',
    index=False,
    header=True,
)