In [None]:
#importing libraries
import pandas as pd
import numpy as np
from fbprophet import Prophet
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import seaborn as sns
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

In [None]:
# Read the feature-engineered training set from its Feather file.
data = pd.read_feather(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/train_feature_engineering.feather'
)

### Drop the features which are not important

In [None]:
# Remove (in place) every column that is not used as a model input.
# The raw 'meter_reading' column is among the columns removed here.
data.drop(
    columns=[
        'index',
        'site_id',
        'timestamp',
        'wind_speed',
        'wind_direction',
        'is_summer_month',
        'dew_temperature',
        'relative_humidity',
        'meter_reading',
        'sea_level_pressure',
        'cloud_coverage',
        'precip_depth_1_hr',
        'busy_hours',
        'Sensible_Heat',
        'discomfort_index',
        'wind_chill',
        'month',
    ],
    inplace=True,
)

In [None]:
# Separate the target ('log_meter_reading') from the predictor columns.
y = data['log_meter_reading']
x = data.drop(columns='log_meter_reading')

In [None]:
# Store the listed feature columns and the target as 32-bit floats.
for float_col in ('year_built', 'air_temperature', 'horizsolar'):
    x[float_col] = x[float_col].astype(np.float32)
y = y.astype(np.float32)

### Splitting the data for training and validation

In [None]:
# Hold out 25% of the rows for validation; the fixed random_state makes the
# shuffle-based split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

### Hyperparameter Tuning

In [None]:
# Randomized hyperparameter search for CatBoost over tree count and tree depth.
# n_iter=5 samples 5 of the 9 possible combinations; each candidate is scored
# with 3-fold cross-validation on the training split.
cb=CatBoostRegressor()
# The key must be spelled exactly 'max_depth' (no embedded space); otherwise it
# is not a recognised CatBoost parameter and the search cannot set it.
params={'n_estimators':[500,1000,1500],
        'max_depth':[9,11,13]}
# scoring='neg_root_mean_squared_error' means scores are negated RMSE values,
# so the best candidate is the one whose score is closest to zero.
cb_clf=RandomizedSearchCV(cb,params,scoring='neg_root_mean_squared_error',n_jobs=-1,cv=3,verbose=15,n_iter=5,random_state=0)
cb_clf.fit(x_train,y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 107.6min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed: 174.0min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed: 174.1min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed: 174.3min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 214.3min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed: 244.4min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 244.5min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 279.6min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 375.9min remaining: 187.9min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elapsed: 412.5min remaining: 103.1min
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 559.4min finished


Learning rate set to 0.158396
0:	learn: 1.9385318	total: 4.04s	remaining: 1h 40m 51s
1:	learn: 1.8283171	total: 8.01s	remaining: 1h 40m 1s
2:	learn: 1.7367302	total: 11.9s	remaining: 1h 38m 38s
3:	learn: 1.6654208	total: 15.7s	remaining: 1h 37m 51s
4:	learn: 1.6092103	total: 19.5s	remaining: 1h 37m
5:	learn: 1.5626295	total: 23.5s	remaining: 1h 37m 20s
6:	learn: 1.5266319	total: 27.2s	remaining: 1h 36m 41s
7:	learn: 1.4997231	total: 30.9s	remaining: 1h 36m 8s
8:	learn: 1.4776806	total: 34.6s	remaining: 1h 35m 27s
9:	learn: 1.4604693	total: 38.4s	remaining: 1h 35m 24s
10:	learn: 1.4426976	total: 42.3s	remaining: 1h 35m 30s
11:	learn: 1.4302848	total: 46.1s	remaining: 1h 35m 20s
12:	learn: 1.4111088	total: 50s	remaining: 1h 35m 24s
13:	learn: 1.4008063	total: 53.5s	remaining: 1h 34m 39s
14:	learn: 1.3910297	total: 57.5s	remaining: 1h 34m 53s
15:	learn: 1.3772598	total: 1m 1s	remaining: 1h 34m 52s
16:	learn: 1.3692533	total: 1m 5s	remaining: 1h 34m 45s
17:	learn: 1.3628319	total: 1m 8s	re

RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=<catboost.core.CatBoostRegressor object at 0x7f32c16e6cd0>,
                   iid='deprecated', n_iter=5, n_jobs=-1,
                   param_distributions={'max_depth': [9, 11, 13],
                                        'n_estimators': [500, 1000, 1500]},
                   pre_dispatch='2*n_jobs', random_state=0, refit=True,
                   return_train_score=False,
                   scoring='neg_root_mean_squared_error', verbose=15)

#### Best parameters

In [None]:
# Display the hyperparameter combination that scored best in the randomized search.
cb_clf.best_params_

{'max_depth': 13, 'n_estimators': 1500}

#### Best score

In [None]:
# Display the best mean cross-validation score. The search is configured with
# scoring='neg_root_mean_squared_error', so this value is a negated RMSE.
cb_clf.best_score_

-0.618357995479036

### Fitting the model with the training data with best parameters

In [None]:
# Train a CatBoost regressor on the training split with the selected depth and
# tree count, running on GPU with an explicit learning rate.
cat_reg = CatBoostRegressor(
    n_estimators=1500,
    max_depth=13,
    learning_rate=0.1,
    task_type='GPU',
)
cat_reg.fit(x_train, y_train)

0:	learn: 1.9862532	total: 155ms	remaining: 3m 52s
1:	learn: 1.9587873	total: 249ms	remaining: 3m 6s
2:	learn: 1.8859754	total: 398ms	remaining: 3m 18s
3:	learn: 1.8853757	total: 427ms	remaining: 2m 39s
4:	learn: 1.8173939	total: 579ms	remaining: 2m 53s
5:	learn: 1.8156756	total: 638ms	remaining: 2m 38s
6:	learn: 1.7593108	total: 787ms	remaining: 2m 47s
7:	learn: 1.7589858	total: 817ms	remaining: 2m 32s
8:	learn: 1.7115840	total: 966ms	remaining: 2m 39s
9:	learn: 1.7113664	total: 995ms	remaining: 2m 28s
10:	learn: 1.6683162	total: 1.14s	remaining: 2m 34s
11:	learn: 1.6665846	total: 1.19s	remaining: 2m 28s
12:	learn: 1.6302068	total: 1.34s	remaining: 2m 33s
13:	learn: 1.6300827	total: 1.37s	remaining: 2m 25s
14:	learn: 1.5981558	total: 1.52s	remaining: 2m 30s
15:	learn: 1.5980736	total: 1.55s	remaining: 2m 23s
16:	learn: 1.5693295	total: 1.7s	remaining: 2m 28s
17:	learn: 1.5690453	total: 1.74s	remaining: 2m 23s
18:	learn: 1.5484085	total: 1.89s	remaining: 2m 27s
19:	learn: 1.5467531	tot

<catboost.core.CatBoostRegressor at 0x7fa3a629a850>

### Computing the RMSLE score on the validation data using the best parameters

In [None]:
# Root of the mean squared error between the validation target and the model's
# predictions; the target is 'log_meter_reading', so this is reported as RMSLE.
val_predictions = cat_reg.predict(x_val)
print('Validation RMSLE = ', np.sqrt(mean_squared_error(y_val, val_predictions)))

Train RMSLE =  0.8008933581505848


### Fitting the model on whole training data using best parameters

In [None]:
# Refit the same configuration on all available rows (training + validation)
# to obtain the model used for the test-set predictions.
cb_reg_final = CatBoostRegressor(
    n_estimators=1500,
    max_depth=13,
    learning_rate=0.1,
    task_type='GPU',
)
cb_reg_final.fit(x, y)

0:	learn: 1.9880144	total: 168ms	remaining: 4m 11s
1:	learn: 1.9858429	total: 237ms	remaining: 2m 57s
2:	learn: 1.9079497	total: 402ms	remaining: 3m 20s
3:	learn: 1.8878316	total: 472ms	remaining: 2m 56s
4:	learn: 1.8244648	total: 640ms	remaining: 3m 11s
5:	learn: 1.8239043	total: 678ms	remaining: 2m 48s
6:	learn: 1.7682591	total: 841ms	remaining: 2m 59s
7:	learn: 1.7678819	total: 878ms	remaining: 2m 43s
8:	learn: 1.7182281	total: 1.03s	remaining: 2m 51s
9:	learn: 1.7043340	total: 1.11s	remaining: 2m 45s
10:	learn: 1.6633792	total: 1.28s	remaining: 2m 53s
11:	learn: 1.6632040	total: 1.32s	remaining: 2m 43s
12:	learn: 1.6277702	total: 1.48s	remaining: 2m 49s
13:	learn: 1.6276505	total: 1.52s	remaining: 2m 41s
14:	learn: 1.5972960	total: 1.69s	remaining: 2m 47s
15:	learn: 1.5939135	total: 1.76s	remaining: 2m 43s
16:	learn: 1.5682296	total: 1.92s	remaining: 2m 47s
17:	learn: 1.5681766	total: 1.96s	remaining: 2m 41s
18:	learn: 1.5471162	total: 2.13s	remaining: 2m 45s
19:	learn: 1.5470815	t

<catboost.core.CatBoostRegressor at 0x7fa3a6431f90>

In [None]:
# Load the prepared test set and discard its 'index' column.
df = pd.read_feather(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/test_data_final.feather'
).drop(columns='index')

### Drop the unimportant features

In [None]:
# Remove (in place) the test-set columns that are not model inputs,
# including the 'row_id' identifier.
df.drop(
    columns=[
        'row_id',
        'site_id',
        'timestamp',
        'wind_speed',
        'wind_direction',
        'is_summer_month',
        'dew_temperature',
        'relative_humidity',
        'sea_level_pressure',
        'cloud_coverage',
        'precip_depth_1_hr',
        'busy_hours',
        'Sensible_Heat',
        'discomfort_index',
        'wind_chill',
        'month',
    ],
    inplace=True,
)

In [None]:
# Store the listed test-set feature columns as 32-bit floats.
for float_col in ('year_built', 'air_temperature', 'horizsolar'):
    df[float_col] = df[float_col].astype(np.float32)

### Predicting on test data

In [None]:
# Predict the test set with the model that was fitted on all training rows.
# NOTE(review): predictions are presumably on the log scale of the target — confirm.
y_test=cb_reg_final.predict(df)

In [None]:
# Apply expm1 to the predictions, then round them to 4 decimal places.
# Re-running this cell applies expm1 to y_test a second time.
y_test = np.expm1(y_test)
test = np.round(y_test, decimals=4)

In [None]:
# Build the submission frame with columns (row_id, meter_reading).
# `columns` must be an ordered list: a set has no defined order and is rejected
# by newer pandas versions ("columns cannot be a set").
test_df = pd.DataFrame(data=test, columns=['meter_reading'])
# NOTE(review): row_id is taken from the positional index, which assumes the
# test rows are ordered by row_id starting at 0 — confirm against the test file.
test_df['row_id'] = test_df.index
test_df = test_df[['row_id', 'meter_reading']]
test_df.head()

Unnamed: 0,row_id,meter_reading
0,0,138.0369
1,1,45.6951
2,2,25.3193
3,3,218.7875
4,4,948.0522


### Meter readings that are less than zero are set to zero because a meter reading cannot be less than zero

In [None]:
# Clamp negative predictions to zero with a single vectorised .loc assignment.
# This replaces a per-row Python loop that used chained indexing
# (test_df['meter_reading'][i] = 0), which triggers SettingWithCopyWarning and
# does not modify the frame when pandas copy-on-write is enabled.
test_df.loc[test_df['meter_reading'] < 0, 'meter_reading'] = 0

In [None]:
# Write the submission frame to CSV with a header row and without the index.
test_df.to_csv(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/catboostregressor.csv',
    header=True,
    index=False,
)