In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Read the feature-engineered training set and discard its 'index' column.
data = pd.read_feather(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/train_feature_engineering.feather'
).drop(columns='index')

### Drop the features which are not important

In [None]:
# Remove the columns that are not fed to the model. The raw 'meter_reading'
# is dropped as well; 'log_meter_reading' stays and is used as the target.
data = data.drop(
    columns=[
        'site_id',
        'timestamp',
        'wind_speed',
        'wind_direction',
        'is_summer_month',
        'dew_temperature',
        'relative_humidity',
        'meter_reading',
        'sea_level_pressure',
        'cloud_coverage',
        'precip_depth_1_hr',
        'busy_hours',
        'Sensible_Heat',
        'discomfort_index',
        'wind_chill',
        'month',
    ]
)

In [None]:
# Target is the 'log_meter_reading' column; features are every other column.
y = data['log_meter_reading']
x = data.drop(columns='log_meter_reading')

### Splitting the data for training and validation

In [None]:
# Hold out 25% of the rows for validation; the split is seeded for repeatability.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

### Hyperparameter Tuning

In [None]:
# Randomized search over a small random-forest grid: 5 of the 9 possible
# (n_estimators, max_depth) combinations are sampled and each is scored with
# 3-fold cross-validation using negative RMSE (higher is better).
#
# The forest itself is seeded (random_state=0), matching the seeded split and
# the seeded search, so that the selected parameters and scores are
# reproducible from a fresh kernel.
# NOTE(review): the forest uses n_jobs=-1 while the search runs 8 fits in
# parallel; this may oversubscribe the CPU -- confirm on the target machine.
rf = RandomForestRegressor(n_jobs=-1, random_state=0)
params = {
    'n_estimators': [60, 80, 100],
    'max_depth': [5, 7, 9],
}
rf_clf = RandomizedSearchCV(
    rf,
    params,
    scoring='neg_root_mean_squared_error',
    n_jobs=8,
    cv=3,
    verbose=15,
    n_iter=5,
    random_state=0,
)
rf_clf.fit(x_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed: 126.2min
[Parallel(n_jobs=8)]: Done   2 out of  15 | elapsed: 129.1min remaining: 839.1min
[Parallel(n_jobs=8)]: Done   4 out of  15 | elapsed: 157.6min remaining: 433.4min
[Parallel(n_jobs=8)]: Done   6 out of  15 | elapsed: 206.5min remaining: 309.8min
[Parallel(n_jobs=8)]: Done   8 out of  15 | elapsed: 216.0min remaining: 189.0min
[Parallel(n_jobs=8)]: Done  10 out of  15 | elapsed: 293.4min remaining: 146.7min
[Parallel(n_jobs=8)]: Done  12 out of  15 | elapsed: 302.4min remaining: 75.6min
[Parallel(n_jobs=8)]: Done  15 out of  15 | elapsed: 345.9min finished


#### Best parameters

In [None]:
# Display the hyperparameter combination selected by the randomized search.
rf_clf.best_params_

{'max_depth': 9, 'n_estimators': 80}

#### Best score

In [None]:
# Display the cross-validated score of the selected candidate. The value is
# negative because the search was scored with 'neg_root_mean_squared_error'.
rf_clf.best_score_

-1.4816741717505504

### Fitting the model with the training data with best parameters

In [None]:
# Random forest configured with the best hyperparameters reported by the
# search (max_depth=9, n_estimators=80). The estimator is seeded so that the
# fit, and therefore the validation score computed from it, is reproducible.
rf_model = RandomForestRegressor(
    max_depth=9,
    n_estimators=80,
    n_jobs=-1,
    random_state=0,
)

In [None]:
# Train the tuned forest on the training split only.
rf_model.fit(X=x_train, y=y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=9, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=80, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

### Calculating the RMSLE on the validation data using the best parameters

In [None]:
# Predict the target for the held-out validation rows.
preds = rf_model.predict(X=x_val)

In [None]:
# Root of the mean squared error between the validation targets and the
# predictions (both are on the 'log_meter_reading' scale).
validation_rmsle = np.sqrt(mean_squared_error(y_true=y_val, y_pred=preds))
print('Validation RMSLE = ', validation_rmsle)

Validation RMSLE =  1.4823299791945617


### Fitting the model on whole training data using best parameters

In [None]:
# Final model: same hyperparameters as the validated model, but trained on
# the full data set (x, y) rather than only the training split. Seeded so the
# resulting predictions can be reproduced.
rf_model_final = RandomForestRegressor(
    max_depth=9,
    n_estimators=80,
    n_jobs=-1,
    random_state=0,
)
rf_model_final.fit(x, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=9, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=80, n_jobs=-1, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [None]:
# Read the prepared test set and discard its 'index' column.
df = pd.read_feather(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/test_data_final.feather'
).drop(columns='index')

### Drop the unimportant features

In [None]:
# Remove 'row_id' and the columns that are not fed to the model from the
# test frame.
df = df.drop(
    columns=[
        'row_id',
        'site_id',
        'timestamp',
        'wind_speed',
        'wind_direction',
        'is_summer_month',
        'dew_temperature',
        'relative_humidity',
        'sea_level_pressure',
        'cloud_coverage',
        'precip_depth_1_hr',
        'busy_hours',
        'Sensible_Heat',
        'discomfort_index',
        'wind_chill',
        'month',
    ]
)

### Predicting on test data

In [None]:
# Predict with the final model for every row of the test frame.
y_test = rf_model_final.predict(X=df)

In [None]:
# Apply exp(x) - 1 to the predictions.
# NOTE(review): presumably this inverts a log1p transform used to build
# 'log_meter_reading' -- confirm against the feature-engineering step.
# WARNING: this cell overwrites y_test with a function of itself, so running
# it more than once applies expm1 repeatedly; re-run the prediction cell first.
y_test=np.expm1(y_test)

In [None]:
# Keep four decimal places in the predicted readings.
test = np.round(y_test, decimals=4)

In [None]:
# Build the submission frame with columns ['row_id', 'meter_reading'].
# The column label is passed as a list: the original passed a set literal,
# which is unordered and is rejected by recent pandas versions.
# NOTE(review): row_id is taken from the positional index, which assumes the
# test rows are ordered by row_id starting at 0 -- confirm, since the frame's
# own 'row_id' column was dropped before prediction.
test_df = pd.DataFrame(data=test, columns=['meter_reading'])
test_df['row_id'] = test_df.index
test_df = test_df[['row_id', 'meter_reading']]
test_df.head()

Unnamed: 0,row_id,meter_reading
0,0,257.1415
1,1,127.9034
2,2,17.6007
3,3,360.6222
4,4,1104.9047


### Meter readings that are less than zero are set to zero because meter readings cannot be less than zero

In [None]:
# Clamp negative predictions to zero in one vectorised step.
# This replaces a Python loop that used chained indexing
# (test_df['meter_reading'][i] = 0), which triggers SettingWithCopyWarning,
# does not update the frame under pandas Copy-on-Write, and is slow on large
# frames. Values that are already >= 0 (and NaN) are left unchanged.
test_df['meter_reading'] = test_df['meter_reading'].clip(lower=0)

In [None]:
# Write the submission file with a header row and without the index column.
test_df.to_csv(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/rf.csv',
    index=False,
    header=True,
)