In [None]:
#importing libraries
import warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')

In [None]:
# Load the feature-engineered training set and discard the 'index' column
# that is stored in the feather file.
data = pd.read_feather(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/train_feature_engineering.feather'
).drop(columns='index')

### Drop the features which are not important

In [None]:
# Columns excluded from modelling. 'meter_reading' is removed here as well;
# the following cell uses 'log_meter_reading' as the target instead.
data = data.drop(
    columns=[
        'site_id',
        'timestamp',
        'wind_speed',
        'wind_direction',
        'is_summer_month',
        'dew_temperature',
        'relative_humidity',
        'meter_reading',
        'sea_level_pressure',
        'cloud_coverage',
        'precip_depth_1_hr',
        'busy_hours',
        'Sensible_Heat',
        'discomfort_index',
        'wind_chill',
        'month',
    ]
)

In [None]:
# Separate the target (log-scale meter reading) from the predictor columns.
y = data['log_meter_reading']
x = data.drop(columns='log_meter_reading')

### Splitting the data for training and validation

In [None]:
# Hold out 25% of the rows for validation; random_state fixes the shuffle.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

### Hyperparameter Tuning

In [None]:
# Randomised search over XGBoost hyperparameters: 10 sampled candidates,
# each scored by 3-fold cross-validation using negated RMSE.
xgb_clf = XGBRegressor(tree_method='gpu_hist')
params = {
    'n_estimators': [1000, 1500, 2000],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [3, 5, 7, 9],
    'colsample_bytree': [0.5, 0.8, 0.9, 1],
}
xgb_model = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=params,
    n_iter=10,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    random_state=0,
    verbose=10,
)
xgb_model.fit(x_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 18.9min
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed: 32.3min remaining:  3.6min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 34.5min finished




RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=1,
                                          colsample_bynode=1,
                                          colsample_bytree=1, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=100,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          random_state=0, reg_alpha=...
                                          seed=None, silent=None, subsample=1,
                                          tree_method='gpu_hist', verbosity=1),
                   iid='depre

#### Best parameters

In [None]:
# Hyperparameter combination selected by the randomised search
xgb_model.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.03,
 'max_depth': 9,
 'n_estimators': 2000}

#### Best score

In [None]:
# Cross-validated score of the selected candidate; it is negative because the
# search was configured with scoring='neg_root_mean_squared_error'
xgb_model.best_score_

-0.670367439587911

### Fitting the model with the training data with best parameters

In [None]:
# Regressor configured with the hyperparameters reported by the search above.
xgb = XGBRegressor(
    n_estimators=2000,
    max_depth=9,
    learning_rate=0.03,
    colsample_bytree=0.8,
    tree_method='gpu_hist',
)

In [None]:
# Train on the 75% training split only, keeping x_val / y_val unseen.
xgb.fit(x_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0,
             importance_type='gain', learning_rate=0.03, max_delta_step=0,
             max_depth=9, min_child_weight=1, missing=None, n_estimators=2000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, tree_method='gpu_hist', verbosity=1)

### Calculation of RMSLE score on validation data predictions using best parameters

In [None]:
# Predict the log-scale target for the held-out validation rows
preds = xgb.predict(x_val)

In [None]:
# RMSE computed on the log-scale target, reported as the validation RMSLE.
val_rmsle = np.sqrt(mean_squared_error(y_val, preds))
print('Validation RMSLE = ', val_rmsle)

Validation RMSLE =  0.67230785


### Fitting the model on whole training data using best parameters

In [None]:
# Fresh regressor with the same tuned hyperparameters, to be trained on all rows.
xgb_final = XGBRegressor(
    n_estimators=2000,
    max_depth=9,
    learning_rate=0.03,
    colsample_bytree=0.8,
    tree_method='gpu_hist',
)

In [None]:
# Fit on the complete training set (training and validation rows together).
xgb_final.fit(x, y)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0,
             importance_type='gain', learning_rate=0.03, max_delta_step=0,
             max_depth=9, min_child_weight=1, missing=None, n_estimators=2000,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, tree_method='gpu_hist', verbosity=1)

In [None]:
# Persist the final model to Drive with XGBoost's own save_model, under a .json name
xgb_final.save_model('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/xgb_model.json')


In [None]:
# Second save_model copy of the same model, written under a .sav file name
xgb_final.save_model('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/xgb.sav')

In [None]:
# Also persist the fitted estimator object with joblib so that it can be
# reloaded later for making predictions on test data.
# (joblib is imported in the notebook's import cell at the top.)
filename = '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/xgb_model.sav'
joblib.dump(xgb_final, filename)

['/content/drive/MyDrive/ashrae_Great_Energy_Prediction/xgb_model.sav']

In [None]:
# Load the prepared test set and discard the 'index' column stored in the
# feather file.
df = pd.read_feather(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/test_data_final.feather'
).drop(columns='index')

### Drop the unimportant features

In [None]:
# Remove 'row_id' plus the same feature columns that were excluded from the
# training frame, so the test frame carries only the model's predictors.
df = df.drop(
    columns=[
        'row_id',
        'site_id',
        'timestamp',
        'wind_speed',
        'wind_direction',
        'is_summer_month',
        'dew_temperature',
        'relative_humidity',
        'sea_level_pressure',
        'cloud_coverage',
        'precip_depth_1_hr',
        'busy_hours',
        'Sensible_Heat',
        'discomfort_index',
        'wind_chill',
        'month',
    ]
)

### Predicting on test data

In [None]:
# Inspect the feature values of the training row labelled 0
x.loc[0]

building_id          105.000000
meter                  0.000000
primary_use            0.000000
square_feet        50623.000000
year_built          1960.000000
day                    1.000000
air_temperature        3.800781
hour                   0.000000
weekday                4.000000
is_winter_month        1.000000
is_pub_holiday         1.000000
is_weekday             0.000000
horizsolar             0.000000
Name: 0, dtype: float64

In [None]:
# Target value of the first training row, as a one-element NumPy array.
m = y.iloc[:1].to_numpy()

In [None]:
# Display the log-scale target value extracted above
m

array([3.191], dtype=float16)

In [None]:
# Apply exp(v) - 1 to view the reading on its original scale
# (presumably the target was built with np.log1p -- confirm in the
# feature-engineering step)
np.expm1(m)

array([23.33], dtype=float16)

In [None]:
# Take the last training row as a one-row DataFrame to use as a query point.
p = x.iloc[-1:]

In [None]:
# Renumber the single-row frame from 0 and discard the old row label.
# A plain reset_index() would keep that label as an extra 'index' column,
# which is not one of the model's feature columns.
p = p.reset_index(drop=True)

In [None]:
# Write the query point to the current working directory without the index.
p.to_csv('querypoint.csv', index=False)

In [None]:
# Read the query point back from the same relative path it was written to,
# so this cell does not depend on the working directory being /content.
c = pd.read_csv('querypoint.csv')
c

Unnamed: 0,building_id,meter,primary_use,square_feet,year_built,day,air_temperature,hour,weekday,is_winter_month,is_pub_holiday,is_weekday,horizsolar
0,1448,0,6,92271,2001.0,366,1.7,23,5,1,0,0,0.0


In [None]:
# Log-scale predictions for every test row from the model fitted on all
# training data.
y_test = xgb_final.predict(df)

In [None]:
# Map the log-scale predictions back with exp(v) - 1.
# NOTE: y_test is overwritten, so re-running this cell alone applies the
# transform a second time.
y_test = np.expm1(y_test)

In [None]:
# Keep four decimal places for the submission values.
test = np.round(y_test, decimals=4)

In [None]:
# Build the submission frame. Column labels are passed as a list: a set has
# no defined order and newer pandas versions reject a set for `columns`.
test_df = pd.DataFrame(data=test, columns=['meter_reading'])
# NOTE(review): row_id is taken from the positional index, which assumes the
# test rows are still in row_id order -- confirm against the original test file.
test_df['row_id'] = test_df.index
test_df = test_df[['row_id', 'meter_reading']]
test_df.head()

Unnamed: 0,row_id,meter_reading
0,0,166.216293
1,1,86.510902
2,2,4.4275
3,3,346.088196
4,4,2271.125977


### Meter_readings which are less than zero are set to zero because meter readings cannot be less than zero

In [None]:
# Clamp negative predictions to zero in one vectorised assignment.
# .loc is used rather than chained indexing (test_df['meter_reading'][i] = 0),
# which pandas does not guarantee to write back into test_df.
test_df.loc[test_df['meter_reading'] < 0, 'meter_reading'] = 0

In [None]:
# Write the submission file (row_id, meter_reading) to Drive with a header
# row and without the DataFrame index.
test_df.to_csv(
    '/content/drive/MyDrive/ashrae_Great_Energy_Prediction/xgboost.csv',
    index=False,
    header=True,
)