In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2 MB)
[K     |████████████████████████████████| 69.2 MB 8.3 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


In [None]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.utils import resample
from tqdm import tqdm
from lightgbm import LGBMRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge
import random

In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive/MyDrive
# that later cells read and write are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the feature-engineered training table from Drive and discard its stored 'index' column.
df = (
    pd.read_feather('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/train_feature_engineering.feather')
    .drop(columns=['index'])
)

### Drop the features which are not important

In [None]:
# Remove the columns that are not used as model inputs. This includes the raw
# 'meter_reading'; 'log_meter_reading' stays and is used as the target later on.
df = df.drop(
    columns=[
        'site_id', 'timestamp', 'wind_speed', 'wind_direction',
        'is_summer_month', 'dew_temperature', 'relative_humidity', 'meter_reading',
        'sea_level_pressure', 'cloud_coverage', 'precip_depth_1_hr', 'busy_hours',
        'Sensible_Heat', 'discomfort_index', 'wind_chill', 'month',
    ]
)

### Base models for the ensemble (K candidate regressors with fixed hyperparameters)

In [None]:
# Pool of base learners: one of these is picked at random for every bootstrap sample.
base_models = [
    LinearRegression(fit_intercept=True, normalize=True, copy_X=True),
    RandomForestRegressor(max_depth=9, n_estimators=80, n_jobs=-1),
    XGBRegressor(
        n_estimators=1500,
        max_depth=9,
        learning_rate=0.03,
        colsample_bytree=0.8,
        tree_method='gpu_hist',
        silent=True,
    ),
    CatBoostRegressor(
        max_depth=13,
        n_estimators=1500,
        task_type='GPU',
        learning_rate=0.1,
        silent=True,
    ),
    LGBMRegressor(
        n_estimators=1200,
        min_child_samples=100,
        max_depth=11,
        learning_rate=0.1,
        colsample_bytree=0.9,
        n_jobs=-1,
        silent=True,
    ),
    Ridge(solver="lsqr", fit_intercept=True, alpha=10000),
]

## Ensemble with 100 base models

In [None]:
# Keep the first 500000 rows and cut them by position: first 80% -> train, last 20% -> test.
df = df[:500000]
split_size = int(len(df)*0.8)
train = df[:split_size]
test = df[split_size:]

# Halve the training part: D1 is used to fit the base models, D2 is what they predict on
# to build the meta-model's training table.
D_split_size = int(len(train)*0.5)
D1 = train[:D_split_size]
D2 = train[D_split_size:]

# Separate the predictors from the float32 target for each partition.
D1_x, D1_y = D1.drop(columns='log_meter_reading'), D1['log_meter_reading'].astype(np.float32)
D2_x, D2_y = D2.drop(columns='log_meter_reading'), D2['log_meter_reading'].astype(np.float32)
test_x, test_y = test.drop(columns='log_meter_reading'), test['log_meter_reading'].astype(np.float32)

# Draw 100 bootstrap samples from D1 (10000 rows each, with replacement);
# the sample number doubles as the random seed.
bootstraps = [
    resample(D1_x, D1_y, replace=True, random_state=seed, n_samples=10000)
    for seed in tqdm(list(range(100)))
]
x_100_samples = [drawn_x for drawn_x, _ in bootstraps]
y_100_samples = [drawn_y for _, drawn_y in bootstraps]

100%|██████████| 100/100 [00:00<00:00, 555.26it/s]


In [None]:
# Notebook-level accumulators that every call to ensemble_regressor appends to.
D2_predict, test_predict, models = [], [], []


def ensemble_regressor(x, y, D2_x, test_x, model):
    """Fit one base model and record its predictions for D2 and for the test set.

    Parameters
    ----------
    x, y : training predictors and target (one bootstrap sample).
    D2_x : predictors of the D2 partition.
    test_x : predictors of the test partition.
    model : estimator exposing ``fit(x, y)`` and ``predict(X)``.

    Side effects
    ------------
    Appends a one-column DataFrame of D2 predictions to ``D2_predict``, a
    one-column DataFrame of test predictions to ``test_predict``, and whatever
    ``model.fit`` returned to ``models``. Nothing is returned.

    NOTE(review): scikit-learn style estimators return ``self`` from ``fit``,
    so passing the same instance twice presumably leaves ``models`` holding
    repeated references to one refitted object — confirm if ``models`` is used.
    """
    fitted = model.fit(x, y)
    for features, sink in ((D2_x, D2_predict), (test_x, test_predict)):
        sink.append(pd.DataFrame(fitted.predict(features)))
    models.append(fitted)

In [None]:
# Empty the shared accumulators first, so that re-running this cell does not
# stack a second batch of predictions on top of an earlier one.
D2_predict.clear()
test_predict.clear()
models.clear()

# Fit one randomly chosen base learner per bootstrap sample.
# NOTE: no seed is set for `random` in the visible notebook, so the mix of
# learners differs from run to run.
for i in tqdm(list(range(100))):
    ensemble_regressor(x_100_samples[i], y_100_samples[i], D2_x, test_x, model=random.choice(base_models))

100%|██████████| 100/100 [20:13<00:00, 12.14s/it]


In [None]:
# Build the meta-model training table: one column per base model, holding that
# model's predictions for D2. A single concat over the first 100 frames avoids
# the quadratic cost of growing the frame one column at a time.
D2_prediction = pd.concat(D2_predict[:100], axis=1)

100%|██████████| 99/99 [00:05<00:00, 17.62it/s]


In [None]:
# Build the meta-model test table: one column per base model, holding that
# model's predictions for the test set. A single concat over the first 100
# frames avoids the quadratic cost of growing the frame one column at a time.
test_prediction = pd.concat(test_predict[:100], axis=1)

100%|██████████| 99/99 [00:03<00:00, 29.32it/s]


In [None]:
# Persist the 100-model D2 prediction table to Drive (no header row, no index column).
D2_prediction.to_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/D2_prediction_100.csv',header=False, index=False)

In [None]:
# Persist the 100-model test prediction table to Drive (no header row, no index column).
test_prediction.to_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/test_prediction_100.csv',header=False, index=False)

In [None]:
# Reload the saved 100-model prediction tables; the files were written without a header row.
D2_prediction = pd.read_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/D2_prediction_100.csv', header=None)
test_prediction = pd.read_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/test_prediction_100.csv',header=None)

In [None]:
# Plain NumPy views of the two prediction tables, used as meta-model inputs.
D2_pred, test_pred = D2_prediction.values, test_prediction.values

In [None]:
# Candidate meta-models that best_model() fits on the stacked base-model predictions.
# NOTE(review): this rebinds the name `base_models`. When the cells are run top to
# bottom, the later `random.choice(base_models)` loops for the 300- and 500-model
# runs therefore draw from this four-estimator list instead of the six-estimator
# list defined earlier — confirm that this is intended.
base_models = [RandomForestRegressor(max_depth=9,n_estimators=80,n_jobs=-1),
               XGBRegressor(n_estimators=1500,max_depth=9,learning_rate=0.03,colsample_bytree=0.8,tree_method='gpu_hist',silent=True),
               CatBoostRegressor(max_depth=13,n_estimators=1500,task_type='GPU',learning_rate=0.1,silent=True),
               LGBMRegressor(n_estimators=1200,min_child_samples=100,max_depth=11,learning_rate=0.1,colsample_bytree=0.9,n_jobs=-1,silent=True)]


def best_model(base_models, D2_pred, D2_y, test_pred):
    """Fit every candidate meta-model and print its train and test error.

    Parameters
    ----------
    base_models : iterable of estimators exposing ``fit`` and ``predict``.
    D2_pred : inputs the candidates are fitted on and scored against for the train figure.
    D2_y : target matching ``D2_pred``.
    test_pred : inputs used for the test figure.

    The test figure is scored against ``test_y``, which is NOT a parameter: it
    is read from the notebook's global scope. Each figure is the square root of
    ``mean_squared_error`` between the target and the prediction; the output
    labels it "rmsle". Nothing is returned; results are only printed.
    """
    for candidate in tqdm(base_models):
        fitted = candidate.fit(D2_pred, D2_y)
        train_score, test_score = [
            np.sqrt(mean_squared_error(truth, fitted.predict(features)))
            for truth, features in ((D2_y, D2_pred), (test_y, test_pred))
        ]
        print('\nTrain_rmsle : '+str(train_score)+'  Test_rmsle : '+str(test_score))

In [None]:
# Compare the candidate meta-models on the 100-model prediction tables.
best_model(base_models,D2_pred,D2_y,test_pred)

 25%|██▌       | 1/4 [06:02<18:06, 362.14s/it]


Train_rmsle : 0.6133195988732062  Test_rmsle : 0.7704567825570275


 50%|█████     | 2/4 [06:40<08:49, 264.93s/it]


Train_rmsle : 0.16304833  Test_rmsle : 0.6615613


 75%|███████▌  | 3/4 [07:26<03:19, 199.34s/it]


Train_rmsle : 0.3385331204957171  Test_rmsle : 0.6833999892428813


100%|██████████| 4/4 [09:02<00:00, 135.61s/it]


Train_rmsle : 0.3406604850994118  Test_rmsle : 0.6630891887206014





## Ensemble with 300 base models

In [None]:
# Keep the first 500000 rows and cut them by position: first 80% -> train, last 20% -> test.
df = df[:500000]
split_size = int(len(df)*0.8)
train = df[:split_size]
test = df[split_size:]

# Halve the training part: D1 is used to fit the base models, D2 is what they predict on
# to build the meta-model's training table.
D_split_size = int(len(train)*0.5)
D1 = train[:D_split_size]
D2 = train[D_split_size:]

# Separate the predictors from the float32 target for each partition.
D1_x, D1_y = D1.drop(columns='log_meter_reading'), D1['log_meter_reading'].astype(np.float32)
D2_x, D2_y = D2.drop(columns='log_meter_reading'), D2['log_meter_reading'].astype(np.float32)
test_x, test_y = test.drop(columns='log_meter_reading'), test['log_meter_reading'].astype(np.float32)

# Draw 300 bootstrap samples from D1 (10000 rows each, with replacement);
# the sample number doubles as the random seed.
bootstraps = [
    resample(D1_x, D1_y, replace=True, random_state=seed, n_samples=10000)
    for seed in tqdm(list(range(300)))
]
x_300_samples = [drawn_x for drawn_x, _ in bootstraps]
y_300_samples = [drawn_y for _, drawn_y in bootstraps]

100%|██████████| 300/300 [00:00<00:00, 549.69it/s]


In [None]:
# The accumulators are shared notebook-level lists and still hold the results of
# any earlier run. Empty them first; otherwise the "first 300" predictions picked
# up below would start with the leftovers of the previous ensemble.
D2_predict.clear()
test_predict.clear()
models.clear()

# Fit one randomly chosen learner from `base_models` per bootstrap sample.
# NOTE: no seed is set for `random` in the visible notebook, so the mix of
# learners differs from run to run.
for i in tqdm(list(range(300))):
    ensemble_regressor(x_300_samples[i], y_300_samples[i], D2_x, test_x, model=random.choice(base_models))

In [None]:
# Build the meta-model training table: one column per base model, holding that
# model's predictions for D2. A single concat over the first 300 frames avoids
# the quadratic cost of growing the frame one column at a time.
D2_prediction = pd.concat(D2_predict[:300], axis=1)

100%|██████████| 299/299 [01:05<00:00,  4.57it/s]


In [None]:
# Build the meta-model test table: one column per base model, holding that
# model's predictions for the test set. A single concat over the first 300
# frames avoids the quadratic cost of growing the frame one column at a time.
test_prediction = pd.concat(test_predict[:300], axis=1)

100%|██████████| 299/299 [00:36<00:00,  8.23it/s]


In [None]:
# Persist the 300-model D2 prediction table to Drive (no header row, no index column).
D2_prediction.to_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/D2_prediction_300.csv',header=False, index=False)

In [None]:
# Persist the 300-model test prediction table to Drive (no header row, no index column).
test_prediction.to_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/test_prediction_300.csv',header=False, index=False)

In [None]:
# Reload the saved 300-model prediction tables; the files were written without a header row.
# The test file name must match the one written above ('test_prediction_300.csv');
# the previous spelling contained a stray space and pointed at a non-existent file.
D2_prediction = pd.read_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/D2_prediction_300.csv', header=None)
test_prediction = pd.read_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/test_prediction_300.csv', header=None)

In [None]:
# Plain NumPy views of the two prediction tables, used as meta-model inputs.
D2_pred, test_pred = D2_prediction.values, test_prediction.values

In [None]:
# Compare the candidate meta-models on the 300-model prediction tables.
best_model(base_models,D2_pred,D2_y,test_pred)

 25%|██▌       | 1/4 [18:21<55:05, 1101.84s/it]


Train_rmsle : 0.5732932979856016  Test_rmsle : 0.7466561663273591


 50%|█████     | 2/4 [19:18<26:16, 788.37s/it] 


Train_rmsle : 0.13176973  Test_rmsle : 0.6440852


 75%|███████▌  | 3/4 [20:50<09:39, 579.47s/it]


Train_rmsle : 0.37637886100006446  Test_rmsle : 0.6870980567884488


100%|██████████| 4/4 [25:09<00:00, 377.29s/it]


Train_rmsle : 0.30131019195602926  Test_rmsle : 0.6481653139252237





## Ensemble with 500 base models

In [None]:
# Keep the first 500000 rows and cut them by position: first 80% -> train, last 20% -> test.
df = df[:500000]
split_size = int(len(df)*0.8)
train = df[:split_size]
test = df[split_size:]

# Halve the training part: D1 is used to fit the base models, D2 is what they predict on
# to build the meta-model's training table.
D_split_size = int(len(train)*0.5)
D1 = train[:D_split_size]
D2 = train[D_split_size:]

# Separate the predictors from the float32 target for each partition.
D1_x, D1_y = D1.drop(columns='log_meter_reading'), D1['log_meter_reading'].astype(np.float32)
D2_x, D2_y = D2.drop(columns='log_meter_reading'), D2['log_meter_reading'].astype(np.float32)
test_x, test_y = test.drop(columns='log_meter_reading'), test['log_meter_reading'].astype(np.float32)

# Draw 500 bootstrap samples from D1 (10000 rows each, with replacement);
# the sample number doubles as the random seed.
bootstraps = [
    resample(D1_x, D1_y, replace=True, random_state=seed, n_samples=10000)
    for seed in tqdm(list(range(500)))
]
x_500_samples = [drawn_x for drawn_x, _ in bootstraps]
y_500_samples = [drawn_y for _, drawn_y in bootstraps]

100%|██████████| 500/500 [00:00<00:00, 594.47it/s]


In [None]:
# The accumulators are shared notebook-level lists and still hold the results of
# any earlier run. Empty them first; otherwise the "first 500" predictions picked
# up below would start with the leftovers of the previous ensembles.
D2_predict.clear()
test_predict.clear()
models.clear()

# Fit one randomly chosen learner from `base_models` per bootstrap sample.
# NOTE: no seed is set for `random` in the visible notebook, so the mix of
# learners differs from run to run.
for i in tqdm(list(range(500))):
    ensemble_regressor(x_500_samples[i], y_500_samples[i], D2_x, test_x, model=random.choice(base_models))

100%|██████████| 500/500 [1:45:58<00:00, 12.72s/it]


In [None]:
# Build the meta-model training table: one column per base model, holding that
# model's predictions for D2. A single concat over the first 500 frames avoids
# the quadratic cost of growing the frame one column at a time.
D2_prediction = pd.concat(D2_predict[:500], axis=1)

100%|██████████| 499/499 [03:25<00:00,  2.43it/s]


In [None]:
# Build the meta-model test table: one column per base model, holding that
# model's predictions for the test set. A single concat over the first 500
# frames avoids the quadratic cost of growing the frame one column at a time.
test_prediction = pd.concat(test_predict[:500], axis=1)

100%|██████████| 499/499 [01:42<00:00,  4.88it/s]


In [None]:
# Persist the 500-model D2 prediction table to Drive (no header row, no index column).
# Unlike the 100- and 300-model files, this file name carries no model-count suffix.
D2_prediction.to_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/D2_prediction.csv',header=False, index=False)

In [None]:
# Persist the 500-model test prediction table to Drive (no header row, no index column).
# Unlike the 100- and 300-model files, this file name carries no model-count suffix.
test_prediction.to_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/test_prediction.csv',header=False, index=False)

In [None]:
# Reload the saved 500-model prediction tables; the files were written without a header row.
D2_prediction = pd.read_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/D2_prediction.csv', header=None)
test_prediction = pd.read_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/Ensemble_Regressor/test_prediction.csv',header=None)

In [None]:
# NumPy array of the base-model predictions for the test set (meta-model test input).
test_pred = test_prediction.values

In [None]:
# NumPy array of the base-model predictions for D2 (meta-model training input).
D2_pred = D2_prediction.values

In [None]:
# Candidate meta-models to compare on the stacked predictions; best_model() fits
# each one and prints its train and test error.
# NOTE(review): this rebinds `base_models`, the same name the base-learner
# sampling loops read via `random.choice(base_models)` — a distinct name would
# avoid the ambiguity.
base_models = [RandomForestRegressor(max_depth=9,n_estimators=80,n_jobs=-1),
               XGBRegressor(n_estimators=1500,max_depth=9,learning_rate=0.03,colsample_bytree=0.8,tree_method='gpu_hist',silent=True),
               CatBoostRegressor(max_depth=13,n_estimators=1500,task_type='GPU',learning_rate=0.1,silent=True),
               LGBMRegressor(n_estimators=1200,min_child_samples=100,max_depth=11,learning_rate=0.1,colsample_bytree=0.9,n_jobs=-1,silent=True)]


def best_model(base_models, D2_pred, D2_y, test_pred):
    """Fit every candidate meta-model and print its train and test error.

    Parameters
    ----------
    base_models : iterable of estimators exposing ``fit`` and ``predict``.
    D2_pred : inputs the candidates are fitted on and scored against for the train figure.
    D2_y : target matching ``D2_pred``.
    test_pred : inputs used for the test figure.

    The test figure is scored against ``test_y``, which is NOT a parameter: it
    is read from the notebook's global scope. Each figure is the square root of
    ``mean_squared_error`` between the target and the prediction; the output
    labels it "rmsle". Nothing is returned; results are only printed.
    """
    for candidate in tqdm(base_models):
        fitted = candidate.fit(D2_pred, D2_y)
        train_score, test_score = [
            np.sqrt(mean_squared_error(truth, fitted.predict(features)))
            for truth, features in ((D2_y, D2_pred), (test_y, test_pred))
        ]
        print('\nTrain_rmsle : '+str(train_score)+'  Test_rmsle : '+str(test_score))

In [None]:
# Compare the candidate meta-models on the 500-model prediction tables.
best_model(base_models,D2_pred,D2_y,test_pred)

 25%|██▌       | 1/4 [29:55<1:29:45, 1795.32s/it]


Train_rmsle : 0.5623341999136532  Test_rmsle : 0.7417063161225945


 50%|█████     | 2/4 [31:14<42:40, 1280.47s/it]  


Train_rmsle : 0.12772349  Test_rmsle : 0.6547555


 75%|███████▌  | 3/4 [33:34<15:38, 938.44s/it] 


Train_rmsle : 0.3652034464536798  Test_rmsle : 0.6837101870451242


100%|██████████| 4/4 [40:21<00:00, 605.41s/it]


Train_rmsle : 0.29219528263101946  Test_rmsle : 0.6438341092410215





In [None]:
from prettytable import PrettyTable

# Summary of the meta-model comparison for each ensemble size. The figures are
# copied by hand from the best_model() outputs above, cut to three decimals.
myTable = PrettyTable(["No of base_models","Model","Train RMSLE","Test RMSLE"])

# (number of base models, meta-model, train RMSLE, test RMSLE)
summary_rows = [
    ("100", "RandomForestRegressor", "0.613", "0.77"),
    ("100", "XGBRegressor", "0.163", "0.661"),
    ("100", "CatBoostRegressor", "0.338", "0.683"),
    ("100", "LGBMRegressor", "0.34", "0.663"),
    ("300", "RandomForestRegressor", "0.573", "0.746"),
    ("300", "XGBRegressor", "0.131", "0.644"),
    ("300", "CatBoostRegressor", "0.376", "0.687"),
    ("300", "LGBMRegressor", "0.301", "0.648"),
    # The test figure previously carried a stray leading space (" 0.741"),
    # which pushed this row's last column out of alignment.
    ("500", "RandomForestRegressor", "0.562", "0.741"),
    ("500", "XGBRegressor", "0.127", "0.654"),
    ("500", "CatBoostRegressor", "0.365", "0.683"),
    ("500", "LGBMRegressor", "0.292", "0.643"),
]
for summary_row in summary_rows:
    myTable.add_row(list(summary_row))

print(myTable)

+-------------------+-----------------------+-------------+------------+
| No of base_models |         Model         | Train RMSLE | Test RMSLE |
+-------------------+-----------------------+-------------+------------+
|        100        | RandomForestRegressor |    0.613    |    0.77    |
|        100        |      XGBRegressor     |    0.163    |   0.661    |
|        100        |   CatBoostRegressor   |    0.338    |   0.683    |
|        100        |     LGBMRegressor     |     0.34    |   0.663    |
|        300        | RandomForestRegressor |    0.573    |   0.746    |
|        300        |      XGBRegressor     |    0.131    |   0.644    |
|        300        |   CatBoostRegressor   |    0.376    |   0.687    |
|        300        |     LGBMRegressor     |    0.301    |   0.648    |
|        500        | RandomForestRegressor |    0.562    |    0.741   |
|        500        |      XGBRegressor     |    0.127    |   0.654    |
|        500        |   CatBoostRegressor   |    0.

*  The ensemble of 500 base models with an LGBMRegressor meta-model gives the best result (lowest test RMSLE, 0.643).

In [None]:
# Hyperparameter search for the chosen meta-model (LGBMRegressor) on the
# stacked D2 predictions: 10 random candidates, 3-fold CV, scored by negative RMSE.
# `tree_method='gpu_hist'` was dropped: it is an XGBoost option that LightGBM
# does not recognise, so it had no effect on the fitted models.
lgbm_clf = LGBMRegressor()
params = {'n_estimators':[800,1200],
        'learning_rate':[0.01,0.05,0.1],
        'max_depth':[7,9,11],
        'colsample_bytree':[0.8,0.9,1]}
lgbm_model = RandomizedSearchCV(lgbm_clf,params,scoring='neg_root_mean_squared_error',n_jobs=-1,cv=3,verbose=10,random_state=0,n_iter=10)
lgbm_model.fit(D2_pred,D2_y)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 32.8min
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed: 47.6min remaining:  5.3min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 50.3min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=LGBMRegressor(boosting_type='gbdt',
                                           class_weight=None,
                                           colsample_bytree=1.0,
                                           importance_type='split',
                                           learning_rate=0.1, max_depth=-1,
                                           min_child_samples=20,
                                           min_child_weight=0.001,
                                           min_split_gain=0.0, n_estimators=100,
                                           n_jobs=-1, num_leaves=31,
                                           objective=None, random_state=None,
                                           reg_alpha=0.0, reg_lambda=0.0,
                                           silen...
                                           subsample_for_bin=200000,
                                           subsample_freq=0,
      

In [None]:
# Hyperparameter combination with the best cross-validated score in the search.
lgbm_model.best_params_

{'colsample_bytree': 0.9,
 'learning_rate': 0.1,
 'max_depth': 9,
 'n_estimators': 1200}

In [None]:
# Best mean cross-validated score; negative because the scorer is 'neg_root_mean_squared_error'.
lgbm_model.best_score_

-0.5594135231862865

In [None]:
# Final meta-model: an LGBMRegressor configured with the best hyperparameters
# found by the randomized search, to be trained on the 500 base-model predictions.
# `tree_method='gpu_hist'` (an XGBoost option) and `reg='regression'` were dropped:
# LightGBM does not recognise either name, so they had no effect on the model.
lgbm = LGBMRegressor(n_estimators=1200,max_depth=9,learning_rate=0.1,colsample_bytree=0.9,verbosity=2,silent=True,n_jobs=-1)

In [None]:
# Train the meta-model: inputs are the base-model predictions for D2, target is D2's log meter reading.
lgbm.fit(D2_pred, D2_y)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
              importance_type='split', learning_rate=0.1, max_depth=9,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1200, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg='regression', reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0,
              tree_method='gpu_hist', verbosity=2)

In [None]:
# Final prediction: the test rows have already been passed through the 500 base
# models to build `test_pred` (one column of predictions per base model); the
# meta-model maps those columns to the final estimate.
lgbm_pred_values = (lgbm.predict(test_pred))

## Training RMSLE

In [None]:
# Root of the mean squared error between D2_y and the meta-model's predictions
# on D2_pred, i.e. on the same data the meta-model was fitted on.
print('Train RMSLE = ',np.sqrt(mean_squared_error((D2_y) ,(lgbm.predict(D2_pred)))))

Train RMSLE =  0.25164114338143234


## Testing RMSLE

In [None]:
# Root of the mean squared error between test_y and the meta-model's predictions for the test set.
print('Test RMSLE = ',np.sqrt(mean_squared_error(test_y , lgbm_pred_values)))

Test RMSLE =  0.6775408288980387
