In [1]:
 #importing libraries
import pandas as pd
import numpy as np
from math import sqrt
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor,Ridge,ElasticNet
from tqdm import tqdm
import tensorflow as tf

In [2]:
#loading training data
data = pd.read_feather('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/train_feature_engineering.feather')
data.drop('index',axis=1,inplace=True)

### Drop the features which are not important

In [3]:
data.drop(['site_id','timestamp','wind_speed','wind_direction','is_summer_month','dew_temperature','relative_humidity','meter_reading','sea_level_pressure','cloud_coverage','precip_depth_1_hr','busy_hours','Sensible_Heat','discomfort_index','wind_chill','month'],axis=1,inplace=True)

In [4]:
Y_train = data['log_meter_reading'].values
X_train = data.drop(['log_meter_reading'],axis = 1)
Y_train = Y_train.astype(np.float32)

## Baseline Model

In [5]:
def baseline(y_actual,y_pred) :
    # Finding RMSE
    result = np.sqrt(np.mean((y_actual - y_pred)*(y_actual-y_pred)))
    print("The rmse score of baseline model is :",result)
baseline(Y_train,np.median(Y_train))

The rmse score of baseline model is : 2.0867717


### Splitting the data for training and validation

In [28]:
split_size = int((len(Y_train)*0.7))
x_train,x_test = X_train[0:split_size],X_train[split_size:]
y_train,y_test = Y_train[0:split_size],Y_train[split_size:]

## Function for calculating rmsle

In [29]:
def RMSLE(y_true:np.ndarray, y_pred:np.ndarray) -> np.float64:
    """
        The Root Mean Squared Log Error (RMSLE) metric 
        
        :param y_true: The ground truth labels given in the dataset
        :param y_pred: Our predictions
        :return: The RMSLE score
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Linear Regression

In [35]:
lr = LinearRegression(fit_intercept=True,normalize=True,copy_X=True)
lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

### Calculation of rmsle score on predicting validation data using best parameters 

In [36]:
print('Train RMSLE = ',RMSLE((y_train) , (lr.predict(x_train))))
print('Test RMSLE = ',RMSLE((y_test) ,(lr.predict(x_test))))

Train RMSLE =  1.900923727664107
Test RMSLE =  1.9252230906482615


# ElasticNet

### Hyperparameter Tuning

In [7]:
parameters = {'alpha':[0.001,0.01,0.1,1,10,100,1000,10000],
              'fit_intercept' : [False],
              'l1_ratio':[0.5]}

elastic = GridSearchCV(estimator = ElasticNet(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)
elastic.fit(x_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 15.6min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.5, max_iter=1000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
                         'fit_intercept': [False], 'l1_ratio': [0.5]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=1)

In [8]:
print("Best Estimator ",elastic.best_estimator_)

Best Estimator  ElasticNet(alpha=1, copy_X=True, fit_intercept=False, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)


### Fitting the model with the training data with best parameters

In [9]:
elastic_model = ElasticNet(l1_ratio=0.5,fit_intercept=False,alpha = 1)
elastic_model.fit(x_train, y_train)

ElasticNet(alpha=1, copy_X=True, fit_intercept=False, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

### Calculation of rmsle score on predicting training, validation data using best parameters 

In [11]:
preds = elastic_model.predict(x_test)
print('Train RMSLE = ',RMSLE((y_train) , (elastic_model.predict(x_train))))
print('Test RMSLE = ',RMSLE(y_test , preds))

Train RMSLE =  1.914597700808356
Test RMSLE =  1.9347195985270218


# Ridge

### Hyperparameter Tuning

In [31]:
parameters = {'alpha':[0.001,0.01,0.1,1,10,100,1000,10000],
              'fit_intercept' : [True],
              'solver' : ['lsqr']}

ridge = GridSearchCV(estimator = Ridge(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 10,
                        return_train_score = True,
                        n_jobs = -1)
ridge.fit(x_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done  20 out of  24 | elapsed:   30.8s remaining:    6.2s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   36.1s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
                         'fit_intercept': [True], 'solver': ['lsqr']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='neg_mean_squared_error', verbose=10)

In [32]:
print("Best Estimator ",ridge.best_estimator_)

Best Estimator  Ridge(alpha=10000, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='lsqr', tol=0.001)


### Fitting the model with the training data with best parameters

In [33]:
ridge_model = Ridge(solver = "lsqr", fit_intercept=True,alpha = 10000)
ridge_model.fit(x_train, y_train)

Ridge(alpha=10000, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='lsqr', tol=0.001)

### Calculation of rmsle score on predicting training, validation data using best parameters 

In [34]:
preds = ridge_model.predict(x_test)
print('Train RMSLE = ',RMSLE((y_train) , (ridge_model.predict(x_train))))
print('Test RMSLE = ',RMSLE(y_test , preds))

Train RMSLE =  1.9212876509080787
Test RMSLE =  1.9279760013248401


In [37]:
from prettytable import PrettyTable
  
# Specify the Column Names while initializing the Table
myTable = PrettyTable(["Model","Train RMSLE","Test RMSLE"])
  
# Add rows
myTable.add_row(["LinearRegression", "1.9", "1.925"])
myTable.add_row(["ElasticNet", "1.914", "1.934"])
myTable.add_row(["RidgeRegressor", "1.921", "1.927"])
print(myTable)

+------------------+-------------+------------+
|      Model       | Train RMSLE | Test RMSLE |
+------------------+-------------+------------+
| LinearRegression |     1.9     |   1.925    |
|    ElasticNet    |    1.914    |   1.934    |
|  RidgeRegressor  |    1.921    |   1.927    |
+------------------+-------------+------------+


*  LinearRegression showing better results than ElasticNet and Ridge

In [13]:
#Loading test data
df = pd.read_feather('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/test_data_final.feather')
df.drop('index',axis=1,inplace=True)

### Drop the unimportant features

In [14]:
df.drop(['row_id','site_id','timestamp','wind_speed','wind_direction','is_summer_month','dew_temperature','relative_humidity','sea_level_pressure','cloud_coverage','precip_depth_1_hr','busy_hours','Sensible_Heat','discomfort_index','wind_chill','month'],axis=1,inplace=True)

In [16]:
lr = LinearRegression(fit_intercept=True,normalize=True,copy_X=True)

In [17]:
lr.fit(X_train,Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [18]:
y_test=lr.predict(df)

In [21]:
y_test=np.expm1(y_test)

In [22]:
test = np.round(y_test,4)

In [23]:
test_df = pd.DataFrame(data=test,columns={'meter_reading'})
test_df['row_id'] = test_df.index
test_df = test_df[['row_id','meter_reading']]
test_df.head()

Unnamed: 0,row_id,meter_reading
0,0,32.9184
1,1,30.5865
2,2,27.1286
3,3,34.4685
4,4,48.0765


### Meter_readings which are less than zero are set to zero because meter readings can not be than zero

In [24]:
for i in test_df[test_df['meter_reading']<0].index: 
    
    test_df['meter_reading'][i] = 0

In [25]:
test_df.to_csv('/content/drive/MyDrive/ashrae_Great_Energy_Prediction/linearmodels.csv',index=False,header=True)