# **Imports**

In [1]:
%matplotlib inline
import json
import os
import pandas as pd
from sklearn import metrics
from sklearn.model_selection  import train_test_split
import numpy as np
import gc
import matplotlib.pyplot as plt

# **Connecting Drive to google colab**

In [2]:
# Mount google drive
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# **Downoloading Dataset from kaggle**

In [3]:
!mkdir -p ~/.kaggle
!cp /gdrive/MyDrive/Kaggle/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
# Downloading Dataset
!kaggle competitions download -c ashrae-energy-prediction --force
#unzipping the zip files and deleting the zip files
!unzip \*.zip  && rm *.zip

Downloading weather_test.csv.zip to /content
  0% 0.00/2.53M [00:00<?, ?B/s]
100% 2.53M/2.53M [00:00<00:00, 84.8MB/s]
Downloading weather_train.csv.zip to /content
  0% 0.00/1.27M [00:00<?, ?B/s]
100% 1.27M/1.27M [00:00<00:00, 149MB/s]
Downloading test.csv.zip to /content
 98% 164M/167M [00:01<00:00, 103MB/s] 
100% 167M/167M [00:01<00:00, 96.6MB/s]
Downloading train.csv.zip to /content
 94% 113M/120M [00:02<00:00, 42.1MB/s] 
100% 120M/120M [00:02<00:00, 59.3MB/s]
Downloading building_metadata.csv to /content
  0% 0.00/44.5k [00:00<?, ?B/s]
100% 44.5k/44.5k [00:00<00:00, 40.7MB/s]
Downloading sample_submission.csv.zip to /content
 80% 71.0M/88.4M [00:00<00:00, 88.9MB/s]
100% 88.4M/88.4M [00:00<00:00, 101MB/s] 
Archive:  weather_train.csv.zip
  inflating: weather_train.csv       

Archive:  weather_test.csv.zip
  inflating: weather_test.csv        

Archive:  sample_submission.csv.zip
  inflating: sample_submission.csv   

Archive:  train.csv.zip
  inflating: train.csv               

Ar

# **Load Data**

In [4]:
#load the dataset
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')
weather_train = pd.read_csv('/content/weather_train.csv')
weather_test = pd.read_csv('/content/weather_test.csv')
building_meta = pd.read_csv('/content/building_metadata.csv')

In [5]:
## Function to reduce the DF size
def reduce_memory_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
reduce_memory_usage(building_meta)
reduce_memory_usage(weather_train)
reduce_memory_usage(train)
reduce_memory_usage(weather_test)
reduce_memory_usage(test)

Mem. usage decreased to  0.03 Mb (60.3% reduction)
Mem. usage decreased to  3.07 Mb (68.1% reduction)
Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to  6.08 Mb (68.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)


Unnamed: 0,row_id,building_id,meter,timestamp
0,0,0,0,2017-01-01 00:00:00
1,1,1,0,2017-01-01 00:00:00
2,2,2,0,2017-01-01 00:00:00
3,3,3,0,2017-01-01 00:00:00
4,4,4,0,2017-01-01 00:00:00
...,...,...,...,...
41697595,41697595,1444,0,2018-05-09 07:00:00
41697596,41697596,1445,0,2018-05-09 07:00:00
41697597,41697597,1446,0,2018-05-09 07:00:00
41697598,41697598,1447,0,2018-05-09 07:00:00


In [7]:
# Merging tables
BuildingTrain = building_meta.merge(train, left_on='building_id', right_on='building_id' , how='left')
BuildingTest = building_meta.merge(test, left_on='building_id', right_on='building_id' , how='left')
train_merged=BuildingTrain.merge(weather_train,left_on=['site_id','timestamp'],right_on=['site_id','timestamp'],how='left')
test_merged = BuildingTest.merge(weather_test,left_on=['site_id','timestamp'],right_on=['site_id','timestamp'],how='left')
train_merged.shape

(20216100, 16)

In [8]:
del test
del train
del building_meta
del BuildingTest
del BuildingTrain
del weather_test
del weather_train
gc.collect()

61

# **Data Preprocessing**

In [9]:
# Remove unuseful columns as we see in the analysis year_built , floor count both have high percentage of missing data 

train_merged = train_merged.drop(columns=['year_built', 'floor_count', 'wind_direction', 'dew_temperature'])
test_merged = test_merged.drop(columns=['year_built', 'floor_count','wind_direction', 'dew_temperature'])

In [10]:
# Convert timestamp into month and a day which from analysis we found that they gave a good insights on meter reading

train_merged ['timestamp'] =  pd.to_datetime(train_merged['timestamp'])
test_merged ['timestamp'] =  pd.to_datetime(test_merged['timestamp'])
train_merged['Month']=pd.DatetimeIndex(train_merged['timestamp']).month
test_merged['Month']=pd.DatetimeIndex(test_merged['timestamp']).month
train_merged['Day']=pd.DatetimeIndex(train_merged['timestamp']).day
test_merged['Day']=pd.DatetimeIndex(test_merged['timestamp']).day

In [11]:
# Here we combine the values for different years and different hours a day so what we want here to convert dataframe into for each (meter,building_id,primary_use,month,day) to have value of (meter_reading,air_temperature,wind_speed,precip_depth_1_hr,cloud_coverage,square_feet)
train_merged= train_merged.groupby(['meter',train_merged['building_id'],'primary_use',train_merged['Month'], train_merged['Day']]).agg({'meter_reading':'sum', 'air_temperature': 'mean', 'wind_speed': 'mean', 'precip_depth_1_hr': 'mean', 'cloud_coverage': 'mean', 'square_feet': 'mean'})
train_merged = train_merged.reset_index()
test_merged_1= test_merged.groupby(['row_id','meter',test_merged['building_id'],'primary_use',test_merged['Month'], test_merged['Day']]).agg({ 'air_temperature': 'mean', 'wind_speed': 'mean', 'precip_depth_1_hr': 'mean', 'cloud_coverage': 'mean', 'square_feet': 'mean'})


In [12]:
# To fill the NA value, change the data type to float 32.

train_merged['wind_speed'] = train_merged['wind_speed'].astype('float32')
train_merged['air_temperature'] = train_merged['air_temperature'].astype('float32')
train_merged['precip_depth_1_hr'] = train_merged['precip_depth_1_hr'].astype('float32')
train_merged['cloud_coverage'] = train_merged['cloud_coverage'].astype('float32')
test_merged['wind_speed'] = test_merged['wind_speed'].astype('float32')
test_merged['air_temperature'] = test_merged['air_temperature'].astype('float32')
test_merged['precip_depth_1_hr'] = test_merged['precip_depth_1_hr'].astype('float32')
test_merged['cloud_coverage'] = test_merged['cloud_coverage'].astype('float32')

In [13]:
# Here the percentage of missing data is high so we use the forward and backward methods to introduce some variety in filling missing data
train_merged['precip_depth_1_hr'].fillna(method='ffill', inplace = True)
train_merged['cloud_coverage'].fillna(method='bfill', inplace = True)

# While here the percentage of missing data is low so we use the mean instead.
train_merged['wind_speed'].fillna(train_merged['wind_speed'].mean(), inplace=True)
train_merged['air_temperature'].fillna(train_merged['air_temperature'].mean(), inplace=True)

# Same for test data
test_merged['precip_depth_1_hr'].fillna(method='ffill', inplace = True)
test_merged['cloud_coverage'].fillna(method='bfill', inplace = True)
test_merged['precip_depth_1_hr'].fillna(test_merged['precip_depth_1_hr'].mean(), inplace=True)
test_merged['cloud_coverage'].fillna(test_merged['cloud_coverage'].mean(), inplace=True)

test_merged['wind_speed'].fillna(test_merged['wind_speed'].mean(), inplace=True)
test_merged['air_temperature'].fillna(test_merged['air_temperature'].mean(), inplace=True)

# Here we are checking that there is no null values
train_merged.isnull().sum()


meter                0
building_id          0
primary_use          0
Month                0
Day                  0
meter_reading        0
air_temperature      0
wind_speed           0
precip_depth_1_hr    0
cloud_coverage       0
square_feet          0
dtype: int64

In [14]:
# Here we are going to encode categorical variable which primary_use
# label encoding 
from sklearn.preprocessing import LabelEncoder

train_merged_encoded = train_merged[:]
test_merged_encoded = test_merged[:]

le = LabelEncoder()
train_merged_encoded["primary_use"] = le.fit_transform(train_merged_encoded["primary_use"])
test_merged_encoded["primary_use"] = le.fit_transform(test_merged_encoded["primary_use"])

In [15]:
# Features and labels in X, y

X = train_merged_encoded[['meter', 'building_id', 'primary_use', 'Month', 'Day','air_temperature', 'wind_speed', 'precip_depth_1_hr', 'cloud_coverage',
       'square_feet']]
y = train_merged_encoded['meter_reading']

In [16]:
# Split into training-validation-test
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state= 45)
x_train, x_val, y_train, y_val = train_test_split(x_train,y_train, test_size = 0.2, random_state= 45)


In [17]:
from keras import backend as K
from keras.callbacks import  EarlyStopping
from keras.layers import Dense, Dropout, BatchNormalization
from keras.models import Sequential
from keras.optimizers import RMSprop
from sklearn.model_selection  import train_test_split

In [18]:
def root_mean_squared_error(y_true, y_pred):
  return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [19]:
def build_model(input_dim=10,metrics=root_mean_squared_error,loss='mse', optimizer="rmsprop",drop_rate=0.5):

  model = Sequential()
  model.add(Dense(128, activation='relu',input_shape=(None,input_dim)))
  model.add(Dropout(drop_rate))
  model.add(BatchNormalization())
  model.add(Dense(256, activation='relu'))
  model.add(BatchNormalization())
  model.add(Dropout(drop_rate))
  model.add(Dense(256, activation='relu'))
  model.add(BatchNormalization())
  model.add(Dropout(drop_rate))
  model.add(Dense(256, activation='relu'))
  model.add(BatchNormalization())
  model.add(Dropout(drop_rate))
  model.add(Dense(1))
  model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])
  
  return model

In [20]:
def train(model,x_train,y_train,epochs=50,batch_size=500,verbose=1,validation_data=(x_val,y_val),callbacks =None):
  x_train = x_train.values[:]
  x_train= x_train.reshape((x_train.shape[0],1,x_train.shape[-1]))
  y_train = np.log1p(y_train)
  if validation_data != None:
    x_val = validation_data[0].values[:]
    x_val = x_val.reshape((x_val.shape[0],1,x_val.shape[-1]))
    y_val = np.log1p(validation_data[-1])
      
  return model.fit(x_train,y_train,epochs=epochs,batch_size=batch_size,verbose=verbose,validation_data=(x_val,y_val),callbacks=callbacks)

In [21]:
es = EarlyStopping(monitor='val_root_mean_squared_error', min_delta=0.0001, patience=5, verbose=True, mode='auto')

In [22]:
model = build_model(input_dim=x_train.shape[-1],drop_rate=0.2)

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, None, 128)         1408      
_________________________________________________________________
dropout (Dropout)            (None, None, 128)         0         
_________________________________________________________________
batch_normalization (BatchNo (None, None, 128)         512       
_________________________________________________________________
dense_1 (Dense)              (None, None, 256)         33024     
_________________________________________________________________
batch_normalization_1 (Batch (None, None, 256)         1024      
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 256)         0         
_________________________________________________________________
dense_2 (Dense)              (None, None, 256)         6

In [24]:
history = train(model,x_train,y_train,epochs=30,batch_size=500,verbose=1,validation_data=(x_val,y_val), callbacks =[es]) # callbacks =[mc, es]

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 00017: early stopping


In [25]:

#print(x_test)
x_test_inf = x_test.values[:]
#print(x_test_inf.shape[0],x_test_inf.shape[-1])
x_test_inf = x_test_inf.reshape((x_test_inf.shape[0],1,x_test_inf.shape[-1]))
#print(x_test_inf)
y_test_inf = np.log1p(y_test)

# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(x_test_inf, y_test_inf, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
# print("Generate predictions for 3 samples")
# predictions = model.predict(x_test_inf)



Evaluate on test data
test loss, test acc: [6.036705493927002, 2.442908525466919]
