In [1]:
directory = 'nn_leak/'
filename = 'nn_leak'

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import datetime
import gc
from meteocalc import Temp, dew_point, heat_index, wind_chill, feels_like
# Any results you write to the current directory are saved as output.
from sklearn.metrics import mean_squared_error
from keras.models import Model, load_model
from keras.layers import Input, Dropout, Dense, Embedding, SpatialDropout1D, concatenate, BatchNormalization, Flatten, PReLU, CuDNNLSTM
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.models import Model
from keras.losses import mean_squared_error as mse_loss

from keras import optimizers
from keras.optimizers import RMSprop, Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
def fill_weather_dataset(weather):

    zone_dict={0:4,1:0,2:7,3:4,4:7,5:0,6:4,7:4,8:4,9:5,10:7,11:4,12:0,13:5,14:4,15:4} 

    def set_localtime(df):
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        for sid, zone in zone_dict.items():
            sids = df.site_id == sid
            df.loc[sids, 'timestamp'] = df[sids].timestamp - pd.offsets.Hour(zone)
        df['timestamp'] = df['timestamp'].dt.strftime("%Y-%m-%d %H:%M:%S")
    
    set_localtime(weather)
    
    # Find Missing Dates
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather['timestamp'].min(),time_format)
    end_date = datetime.datetime.strptime(weather['timestamp'].max(),time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

    missing_hours = []
    for site_id in range(16):
        site_hours = np.array(weather[weather['site_id'] == site_id]['timestamp'])
        new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])
        new_rows['site_id'] = site_id
        weather = pd.concat([weather,new_rows])

        weather = weather.reset_index(drop=True)           

    # Add new Features
    weather["datetime"] = pd.to_datetime(weather["timestamp"])
    weather["day"] = weather["datetime"].dt.day
    weather["week"] = weather["datetime"].dt.week
    weather["month"] = weather["datetime"].dt.month
    
    # Reset Index for Fast Update
    weather = weather.set_index(['site_id','day','month'])

    air_temperature_filler = pd.DataFrame(weather.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
    weather.update(air_temperature_filler,overwrite=False)

    cloud_coverage_filler = weather.groupby(['site_id','day','month'])['cloud_coverage'].mean()
    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=["cloud_coverage"])
    weather.update(cloud_coverage_filler,overwrite=False)

    due_temperature_filler = pd.DataFrame(weather.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=["dew_temperature"])
    weather.update(due_temperature_filler,overwrite=False)

    sea_level_filler = weather.groupby(['site_id','day','month'])['sea_level_pressure'].mean()
    sea_level_filler = pd.DataFrame(sea_level_filler.fillna(method='ffill'),columns=['sea_level_pressure'])
    weather.update(sea_level_filler,overwrite=False)

    wind_direction_filler =  pd.DataFrame(weather.groupby(['site_id','day','month'])['wind_direction'].mean(),columns=['wind_direction'])
    weather.update(wind_direction_filler,overwrite=False)
    
    wind_speed_filler =  pd.DataFrame(weather.groupby(['site_id','day','month'])['wind_speed'].mean(),columns=['wind_speed'])
    weather.update(wind_speed_filler,overwrite=False)

    precip_depth_filler = weather.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
    precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])
    weather.update(precip_depth_filler,overwrite=False)

    weather = weather.reset_index()
    weather = weather.drop(['datetime','day','week','month'],axis=1)
    
    weather.loc[weather['dew_temperature'] > weather['air_temperature'], 'dew_temperature'] = weather['air_temperature']
    weather['humidity'] = 100*(np.exp((17.625*weather['dew_temperature'])/(243.04+weather['dew_temperature']))/np.exp((17.625*weather['air_temperature'])/(243.04+weather['air_temperature'])))
    
    weather['heat_index'] = weather.apply(lambda x: heat_index(temperature=x['air_temperature'], humidity=x['humidity']), axis=1)
    weather['heat_index'] = weather['heat_index'].apply(lambda x: round(x, 2))
    weather['feels_like'] = weather.apply(lambda x: feels_like(temperature=x['air_temperature'], humidity=x['humidity'], wind_speed=x['wind_speed']), axis=1)
    weather['feels_like'] = weather['feels_like'].apply(lambda x: round(x, 2))
    
    return weather

weather_train = pd.read_csv("../weather_train.csv")
weather_train = fill_weather_dataset(weather_train)

In [4]:
def create_lag_features(df, window):
    df['ave_temp'] = df['air_temperature']/2 + df['dew_temperature']/2
    
    feature_cols = ["air_temperature", "dew_temperature", "cloud_coverage", "precip_depth_1_hr","ave_temp",
                    "humidity",'heat_index']
    df_site = df.groupby("site_id")

    df_rolled = df_site[feature_cols].rolling(window=window, min_periods=0)

    df_mean = df_rolled.mean().reset_index().astype(np.float16)
    df_min = df_rolled.min().reset_index().astype(np.float16)
    df_std = df_rolled.std().reset_index().astype(np.float16)

    for feature in feature_cols:
        df[f"{feature}_mean_lag{window}"] = df_mean[feature]
        df[f"{feature}_min_lag{window}"] = df_min[feature]
        df[f"{feature}_std_lag{window}"] = df_std[feature]
    
    if window == 6:
        for feature in feature_cols:
            df[f"{feature}_diff1"] = df[feature].diff(1)
        
    return df

weather_train = create_lag_features(weather_train, 6)

In [5]:
weather_test = pd.read_csv('../weather_test.csv')
weather_test = fill_weather_dataset(weather_test)
weather_test = create_lag_features(weather_test, 6)
weather = pd.concat([weather_train, weather_test])

In [6]:
def set_holiday2(dataframe):
    # site 0
    date1 = pd.date_range(start='1/1/2016',end='1/05/2016')
    date2 = pd.date_range(start='3/21/2016', end='3/27/2016')
    date3 = pd.date_range(start='11/24/2016', end='11/25/2016')
    date4 = pd.date_range(start='12/19/2016',end='12/31/2016')
    date5 = pd.date_range(start='1/1/2017', end='1/08/2017')
    date6 = pd.date_range(start='3/13/2017', end='3/17/2017')
    date7 = pd.date_range(start='11/23/2017', end='11/24/2017')
    date8 = pd.date_range(start='12/19/2017',end='12/31/2017')
    date9 = pd.date_range(start='1/1/2018', end='1/07/2018')
    date10 = pd.date_range(start='3/12/2018', end='3/18/2018')
    date11 = pd.date_range(start='11/22/2018', end='11/23/2018')
    date12 = pd.date_range(start='12/19/2018',end='12/31/2018')
    site0_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9).union(date10).union(date11).union(date12)
    sids = dataframe.site_id == 0
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site0_hol))), 'is_holiday'] = 1
    # site 1
    date1 = pd.date_range(start='1/1/2016',end='1/03/2016')
    date2 = pd.date_range(start='3/25/2016', end='3/30/2016')
    date3 = pd.date_range(start='12/24/2016',end='12/31/2016')
    date4 = pd.date_range(start='1/1/2017', end='1/02/2017')
    date5 = pd.date_range(start='3/13/2017', end='3/19/2017')
    date6 = pd.date_range(start='12/23/2017',end='12/31/2017')
    date7 = pd.date_range(start='3/29/2018', end='4/04/2018')
    date8 = pd.date_range(start='12/22/2018',end='12/31/2018')
    site1_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8)
    sids = dataframe.site_id == 1
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site1_hol))), 'is_holiday'] = 1
    # site 2
    date1 = pd.date_range(start='1/1/2016',end='1/12/2016')
    date2 = pd.date_range(start='3/12/2016', end='3/20/2016')
    date3 = pd.date_range(start='11/24/2016', end='11/27/2016')
    date4 = pd.date_range(start='12/26/2016',end='12/27/2016')
    date5 = pd.date_range(start='1/1/2017', end='1/9/2017')
    date6 = pd.date_range(start='3/11/2017', end='3/19/2017')
    date7 = pd.date_range(start='11/23/2017', end='11/26/2017')
    date8 = pd.date_range(start='12/25/2017',end='12/26/2017')
    date9 = pd.date_range(start='1/1/2018', end='1/09/2018')
    date10 = pd.date_range(start='3/5/2018', end='3/9/2018')
    date11 = pd.date_range(start='11/22/2018', end='11/25/2018')
    date12 = pd.date_range(start='12/24/2018',end='12/25/2018')
    site2_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9).union(date10).union(date11).union(date12)
    sids = dataframe.site_id == 2
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site2_hol))), 'is_holiday'] = 1
    # site 4
    date1 = pd.date_range(start='1/1/2016',end='1/11/2016')
    date2 = pd.date_range(start='3/21/2016', end='3/25/2016')
    date3 = pd.date_range(start='11/24/2016', end='11/25/2016')
    date4 = pd.date_range(start='12/26/2016',end='12/27/2016')
    date5 = pd.date_range(start='1/1/2017', end='1/9/2017')
    date6 = pd.date_range(start='3/27/2017', end='3/31/2017')
    date7 = pd.date_range(start='11/23/2017', end='11/26/2017')
    date8 = pd.date_range(start='12/25/2017',end='12/26/2017')
    date9 = pd.date_range(start='12/29/2017',end='12/29/2017')
    date10 = pd.date_range(start='1/1/2018', end='1/08/2018')
    date11 = pd.date_range(start='3/26/2018', end='3/30/2018')
    date12 = pd.date_range(start='11/22/2018', end='11/23/2018')
    date13 = pd.date_range(start='12/24/2018',end='12/25/2018')
    date14 = pd.date_range(start='12/31/2018',end='12/31/2018')
    site4_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9).union(date10).union(date11).union(date12).union(date13).union(date14)
    sids = dataframe.site_id == 4
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site4_hol))), 'is_holiday'] = 1
    # site 5
    date1 = pd.date_range(start='1/1/2016',end='1/3/2016')
    date2 = pd.date_range(start='3/19/2016', end='4/17/2016')
    date3 = pd.date_range(start='6/11/2016', end='8/21/2016')
    date4 = pd.date_range(start='1/1/2017',end='1/8/2017')
    date5 = pd.date_range(start='3/23/2017', end='4/23/2017')
    date6 = pd.date_range(start='6/17/2017', end='8/20/2017')
    date7 = pd.date_range(start='1/1/2018',end='1/7/2018')
    date8 = pd.date_range(start='3/17/2018', end='4/15/2018')
    date9 = pd.date_range(start='6/16/2018', end='8/19/2018')
    site5_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9)
    sids = dataframe.site_id == 5
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site5_hol))), 'is_holiday'] = 1
    # site 7&11
    date1 = pd.date_range(start='1/1/2016',end='1/06/2016')
    date2 = pd.date_range(start='2/29/2016', end='3/4/2016')
    date3 = pd.date_range(start='11/24/2016', end='11/25/2016')
    date4 = pd.date_range(start='12/23/2016',end='12/31/2016')
    date5 = pd.date_range(start='1/1/2017', end='1/03/2017')
    date6 = pd.date_range(start='2/27/2017', end='3/3/2017')
    date7 = pd.date_range(start='11/23/2017', end='11/24/2017')
    date8 = pd.date_range(start='12/23/2017',end='12/31/2017')
    date9 = pd.date_range(start='1/1/2018', end='1/07/2018')
    date10 = pd.date_range(start='3/5/2018', end='3/9/2018')
    date11 = pd.date_range(start='11/22/2018', end='11/23/2018')
    date12 = pd.date_range(start='12/24/2018',end='12/31/2018')
    site_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9).union(date10).union(date11).union(date12)
    sids = dataframe.site_id == 7
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site_hol))), 'is_holiday'] = 1
    sids = dataframe.site_id == 11
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site_hol))), 'is_holiday'] = 1
    # site 9
    date1 = pd.date_range(start='1/1/2016',end='1/06/2016')
    date2 = pd.date_range(start='3/14/2016', end='3/19/2016')
    date3 = pd.date_range(start='11/23/2016', end='11/26/2016')
    date4 = pd.date_range(start='12/23/2016',end='12/31/2016')
    date5 = pd.date_range(start='1/1/2017', end='1/04/2017')
    date6 = pd.date_range(start='3/13/2017', end='3/18/2017')
    date7 = pd.date_range(start='11/22/2017', end='11/25/2017')
    date8 = pd.date_range(start='12/23/2017',end='12/31/2017')
    date9 = pd.date_range(start='1/1/2018', end='1/03/2018')
    date10 = pd.date_range(start='3/12/2018', end='3/17/2018')
    date11 = pd.date_range(start='11/21/2018', end='11/24/2018')
    date12 = pd.date_range(start='12/24/2018',end='12/31/2018')
    site9_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9).union(date10).union(date11).union(date12)
    sids = dataframe.site_id == 9
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site9_hol))), 'is_holiday'] = 1
    # site 10
    date1 = pd.date_range(start='1/1/2016',end='1/10/2016')
    date2 = pd.date_range(start='3/5/2016', end='3/13/2016')
    date3 = pd.date_range(start='11/24/2016', end='11/25/2016')
    date4 = pd.date_range(start='12/23/2016',end='12/31/2016')
    date5 = pd.date_range(start='1/1/2017', end='1/08/2017')
    date6 = pd.date_range(start='3/4/2017', end='3/12/2017')
    date7 = pd.date_range(start='11/23/2017', end='11/24/2017')
    date8 = pd.date_range(start='12/24/2017',end='12/31/2017')
    date9 = pd.date_range(start='1/1/2018', end='1/07/2018')
    date10 = pd.date_range(start='3/3/2018', end='3/11/2018')
    date11 = pd.date_range(start='11/22/2018', end='11/23/2018')
    date12 = pd.date_range(start='12/24/2018',end='12/31/2018')
    site10_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9).union(date10).union(date11).union(date12)
    sids = dataframe.site_id == 10
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site10_hol))), 'is_holiday'] = 1
    # site 13
    date1 = pd.date_range(start='1/1/2016',end='1/12/2016')
    date2 = pd.date_range(start='3/7/2016', end='3/11/2016')
    date3 = pd.date_range(start='10/27/2016', end='10/28/2016')
    date4 = pd.date_range(start='11/24/2016', end='11/25/2016')
    date5 = pd.date_range(start='12/23/2016',end='12/30/2016')
    date6 = pd.date_range(start='1/1/2017',end='1/10/2017')
    date7 = pd.date_range(start='3/6/2017', end='3/10/2017')
    date8 = pd.date_range(start='10/26/2017', end='10/27/2017')
    date9 = pd.date_range(start='11/23/2017', end='11/24/2017')
    date10 = pd.date_range(start='12/25/2017',end='12/29/2017')
    date11 = pd.date_range(start='1/1/2018',end='1/9/2018')
    date12 = pd.date_range(start='3/5/2018', end='3/9/2018')
    date13 = pd.date_range(start='10/25/2018', end='10/26/2018')
    date14 = pd.date_range(start='11/22/2018', end='11/23/2018')
    date15 = pd.date_range(start='12/25/2018',end='12/26/2018')
    date16 = pd.date_range(start='12/31/2018',end='12/31/2018')
    site13_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9).union(date10).union(date11).union(date12).union(date13).union(date14).union(date15).union(date16)
    sids = dataframe.site_id == 13
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site13_hol))), 'is_holiday'] = 1
    # site 14
    date1 = pd.date_range(start='1/1/2016',end='1/03/2016')
    date2 = pd.date_range(start='3/5/2016', end='3/13/2016')
    date3 = pd.date_range(start='11/23/2016', end='11/27/2016')
    date4 = pd.date_range(start='1/1/2017', end='1/02/2017')
    date5 = pd.date_range(start='3/4/2017', end='3/12/2017')
    date6 = pd.date_range(start='11/22/2017', end='11/26/2017')
    date7 = pd.date_range(start='1/1/2018', end='1/02/2018')
    date8 = pd.date_range(start='3/3/2018', end='3/11/2018')
    date9 = pd.date_range(start='11/21/2018', end='11/25/2018')
    site14_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9)
    sids = dataframe.site_id == 14
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site14_hol))), 'is_holiday'] = 1
    # site 15
    date1 = pd.date_range(start='1/1/2016',end='1/03/2016')
    date2 = pd.date_range(start='2/13/2016',end='2/17/2016')
    date3 = pd.date_range(start='3/26/2016', end='4/3/2016')
    date4 = pd.date_range(start='10/8/2016', end='10/11/2016')
    date5 = pd.date_range(start='11/23/2016', end='11/27/2016')
    date6 = pd.date_range(start='12/26/2016',end='12/31/2016')
    date7 = pd.date_range(start='1/1/2017',end='1/02/2017')
    date8 = pd.date_range(start='2/18/2017',end='2/21/2017')
    date9 = pd.date_range(start='4/1/2017', end='4/9/2017')
    date10 = pd.date_range(start='10/7/2017', end='10/10/2017')
    date11 = pd.date_range(start='11/22/2017', end='11/26/2017')
    date12 = pd.date_range(start='12/25/2017',end='12/31/2017')
    date13 = pd.date_range(start='2/17/2018',end='2/20/2018')
    date14 = pd.date_range(start='3/31/2018', end='4/8/2018')
    date15 = pd.date_range(start='10/6/2018', end='10/9/2018')
    date16 = pd.date_range(start='11/21/2018', end='11/25/2018')
    date17 = pd.date_range(start='12/24/2018',end='12/31/2018')
    site15_hol = date1.union(date2).union(date3).union(date4).union(date5).union(date6).union(date7).union(date8).union(date9).union(date10).union(date11).union(date12).union(date13).union(date14).union(date15).union(date16).union(date17)
    sids = dataframe.site_id == 15
    dataframe.loc[(sids) & (dataframe.timestamp.isin((site15_hol))), 'is_holiday'] = 1
    return dataframe

In [7]:
def reduce_mem_usage(df):
    start_mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in df.columns:
        if df[col].dtype != object:  # Exclude strings            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",df[col].dtype)            
            # make variables for Int, max and min
            IsInt = False
            mx = df[col].max()
            mn = df[col].min()
            print("min for this col: ",mn)
            print("max for this col: ",mx)
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(df[col]).all(): 
                NAlist.append(col)
                df[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = df[col].fillna(0).astype(np.int64)
            result = (df[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif mx < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif mx < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)    
            # Make float datatypes 32 bit
            else:
                df[col] = df[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",df[col].dtype)
            print("******************************")
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

In [8]:
leak_df = pd.read_feather('leak.feather')
leak_df.fillna(0, inplace=True)
leak_df = leak_df[(leak_df.timestamp.dt.year > 2016) & (leak_df.timestamp.dt.year < 2019)]
leak_df.loc[leak_df.meter_reading < 0, 'meter_reading'] = 0 # remove large negative values
leak_df = leak_df[leak_df.building_id!=245]
leak_df['timestamp'] = leak_df['timestamp'].astype(str)

In [9]:
building_df = pd.read_csv("../building_metadata.csv")
train = pd.read_csv("../train.csv")

bad_rows = pd.read_csv('../rows_to_drop.csv')
train = train.drop(index=bad_rows['0'].values)
del bad_rows

train = pd.concat([train, leak_df]).reset_index(drop = True)
train = train.merge(building_df, left_on = "building_id", right_on = "building_id", how = "left")
train = train.merge(weather, left_on = ["site_id", "timestamp"], right_on = ["site_id", "timestamp"])
del weather, weather_train, weather_test, leak_df

train["timestamp"] = pd.to_datetime(train["timestamp"])
train["hour"] = train["timestamp"].dt.hour
train["weekday"] = train["timestamp"].dt.weekday

In [10]:
us_holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
              "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
              "2017-01-02", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
              "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
              "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
              "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
              "2019-01-01"]
uk_holidays = ["2016-01-01", "2016-01-02", "2016-03-25", "2016-03-28", "2016-05-02", 
               "2016-05-30", "2016-08-29", "2016-12-25", "2016-12-26", "2016-12-27", 
               "2017-01-01", "2017-01-02", "2017-04-14", "2017-04-17", "2017-05-01", 
               "2017-05-29", "2017-08-28", "2017-12-25", "2017-12-26", 
               "2018-01-01", "2018-01-02", "2018-03-30", "2018-04-02", "2018-05-07", 
               "2018-05-28", "2018-08-27", "2018-12-25", "2018-12-26",
               "2019-01-01"]
ir_holidays = ["2016-01-01", "2016-01-02", "2016-03-17", "2016-03-28", "2016-05-02", 
               "2016-06-02", "2016-08-01", "2016-10-31", "2016-12-26", "2016-12-27", 
               "2017-01-01", "2017-01-02", "2017-03-17", "2017-04-14", "2017-05-01", 
               "2017-06-05", "2017-08-07", "2017-10-30", "2017-12-25", "2017-12-26", 
               "2018-01-01", "2018-01-02", "2018-03-19", "2018-04-02", "2018-05-07", 
               "2018-06-04", "2018-08-06", "2018-10-29", "2018-12-25", "2018-12-26",
               "2019-01-01"]

def set_holiday(train):
    train["is_holiday"] = 0
    train.loc[train.weekday.isin([5,6]), 'is_holiday'] = 1
    
    us_zone = [0,2,3,4,6,7,8,9,10,11,13,14,15]
    for sid in us_zone:
        sids = train.site_id == sid
        train.loc[(sids) & (train.timestamp.isin(us_holidays)), 'is_holiday'] = 1
    uk_zone = [1,5]
    for sid in uk_zone:
        sids = train.site_id == sid
        train.loc[(sids) & (train.timestamp.isin(uk_holidays)), 'is_holiday'] = 1
    sids = train.site_id == 12
    train.loc[(sids) & (train.timestamp.isin(ir_holidays)), 'is_holiday'] = 1
    return train

train = set_holiday(train)
train = set_holiday2(train)

train['month'] = train['timestamp'].dt.month
train['month'].replace((1, 2, 3, 12), 1, inplace = True)
train['month'].replace((6, 7, 8, 9), 2, inplace = True)
train['month'].replace((10, 11, 4, 5), 3, inplace = True)
del train["timestamp"]

In [11]:
### new features
train["puse_hour"] = train["primary_use"].astype(str) + '_' + train["hour"].astype(str)
train["holiday_hour"] = train["is_holiday"].astype(str) + '_' + train["hour"].astype(str)

In [12]:
def generate_categorical_encoders(df, features):
    encoders = {}
    for feature in features:
        df[feature] = df[feature].fillna('missing')
        encoder = LabelEncoder()
        encoder.fit(df[feature].values)
        le_dict = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
        encoders[feature] = le_dict
    return encoders

def encode_categorical_features(df, features, encoders):
    for f in features:
        df[f] = df[f].fillna('missing')
        df[f] = df[f].map(encoders[f])

encoded_features = ["primary_use","puse_hour","holiday_hour"]       
encoders = generate_categorical_encoders(train, encoded_features)
encode_categorical_features(train, encoded_features, encoders)

In [13]:
target = np.log1p(train["meter_reading"])
del train["meter_reading"] 

drop_cols = ["sea_level_pressure", "wind_speed", "wind_direction"]
train = train.drop(drop_cols, axis = 1)

In [14]:
categoricals = ["site_id","building_id","primary_use","hour", "weekday", "meter","puse_hour","holiday_hour"]

numericals = [i for i in train.columns if i not in categoricals]
feat_cols = categoricals + numericals

In [15]:
train, NAlist = reduce_mem_usage(train)

Memory usage of properties dataframe is : 8180.440246582031  MB
******************************
Column:  building_id
dtype before:  int64
min for this col:  0
max for this col:  1448
dtype after:  uint16
******************************
******************************
Column:  meter
dtype before:  float64
min for this col:  0.0
max for this col:  3.0
dtype after:  uint8
******************************
******************************
Column:  site_id
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
******************************
Column:  primary_use
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
******************************
Column:  square_feet
dtype before:  int64
min for this col:  283
max for this col:  875000
dtype after:  uint32
******************************
******************************
Column:  year_built
dtype before:  float64
min for this col:  190

dtype after:  uint8
******************************
******************************
Column:  is_holiday
dtype before:  int64
min for this col:  0
max for this col:  1
dtype after:  uint8
******************************
******************************
Column:  month
dtype before:  int64
min for this col:  1
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  puse_hour
dtype before:  int64
min for this col:  0
max for this col:  383
dtype after:  uint16
******************************
******************************
Column:  holiday_hour
dtype before:  int64
min for this col:  0
max for this col:  47
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  5105.311248779297  MB
This is  62.40875912408759 % of the initial size


In [16]:
def get_model():

    #Inputs
    site_id = Input(shape=[1], name="site_id")
    building_id = Input(shape=[1], name="building_id")
    meter = Input(shape=[1], name="meter")
    primary_use = Input(shape=[1], name="primary_use")
    hour = Input(shape=[1], name="hour")
    weekday = Input(shape=[1], name="weekday")
    puse_hour = Input(shape=[1], name="puse_hour")
    holiday_hour = Input(shape=[1], name="holiday_hour")
    num_input = Input(shape=(len(numericals),), name="num_input")
    
   
    #Embeddings layers
    emb_site_id = Embedding(16, 6)(site_id)
    emb_building_id = Embedding(1449, 128)(building_id)
    emb_meter = Embedding(4, 2)(meter)
    emb_primary_use = Embedding(16, 6)(primary_use)
    emb_hour = Embedding(24, 8)(hour)
    emb_weekday = Embedding(7, 2)(weekday)
    emb_puse_hour = Embedding(384, 16)(puse_hour)
    emb_holiday_hour = Embedding(48, 12)(holiday_hour)
    categ = concatenate([Flatten() (emb_site_id), Flatten() (emb_building_id), Flatten() (emb_meter), 
                         Flatten() (emb_primary_use), Flatten() (emb_hour), Flatten() (emb_weekday), 
                         Flatten() (emb_puse_hour), Flatten() (emb_holiday_hour)])
    
     ### Categorical
    n_unit = 128
    decay_rate = 0.5
    for k in range(3):
        categ = Dense(n_unit)(categ)
        categ = PReLU()(categ)
        categ = BatchNormalization()(categ)
        categ = Dropout(0.1)(categ)     
        n_unit = int(n_unit * decay_rate)
    
    ### Dense
    numerical = Dense(64)(num_input)
    numerical = PReLU()(numerical)
    numerical = BatchNormalization()(numerical)
    numerical = Dropout(0.1)(numerical)
    numerical = Dense(64)(numerical)
    numerical = PReLU()(numerical)
    numerical = BatchNormalization()(numerical)
    numerical = Dropout(0.1)(numerical)
    numerical = Dense(32)(numerical)
    numerical = PReLU()(numerical)
    numerical = BatchNormalization()(numerical)
    numerical = Dropout(0.1)(numerical)  
    
    #main layer
    x = concatenate([categ, numerical])
    n_unit = 128
    decay_rate = 0.5
    for k in range(4):
        x = Dense(n_unit)(x)
        x = PReLU()(x)
        x = BatchNormalization()(x)
        x = Dropout(0.1)(x)     
        n_unit = int(n_unit * decay_rate)

    #output
    output = Dense(1)(x)

    model = Model([site_id,building_id,meter,primary_use,hour,weekday,puse_hour,holiday_hour,
                   num_input], output)

    model.compile(optimizer = Adam(lr=0.001, decay = 0.0001),
                  loss= mse_loss,
                  metrics=[root_mean_squared_error])
    return model

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=0))

In [17]:
def get_keras_data(df, num_cols, cat_cols):
    X = {col: np.array(df[col]) for col in cat_cols}
    X['num_input'] = df[num_cols].values
    return X

def train_model(keras_model, X_t, y_train, batch_size, epochs, X_v, y_valid, fold, patience=5):
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    early_stopping = EarlyStopping(patience=patience, verbose=1)
    model_checkpoint = ModelCheckpoint(directory+"model_" + str(fold) + ".hdf5",
                                       save_best_only=True, verbose=1, monitor='val_root_mean_squared_error', mode='min')

    hist = keras_model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs,
                            validation_data=(X_v, y_valid), verbose=1,
                            callbacks=[early_stopping, model_checkpoint])

    keras_model = load_model(directory+"model_" + str(fold) + ".hdf5", custom_objects={'root_mean_squared_error': root_mean_squared_error})
    
    
    return keras_model

In [21]:
from sklearn.model_selection import KFold, StratifiedKFold

batch_size = 1024
epochs = 100
models = []

folds = 5

kf = KFold(n_splits=folds)

for fold_n, (train_index, valid_index) in enumerate(kf.split(train, train['month'])):
    print('Fold:', fold_n)
    X_train, X_valid = train.iloc[train_index], train.iloc[valid_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
    X_t = get_keras_data(X_train, numericals, categoricals)
    X_v = get_keras_data(X_valid, numericals, categoricals)
    del X_train, X_valid
    
    keras_model = get_model()
    mod = train_model(keras_model, X_t, y_train, batch_size, epochs, X_v, y_valid, fold_n, patience=3)
    models.append(mod)
    
    print('*'* 50)
    del mod, X_t, X_v

Fold: 0
Train on 25044710 samples, validate on 6261178 samples
Epoch 1/100

Epoch 00001: val_root_mean_squared_error improved from inf to 0.81874, saving model to nn_leak/model_0.hdf5
Epoch 2/100

Epoch 00002: val_root_mean_squared_error did not improve from 0.81874
Epoch 3/100

Epoch 00003: val_root_mean_squared_error did not improve from 0.81874
Epoch 4/100

Epoch 00004: val_root_mean_squared_error improved from 0.81874 to 0.81517, saving model to nn_leak/model_0.hdf5
Epoch 5/100

Epoch 00005: val_root_mean_squared_error improved from 0.81517 to 0.81516, saving model to nn_leak/model_0.hdf5
Epoch 6/100

Epoch 00006: val_root_mean_squared_error improved from 0.81516 to 0.81177, saving model to nn_leak/model_0.hdf5
Epoch 7/100

Epoch 00007: val_root_mean_squared_error did not improve from 0.81177
Epoch 8/100

Epoch 00008: val_root_mean_squared_error did not improve from 0.81177
Epoch 9/100

Epoch 00009: val_root_mean_squared_error did not improve from 0.81177
Epoch 00009: early stoppin


Epoch 00004: val_root_mean_squared_error did not improve from 1.01213
Epoch 5/100

Epoch 00005: val_root_mean_squared_error did not improve from 1.01213
Epoch 6/100

Epoch 00006: val_root_mean_squared_error did not improve from 1.01213
Epoch 00006: early stopping
**************************************************
Fold: 4
Train on 25044711 samples, validate on 6261177 samples
Epoch 1/100

Epoch 00001: val_root_mean_squared_error improved from inf to 0.72810, saving model to nn_leak/model_4.hdf5
Epoch 2/100

Epoch 00002: val_root_mean_squared_error improved from 0.72810 to 0.72787, saving model to nn_leak/model_4.hdf5
Epoch 3/100

Epoch 00003: val_root_mean_squared_error did not improve from 0.72787
Epoch 4/100

Epoch 00004: val_root_mean_squared_error improved from 0.72787 to 0.72500, saving model to nn_leak/model_4.hdf5
Epoch 00004: early stopping
**************************************************


In [22]:
del train, target, y_train, y_valid, kf
gc.collect()

25

In [23]:
test = pd.read_csv("../test.csv")
test = test.merge(building_df, left_on = "building_id", right_on = "building_id", how = "left")
del building_df

weather_test = pd.read_csv("../weather_test.csv")
weather_test = fill_weather_dataset(weather_test)
weather_test = create_lag_features(weather_test, 6)

test = test.merge(weather_test, left_on = ["site_id", "timestamp"], right_on = ["site_id", "timestamp"], how = "left")
del weather_test

gc.collect()

0

In [24]:
test["timestamp"] = pd.to_datetime(test["timestamp"])
test["hour"] = test["timestamp"].dt.hour
test["weekday"] = test["timestamp"].dt.weekday

test['month'] = test['timestamp'].dt.month
test['month'].replace((1, 2, 3, 12), 1, inplace = True)
test['month'].replace((6, 7, 8, 9), 2, inplace = True)
test['month'].replace((10, 11, 4, 5), 3, inplace = True)

test = set_holiday(test)
test = set_holiday2(test)

test["puse_hour"] = test["primary_use"].astype(str) + '_' + test["hour"].astype(str)
test["holiday_hour"] = test["is_holiday"].astype(str) + '_' + test["hour"].astype(str)
encode_categorical_features(test, encoded_features, encoders)
    
test = test[feat_cols]
test, NAlist = reduce_mem_usage(test)

Memory usage of properties dataframe is : 10895.864868164062  MB
******************************
Column:  site_id
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
******************************
Column:  building_id
dtype before:  int64
min for this col:  0
max for this col:  1448
dtype after:  uint16
******************************
******************************
Column:  primary_use
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
******************************
Column:  hour
dtype before:  int64
min for this col:  0
max for this col:  23
dtype after:  uint8
******************************
******************************
Column:  weekday
dtype before:  int64
min for this col:  0
max for this col:  6
dtype after:  uint8
******************************
******************************
Column:  meter
dtype before:  int64
min for this col:  0
max for this col:  3
dtyp

dtype after:  float32
******************************
******************************
Column:  humidity_diff1
dtype before:  float64
min for this col:  -64.73757530896768
max for this col:  65.50396803652728
dtype after:  float32
******************************
******************************
Column:  heat_index_diff1
dtype before:  float64
min for this col:  -28.740000000000002
max for this col:  30.09
dtype after:  float32
******************************
******************************
Column:  is_holiday
dtype before:  int64
min for this col:  0
max for this col:  1
dtype after:  uint8
******************************
******************************
Column:  month
dtype before:  int64
min for this col:  1
max for this col:  3
dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  6799.974060058594  MB
This is  62.40875912408759 % of the initial size


In [25]:
from tqdm import tqdm
i=0
res=[]
step_size = 50000
for j in tqdm(range(int(np.ceil(test.shape[0]/50000)))):
    for_prediction = get_keras_data(test.iloc[i:i+step_size], numericals, categoricals)
    res.append(np.expm1(sum([model.predict(for_prediction, batch_size=1024) for model in models])/folds))
    i+=step_size

100%|██████████| 834/834 [14:20<00:00,  1.03s/it]


In [26]:
res = np.concatenate(res)
submission = pd.read_csv('../sample_submission.csv')
submission['meter_reading'] = res
submission.loc[submission['meter_reading']<0, 'meter_reading'] = 0
submission.to_csv(f'{filename}.csv', index=False)
submission

Unnamed: 0,row_id,meter_reading
0,0,191.884705
1,1,85.250664
2,2,8.288164
3,3,154.256226
4,4,1100.767090
...,...,...
41697595,41697595,5.580867
41697596,41697596,4.823460
41697597,41697597,8.191454
41697598,41697598,165.286911


In [None]:
# def build_model():
    
#     model_input = Input(shape=(6,))
#     num_input = Lambda(lambda x: x[:, :5], output_shape=(5,))(model_input)
#     cat_input = Lambda(lambda x: x[:, 5:], output_shape=(1,))(model_input)
    
#     x = num_input
#     x = Dense(256)(num_input)
#     x = PReLU()(x)
#     x = BatchNormalization()(x)
#     x = Dropout(rate=0.5)(x)

#     emb_dim = 128
#     y = Embedding(200, emb_dim, input_length=1)(cat_input)
#     y = Flatten()(y)
    
#     z = Concatenate()([x, y])
#     z = Dense(256)(z)
#     z = PReLU()(z)
#     z = BatchNormalization()(z)
#     z = Dropout(rate=0.5)(z)
#     z = Dense(256)(z)
#     z = PReLU()(z)
#     z = BatchNormalization()(z)
#     z = Dropout(rate=0.5)(z)
#     z = Dense(256)(z)
#     z = PReLU()(z)
#     z = BatchNormalization()(z)
#     output = Dense(1, activation="sigmoid")(z)
#     model = Model(inputs=model_input, outputs=output)
#     return model

In [None]:
# def get_model_3():
#     inp = keras.layers.Input((num_features*num_preds,))
#     x = keras.layers.Reshape((num_features*num_preds,1))(inp)
#     x = keras.layers.Conv1D(32,num_preds,strides=num_preds, activation='elu')(x)
#     x = keras.layers.BatchNormalization()(x)
#     x = keras.layers.Conv1D(24,1, activation='elu')(x)
#     x = keras.layers.BatchNormalization()(x)
#     x = keras.layers.Conv1D(16,1, activation='elu')(x)
#     x = keras.layers.BatchNormalization()(x)
#     x = keras.layers.Conv1D(4,1, activation='elu')(x)
#     x = keras.layers.Flatten()(x)
#     x = keras.layers.Reshape((num_features*4,1))(x)
#     x = keras.layers.AveragePooling1D(2)(x)
#     x = keras.layers.Flatten()(x)
#     x = keras.layers.BatchNormalization()(x)
#     out = keras.layers.Dense(1, activation='sigmoid')(x)
#     return keras.Model(inputs=inp, outputs=out)