In [1]:
# given nn_file to run not know a lot will study
from keras.models import Model, load_model
from keras.layers import Input, Dropout, Dense, Embedding, SpatialDropout1D, concatenate, BatchNormalization, Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.models import Model
from keras.losses import mean_squared_error as mse_loss

from keras import optimizers
from keras.optimizers import RMSprop, Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [3]:
df_train = pd.read_csv('../Large_output/train_merge.csv')

In [4]:
def reduce_mem_usage(df):
    start_mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in df.columns:
        if df[col].dtype != object:  # Exclude strings            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",df[col].dtype)            
            # make variables for Int, max and min
            IsInt = False
            mx = df[col].max()
            mn = df[col].min()
            print("min for this col: ",mn)
            print("max for this col: ",mx)
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(df[col]).all(): 
                NAlist.append(col)
                df[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = df[col].fillna(0).astype(np.int64)
            result = (df[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        df[col] = df[col].astype(np.uint8)
                    elif mx < 65535:
                        df[col] = df[col].astype(np.uint16)
                    elif mx < 4294967295:
                        df[col] = df[col].astype(np.uint32)
                    else:
                        df[col] = df[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)    
            # Make float datatypes 32 bit
            else:
                df[col] = df[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",df[col].dtype)
            print("******************************")
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

In [5]:
def features_engineering(df):
    
    # Sort by localtime
    df.sort_values("local_time")
    df.reset_index(drop=True)
    
    # Add more features
    df["local_time"] = pd.to_datetime(df["local_time"],format="%Y-%m-%d %H:%M:%S")
    df["hour"] = df["local_time"].dt.hour
    df["weekend"] = df["local_time"].dt.weekday
    df['square_feet'] =  np.log1p(df['square_feet'])
    
    
    # Encode Categorical Data
    le = LabelEncoder()
    df["primary_use"] = le.fit_transform(df["primary_use"])
    
    return df

In [6]:
df_train, NA_list = reduce_mem_usage(df_train)
train_engineer = features_engineering(df_train)
train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading']\
=train_engineer.loc[(train_engineer['site_id']==0) & (train_engineer['meter']==0),'meter_reading'].mul(0.2931)
target = np.log1p(train_engineer["meter_reading"])
features = train_engineer[['building_id', 'meter','site_id','primary_use', 
                          'square_feet','air_temperature','cloud_coverage',
                          'dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]
del df_train

Memory usage of properties dataframe is : 2271.9295196533203  MB
******************************
Column:  building_id
dtype before:  int64
min for this col:  0
max for this col:  1448
dtype after:  uint16
******************************
******************************
Column:  meter
dtype before:  int64
min for this col:  0
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  meter_reading
dtype before:  float64
min for this col:  0.0
max for this col:  880374.0
dtype after:  float32
******************************
******************************
Column:  site_id
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
******************************
Column:  square_feet
dtype before:  int64
min for this col:  283
max for this col:  875000
dtype after:  uint32
******************************
******************************
Column:  year_built
dtype before:  float64
min for this 

In [8]:
categoricals = ["building_id", "site_id", "meter", "primary_use",  "weekend",'is_holiday','hour']

numericals = ['square_feet','air_temperature','cloud_coverage','dew_temperature','precip_depth_1_hr']

feat_cols = categoricals + numericals

In [9]:
def model(dense_dim_1=64, dense_dim_2=32, dense_dim_3=32, dense_dim_4=16, 
dropout1=0.2, dropout2=0.1, dropout3=0.1, dropout4=0.1, lr=0.001):

    #Inputs
    site_id = Input(shape=[1], name="site_id")
    building_id = Input(shape=[1], name="building_id")
    meter = Input(shape=[1], name="meter")
    primary_use = Input(shape=[1], name="primary_use")
    square_feet = Input(shape=[1], name="square_feet")
    air_temperature = Input(shape=[1], name="air_temperature")
    cloud_coverage = Input(shape=[1], name="cloud_coverage")
    dew_temperature = Input(shape=[1], name="dew_temperature")
    hour = Input(shape=[1], name="hour")
    precip = Input(shape=[1], name="precip_depth_1_hr")
    weekend = Input(shape=[1], name="weekend")
    is_holiday = Input(shape=[1], name="is_holiday")
   
    #Embeddings layers
    emb_site_id = Embedding(16, 2)(site_id)
    emb_building_id = Embedding(1449, 6)(building_id)
    emb_meter = Embedding(4, 2)(meter)
    emb_primary_use = Embedding(16, 2)(primary_use)
    emb_hour = Embedding(24, 3)(hour)
    emb_weekend= Embedding(7, 2)(weekend)
    emb_is_holiday= Embedding(2, 2)(is_holiday)

    concat_emb = concatenate([
           Flatten() (emb_site_id)
         , Flatten() (emb_building_id)
         , Flatten() (emb_meter)
         , Flatten() (emb_primary_use)
         , Flatten() (emb_hour)
         , Flatten() (emb_weekend)
         , Flatten() (emb_is_holiday)
    ])
    
    categ = Dropout(dropout1)(Dense(dense_dim_1,activation='relu') (concat_emb))
    categ = BatchNormalization()(categ)
    categ = Dropout(dropout2)(Dense(dense_dim_2,activation='relu') (categ))
    
    #main layer
    main_l = concatenate([
          categ
        , square_feet
        , air_temperature
        , cloud_coverage
        , dew_temperature
        , precip
    ])
    
    main_l = Dropout(dropout3)(Dense(dense_dim_3,activation='relu') (main_l))
    main_l = BatchNormalization()(main_l)
    main_l = Dropout(dropout4)(Dense(dense_dim_4,activation='relu') (main_l))
    
    #output
    output = Dense(1) (main_l) 

    model = Model([ site_id,
                    building_id, 
                    meter, 
                    primary_use, 
                    square_feet, 
                    air_temperature,
                    cloud_coverage,
                    dew_temperature, 
                    hour,
                    weekend, 
                    precip,
                    is_holiday], output)

    model.compile(optimizer = Adam(lr=lr),
                  loss= mse_loss,
                  metrics=[root_mean_squared_error])
    return model

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=0))

In [10]:
def get_keras_data(df, num_cols, cat_cols):
    cols = num_cols + cat_cols
    X = {col: np.array(df[col]) for col in cols}
    return X

def train_model(keras_model, X_t, y_train, batch_size, epochs, X_v, y_valid, fold, patience=3):
    early_stopping = EarlyStopping(patience=patience, verbose=1)
    model_checkpoint = ModelCheckpoint("model_" + str(fold) + ".hdf5",
                                       save_best_only=True, verbose=1, monitor='val_root_mean_squared_error', mode='min')

    hist = keras_model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs,
                            validation_data=(X_v, y_valid), verbose=1,
                            callbacks=[early_stopping, model_checkpoint])

    keras_model = load_model("model_" + str(fold) + ".hdf5", custom_objects={'root_mean_squared_error': root_mean_squared_error})
    
    return keras_model

In [16]:
from sklearn.model_selection import KFold, StratifiedKFold

oof = np.zeros(len(features))
batch_size = 1024
epochs = 10
models = []

folds = 3
seed = 666

kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

for fold_n, (train_index, valid_index) in enumerate(kf.split(features, features['building_id'])):
    print('Fold:', fold_n)
    X_train, X_valid = features.iloc[train_index], features.iloc[valid_index]
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
    X_t = get_keras_data(X_train, numericals, categoricals)
    X_v = get_keras_data(X_valid, numericals, categoricals)
    
    keras_model = model(dense_dim_1=64, dense_dim_2=32, dense_dim_3=32, dense_dim_4=16, 
                        dropout1=0.2, dropout2=0.1, dropout3=0.1, dropout4=0.1, lr=0.001)
    mod = train_model(keras_model, X_t, y_train, batch_size, epochs, X_v, y_valid, fold_n, patience=3)
    models.append(mod)
    print('*'* 50)

Fold: 0


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 13234529 samples, validate on 6617893 samples
Epoch 1/10

Epoch 00001: val_root_mean_squared_error improved from inf to 0.94522, saving model to model_0.hdf5
Epoch 2/10

Epoch 00002: val_root_mean_squared_error did not improve from 0.94522
Epoch 3/10

Epoch 00003: val_root_mean_squared_error improved from 0.94522 to 0.94295, saving model to model_0.hdf5
Epoch 4/10

Epoch 00004: val_root_mean_squared_error improved from 0.94295 to 0.93228, saving model to model_0.hdf5
Epoch 5/10

Epoch 00005: val_root_mean_squared_error improved from 0.93228 to 0.92852, saving model to model_0.hdf5
Epoch 6/10

Epoch 00006: val_root_mean_squared_error improved from 0.92852 to 0.92353, saving model to model_0.hdf5
Epoch 7/10

Epoch 00007: val_root_mean_squared_error did not improve from 0.92353
Epoch 8/10

Epoch 00008: val_root_mean_squared_error did not improve from 0.92353
Epoch 9/10

Epoch 00009: val_root_mean_squared_error did not improve from 0.92353
Epoch 00009: early stopping


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


**************************************************
Fold: 1


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 13234975 samples, validate on 6617447 samples
Epoch 1/10

Epoch 00001: val_root_mean_squared_error improved from inf to 0.97622, saving model to model_1.hdf5
Epoch 2/10

Epoch 00002: val_root_mean_squared_error improved from 0.97622 to 0.95523, saving model to model_1.hdf5
Epoch 3/10

Epoch 00003: val_root_mean_squared_error did not improve from 0.95523
Epoch 4/10

Epoch 00004: val_root_mean_squared_error did not improve from 0.95523
Epoch 5/10

Epoch 00005: val_root_mean_squared_error did not improve from 0.95523
Epoch 00005: early stopping


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


**************************************************
Fold: 2


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 13235340 samples, validate on 6617082 samples
Epoch 1/10

Epoch 00001: val_root_mean_squared_error improved from inf to 0.94727, saving model to model_2.hdf5
Epoch 2/10

Epoch 00002: val_root_mean_squared_error did not improve from 0.94727
Epoch 3/10

Epoch 00003: val_root_mean_squared_error improved from 0.94727 to 0.94288, saving model to model_2.hdf5
Epoch 4/10

Epoch 00004: val_root_mean_squared_error improved from 0.94288 to 0.93989, saving model to model_2.hdf5
Epoch 5/10

Epoch 00005: val_root_mean_squared_error did not improve from 0.93989
Epoch 6/10

Epoch 00006: val_root_mean_squared_error did not improve from 0.93989
Epoch 7/10

Epoch 00007: val_root_mean_squared_error did not improve from 0.93989
Epoch 00007: early stopping


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


**************************************************


In [17]:
(0.92353+0.95523+0.93989)/3

0.93955

In [18]:
test_feature = pd.read_csv('../Large_output/test_merge.csv')
test_feature, NA_list = reduce_mem_usage(test_feature)
test_feature = features_engineering(test_feature)
test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday', 'row_id']]
row_ids = test_feature[['row_id']]
test_feature = test_feature[['building_id', 'meter','site_id','primary_use', 'square_feet','air_temperature',\
                    'cloud_coverage','dew_temperature','precip_depth_1_hr','hour', 'weekend','is_holiday']]

Memory usage of properties dataframe is : 4771.9117431640625  MB
******************************
Column:  building_id
dtype before:  int64
min for this col:  0
max for this col:  1448
dtype after:  uint16
******************************
******************************
Column:  meter
dtype before:  int64
min for this col:  0
max for this col:  3
dtype after:  uint8
******************************
******************************
Column:  site_id
dtype before:  int64
min for this col:  0
max for this col:  15
dtype after:  uint8
******************************
******************************
Column:  square_feet
dtype before:  int64
min for this col:  283
max for this col:  875000
dtype after:  uint32
******************************
******************************
Column:  year_built
dtype before:  float64
min for this col:  1900.0
max for this col:  2017.0
dtype after:  uint16
******************************
******************************
Column:  floor_count
dtype before:  float64
min for this co

In [20]:
from tqdm import tqdm
i=0
res = np.zeros((test_feature.shape[0]),dtype=np.float32)
step_size = 50000
for j in tqdm(range(int(np.ceil(test_feature.shape[0]/step_size)))):
    for_prediction = get_keras_data(test_feature.iloc[i:i+step_size], numericals, categoricals)
    res[i:min(i+step_size,test_feature.shape[0])] = \
       np.expm1(sum([model.predict(for_prediction, batch_size=1024)[:,0] for model in models])/folds)
    i+=step_size

100%|██████████| 834/834 [04:37<00:00,  3.01it/s]


In [21]:
test_feature['meter_reading']=np.clip(res, 0, a_max=None)
test_feature.loc[(test_feature['site_id']==0) & 
                 (test_feature['meter']==0),'meter_reading']=test_feature.loc[(test_feature['site_id']==0) &
                                                            (test_feature['meter']==0),'meter_reading'].mul(3.4118)
df_result = pd.DataFrame({'row_id': row_ids['row_id'], 'meter_reading': test_feature['meter_reading']})
df_result.to_csv('../Large_output/nn_first.csv',index = False)

In [22]:
df_result.shape

(41697600, 2)