In [9]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.noise import GaussianDropout
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import datetime
import gc

In [3]:
def _smallest_int_dtype(mn, mx):
    """Return the narrowest integer dtype accepted for the value range [mn, mx].

    The comparisons are strict, so a column whose extreme value equals a
    dtype's limit is given the next wider dtype. Returns ``None`` when no
    signed dtype qualifies (the caller then leaves the column unchanged).
    """
    if mn >= 0:
        for candidate in (np.uint8, np.uint16, np.uint32):
            if mx < np.iinfo(candidate).max:
                return candidate
        return np.uint64
    for candidate in (np.int8, np.int16, np.int32, np.int64):
        bounds = np.iinfo(candidate)
        if mn > bounds.min and mx < bounds.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    For every numeric column:

    * missing values are replaced by the sentinel ``min - 1`` (integer dtypes
      cannot hold NaN) and the column name is recorded;
    * if every value is integral the column is cast to the narrowest
      signed/unsigned integer dtype that holds its range;
    * otherwise it is cast to ``float32``.

    Non-numeric columns (object, string, datetime, category, ...) are left
    untouched. Progress and the before/after memory footprint are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    (pandas.DataFrame, list)
        The same ``df`` object and the list of column names in which
        non-finite values were detected (and NaN filled with the sentinel).
    """
    start_mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = []  # Keeps track of columns that have missing values filled in.
    for col in df.columns:
        # Min/max, isfinite and the casts below are only meaningful for numeric data.
        if not pd.api.types.is_numeric_dtype(df[col]):
            continue

        # Print current column type
        print("******************************")
        print("Column: ",col)
        print("dtype before: ",df[col].dtype)
        mx = df[col].max()
        mn = df[col].min()
        print("min for this col: ",mn)
        print("max for this col: ",mx)

        # Integer does not support NA, therefore, NA needs to be filled
        if not np.isfinite(df[col]).all():
            NAlist.append(col)
            fill_value = mn - 1
            # Assign back instead of a chained inplace fillna, which does not
            # reach the frame when pandas copy-on-write is active.
            df[col] = df[col].fillna(fill_value)
            # The sentinel is now the smallest value in the column. Without this
            # update a column with min 0 would be cast to an unsigned dtype and
            # the -1 sentinel would wrap around to a large positive number.
            mn = fill_value

        values = df[col]
        IsInt = False
        # Values that are still non-finite (inf, or an all-NaN column) cannot be
        # cast to an integer dtype, so such columns are handled as floats.
        if np.isfinite(values).all():
            asint = values.astype(np.int64)
            # Sum the absolute differences: signed differences can cancel
            # (e.g. 0.5 and -0.5) and make a fractional column look integral.
            IsInt = bool((values - asint).abs().sum() < 0.01)

        target_dtype = _smallest_int_dtype(mn, mx) if IsInt else np.float32
        if target_dtype is not None:
            df[col] = values.astype(target_dtype)

        # Print new column type
        print("dtype after: ",df[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return df, NAlist

In [8]:
# Read the training table from disk and shrink its dtypes before splitting it.
raw_training_table = pd.read_csv('../output/outlier_remove.csv')
df_merge, NAlist = reduce_mem_usage(raw_training_table)

# `logerror` is the regression target; every other column is a predictor.
target = df_merge['logerror']
features = df_merge.drop('logerror', axis=1)

Memory usage of properties dataframe is : 58.97638702392578  MB
******************************
Column:  N_ValueRatio
dtype before:  float64
min for this col:  0.009128934
max for this col:  10669.578000000001
dtype after:  float32
******************************
******************************
Column:  N_life
dtype before:  float64
min for this col:  2.0
max for this col:  194.0
dtype after:  uint8
******************************
******************************
Column:  N_zip_count
dtype before:  float64
min for this col:  1.0
max for this col:  22025.0
dtype after:  uint16
******************************
******************************
Column:  airconditioningtypeid
dtype before:  int64
min for this col:  0
max for this col:  7
dtype after:  uint8
******************************
******************************
Column:  basementsqft
dtype before:  float64
min for this col:  0.0
max for this col:  3560.0
dtype after:  uint16
******************************
******************************
Column: 

In [10]:
# Impute the predictors, then standardise them. `imputer` and `sc` keep the
# statistics learned here so they can be reused later in the notebook.
imputer = Imputer()
x_train = imputer.fit(features).transform(features)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)

len_x = int(x_train.shape[1])
print("len_x is:",len_x)

# Neural Network
print("\nSetting up neural network model...")

# One row per hidden block: (units, dropout rate, batch-norm before dropout?).
# Each block is Dense -> PReLU -> [BatchNormalization] -> Dropout.
hidden_blocks = [
    (400, .4, False),
    (160, .6, True),
    (64, .5, True),
    (26, .6, True),
]

nn = Sequential()
for block_index, (n_units, drop_rate, with_batchnorm) in enumerate(hidden_blocks):
    dense_kwargs = {'units': n_units, 'kernel_initializer': 'normal'}
    if block_index == 0:
        # Only the first layer declares the input width.
        dense_kwargs['input_dim'] = len_x
    nn.add(Dense(**dense_kwargs))
    nn.add(PReLU())
    if with_batchnorm:
        nn.add(BatchNormalization())
    nn.add(Dropout(drop_rate))

# Single linear output unit, trained with mean absolute error.
nn.add(Dense(1, kernel_initializer='normal'))
nn.compile(loss='mae', optimizer=Adam(lr=4e-3, decay=1e-4))

y_train = target.values
print("\nFitting neural network model...")
nn.fit(
    np.array(x_train),
    np.array(y_train),
    batch_size=32,
    epochs=70,
    verbose=2,
)





len_x is: 46

Setting up neural network model...

Fitting neural network model...
Epoch 1/70
 - 26s - loss: 0.0542
Epoch 2/70
 - 25s - loss: 0.0530
Epoch 3/70
 - 25s - loss: 0.0529
Epoch 4/70
 - 25s - loss: 0.0528
Epoch 5/70
 - 26s - loss: 0.0527
Epoch 6/70
 - 26s - loss: 0.0526
Epoch 7/70
 - 26s - loss: 0.0526
Epoch 8/70
 - 26s - loss: 0.0526
Epoch 9/70
 - 26s - loss: 0.0525
Epoch 10/70
 - 26s - loss: 0.0525
Epoch 11/70
 - 26s - loss: 0.0525
Epoch 12/70
 - 26s - loss: 0.0525
Epoch 13/70
 - 26s - loss: 0.0524
Epoch 14/70
 - 26s - loss: 0.0525
Epoch 15/70
 - 26s - loss: 0.0524
Epoch 16/70
 - 26s - loss: 0.0524
Epoch 17/70
 - 26s - loss: 0.0524
Epoch 18/70
 - 26s - loss: 0.0524
Epoch 19/70
 - 26s - loss: 0.0524
Epoch 20/70
 - 25s - loss: 0.0524
Epoch 21/70
 - 26s - loss: 0.0523
Epoch 22/70
 - 26s - loss: 0.0523
Epoch 23/70
 - 26s - loss: 0.0523
Epoch 24/70
 - 26s - loss: 0.0523
Epoch 25/70
 - 26s - loss: 0.0523
Epoch 26/70
 - 26s - loss: 0.0523
Epoch 27/70
 - 26s - loss: 0.0523
Epoch 28/

NameError: name 'x_test' is not defined

In [23]:
def predict_year(csv_path, year_flag, year_label, months=(10, 11)):
    """Score one year's submission table with the trained network.

    Parameters
    ----------
    csv_path : str
        CSV holding one row per parcel for the year being scored.
    year_flag : int
        Value written to the ``year`` column before prediction.
    year_label : str
        Prefix of the output column names (e.g. ``'2016'`` -> ``'201610'``).
    months : sequence of int
        Months for which the network is actually evaluated.

    Returns
    -------
    pandas.DataFrame
        ``parcelid`` plus one column per evaluated month and a ``<year>12``
        column.

    Notes
    -----
    Uses the notebook globals ``features``, ``imputer``, ``sc`` and ``nn``.
    ``imputer`` and ``sc`` are applied with ``transform`` only, so the test
    rows are preprocessed with the statistics learned from the training
    features. Re-fitting a StandardScaler on the already standardised
    ``x_train`` (as this cell used to do) yields mean ~0 / scale ~1 and leaves
    the test rows effectively unscaled.
    """
    df_year, _ = reduce_mem_usage(pd.read_csv(csv_path).drop_duplicates('parcelid'))
    df_year['year'] = year_flag

    preds = pd.DataFrame({'parcelid': df_year['parcelid'].values})
    for month in months:
        print(month)
        df_year['month'] = month
        print(df_year.shape)
        # Select the training columns by name so the order fed to the network
        # matches the order it was trained on; a missing column raises KeyError
        # instead of silently shifting features.
        x_test = imputer.transform(df_year[features.columns])
        x_test = sc.transform(x_test)
        # predict() returns an (n, 1) array; flatten it to a plain column.
        preds[year_label + str(month)] = nn.predict(x_test).ravel()
        del x_test
        gc.collect()

    # NOTE(review): December is not scored separately; it reuses the prediction
    # of the last evaluated month, as the original cell did. Confirm this is
    # intended rather than adding 12 to `months`.
    preds[year_label + '12'] = preds[year_label + str(months[-1])]

    del df_year
    gc.collect()
    return preds


# Start from the parcel ids of the sample submission and attach each year's
# predictions with a left join, so the submission keeps the sample's row order.
df_sub = pd.read_csv('../Resources/sample_submission.csv')
df_sub = df_sub.rename(columns = {'ParcelId': 'parcelid'})[['parcelid']].drop_duplicates('parcelid')

for csv_path, year_flag, year_label in [
    ('../output/final_sub_2016.csv', 0, '2016'),
    ('../output/final_sub_2017.csv', 1, '2017'),
]:
    df_sub = pd.merge(df_sub, predict_year(csv_path, year_flag, year_label),
                      how = 'left', on = 'parcelid')

df_sub.to_csv('../output/submission/nn_opt.csv', index = False)

Memory usage of properties dataframe is : 1070.443717956543  MB
******************************
Column:  N_ValueRatio
dtype before:  float64
min for this col:  0.000298495780264653
max for this col:  105355.63072025467
dtype after:  float32
******************************
******************************
Column:  N_life
dtype before:  float64
min for this col:  3.0
max for this col:  217.0
dtype after:  uint8
******************************
******************************
Column:  N_zip_count
dtype before:  float64
min for this col:  1.0
max for this col:  22021.0
dtype after:  uint16
******************************
******************************
Column:  airconditioningtypeid
dtype before:  int64
min for this col:  0
max for this col:  7
dtype after:  uint8
******************************
******************************
Column:  basementsqft
dtype before:  float64
min for this col:  0.0
max for this col:  8516.0
dtype after:  uint16
******************************
******************************

dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  316.0086507797241  MB
This is  29.52127659574468 % of the initial size
10
(2985217, 46)
11
(2985217, 46)
Memory usage of properties dataframe is : 1070.443717956543  MB
******************************
Column:  N_ValueRatio
dtype before:  float64
min for this col:  0.0032190426859692173
max for this col:  106962.28942830618
dtype after:  float32
******************************
******************************
Column:  N_life
dtype before:  float64
min for this col:  2.0
max for this col:  217.0
dtype after:  uint8
******************************
******************************
Column:  N_zip_count
dtype before:  float64
min for this col:  1.0
max for this col:  22025.0
dtype after:  uint16
******************************
******************************
Column:  airconditioningtypeid
dtype before:  int64
min for this col:  0
max for this col:  7
dtype after:  uint8
**************************

dtype after:  uint8
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  316.0086507797241  MB
This is  29.52127659574468 % of the initial size
