In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import gc
import time

In [3]:
store = pd.HDFStore('../Data/store_2016.h5')
t1 = time.time()
train = store["train"]
prop = store["prop"]
t2 = time.time()
print 'it took ', t2-t1, ' seconds to read the dataframes'

it took  38.8257939816  seconds to read the dataframes


In [4]:
for col in prop.columns:
    prop[col]=prop[col].fillna(-1)
    train[col]=train[col].fillna(-1)

In [5]:
prop = prop.replace([np.inf, -np.inf], 10000)
train = train.replace([np.inf, -np.inf], 10000)

In [6]:
y = train.logerror
mid = np.percentile(y, 50)
y = y - mid
q1 = np.percentile(y, 25)
q3 = np.percentile(y, 75)
print q1, q3
interval = q3 - q1
fac = 8.0
interval = interval * fac / 2.
hi = interval + mid
lo = -interval + mid
print hi, lo

-0.0313 0.0332
0.264 -0.252


In [7]:
# split the data to 9 months for train and 3 months for test
x1 = train[train.month < 10]    # use for train
x0 = train[train.month > 9]     # use for test
print "Size of the x1 data frame: ", x1.shape
print "Size of the x0 data frame: ", x0.shape

y1 = x1['logerror'].values
y0 = x0['logerror'].values

index_hi = y1 > hi   # drop 1480 points 
index_lo = y1 < lo    # drop 947 points
print sum(index_hi), sum(index_lo)

y1 = y1[(~index_lo) & (~index_hi)]
x1 = x1[(~index_lo) & (~index_hi)]

print "Size of the x1 data frame: ", x1.shape
print "Size of the x0 data frame: ", x0.shape

Size of the x1 data frame:  (81635, 262)
Size of the x0 data frame:  (8515, 262)
2084 1431
Size of the x1 data frame:  (78120, 262)
Size of the x0 data frame:  (8515, 262)


In [8]:
import multiprocessing
from sklearn.metrics import mean_absolute_error

ncpu = multiprocessing.cpu_count()
print "number of cores " + str(ncpu)

number of cores 8


In [9]:
np.random.seed(123)
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.recurrent import LSTM
# define custom R2 metrics for Keras backend
from keras import backend as K
# to tune the NN
from keras.constraints import maxnorm
from keras.optimizers import SGD, Adam

Using Theano backend.


In [10]:
input_dims = x1.shape[1] - 3

In [11]:
def NN_model():
    model = Sequential()
    # Input layer with dimension input_dims and hidden layer i with input_dims neurons. 
    model.add(Dense(input_dims, input_dim=input_dims, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Activation("linear"))

    # Hidden layer
    model.add(Dense(input_dims//2, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Activation("linear"))
    # Hidden layer
    model.add(Dense(input_dims//4, kernel_initializer='normal', activation='relu', kernel_constraint=maxnorm(3)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))
    model.add(Activation("linear"))
    
    # Output Layer.
    model.add(Dense(1))
    # Use a large learning rate with decay and a large momentum. 
    # compile this model
    model.compile(loss='mean_absolute_error', #'mean_squared_error', # one may use 'mean_absolute_error' as alternative
                  optimizer='rmsprop')
    
    # Visualize NN architecture
#    print(model.summary())
    return model

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=NN_model, nb_epoch=300, batch_size=30, verbose=0)))
model = Pipeline(estimators)

In [13]:
model.fit(x1.drop(["id_parcel", "month", "logerror"], axis=1), y1) # Train the model without outliers

Pipeline(steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', <keras.wrappers.scikit_learn.KerasRegressor object at 0x7f8d0552c990>)])

In [14]:
print "Error on training data ", mean_absolute_error(y1, model.predict(x1.drop(["id_parcel", "month", "logerror"], axis=1)))
print "Error on 3 months test ", mean_absolute_error(y0, model.predict(x0.drop(["id_parcel", "month", "logerror"], axis=1)))

Error on training data  0.0456814670353
Error on 3 months test  0.0650001437843


In [15]:
score_3months = mean_absolute_error(y0, model.predict(x0.drop(["id_parcel", "month", "logerror"], axis=1)))

# New approach

In [16]:
y = train.logerror
mid = np.percentile(y, 50)
y = y - mid
q1 = np.percentile(y, 25)
q3 = np.percentile(y, 75)
print q1, q3

#fac = 8.0
interval = q3 - q1
interval = interval * fac / 2.
hi_train = interval + mid
lo_train = -interval + mid

fac = 65.0
interval = q3 - q1
interval = interval * fac / 2.
hi_test = interval + mid
lo_test = -interval + mid

print lo_train, hi_train
print lo_test, hi_test

-0.0313 0.0332
-0.252 0.264
-2.09025 2.10225


In [17]:
y = train['logerror'].values
x = train.drop(['month', 'logerror'], axis=1)
print "Size of the train data frame: ", x.shape
print "Size of the prop  data frame: ", prop.shape

print("Generate a list of outliers should be droped for training")
index_hi = y > hi_train   
index_lo = y < lo_train   
print sum(index_hi), sum(index_lo)

outliers_train = []
for ii in range(y.shape[0]):
    if index_hi[ii] or index_lo[ii]:
        outliers_train.append(ii)
        
print("Generate a list of outliers should be droped for testing")
index_hi = y > hi_test   
index_lo = y < lo_test   
print sum(index_hi), sum(index_lo)

outliers_test = []
for ii in range(y.shape[0]):
    if index_hi[ii] or index_lo[ii]:
        outliers_test.append(ii)

Size of the train data frame:  (90150, 260)
Size of the prop  data frame:  (2883630, 260)
Generate a list of outliers should be droped for training
2310 1568
Generate a list of outliers should be droped for testing
51 46


In [18]:
def splitDataFrameIntoSmaller(df, chunkSize = 100000): 
    listOfDf = list()
    numberChunks = len(df) // chunkSize + 1
    for i in range(numberChunks):
        listOfDf.append(i*chunkSize)
    listOfDf.append(len(df))
    return listOfDf

split_index = splitDataFrameIntoSmaller(prop)

In [19]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 44)

train_pred = np.zeros(train.shape[0], dtype=np.float16)
prop_pred = np.zeros(prop.shape[0], dtype=np.float16)
scores1 = []; scores2 = []

N = 10
for _ in range(N):
    for train_index, test_index in skf.split(x, y):

        train_index_wo = [ix for ix in train_index if ix not in outliers_train]
        test_index_wo = [ix for ix in test_index if ix not in outliers_test]

        x1, x0 = x.iloc[train_index_wo], x.iloc[test_index_wo]
        y1, y0 = y[train_index_wo], y[test_index_wo]

        model.fit(x1.drop(["id_parcel"], axis=1), y1) # Train the model without outliers

        #calculate score without second outliers
        scores1.append(mean_absolute_error(y0, model.predict(x0.drop(["id_parcel"], axis=1))))
        print "Score without outliers for the ", len(scores1), " fold is ", scores1[len(scores1)-1]

        #calculate score with outliers
        x0 = x.iloc[test_index]
        y0 = y[test_index]

        pred = model.predict(x0.drop(["id_parcel"], axis=1))
        scores2.append(mean_absolute_error(y0, pred))
    #    print "Score with outliers for the ", len(scores2), " fold is ", scores2[len(scores2)-1]

        for ii, idx in enumerate(test_index):
            train_pred[idx] = pred[ii]

        for ii in range(0, len(split_index)-1):
            n1 = split_index[ii]; n2 = split_index[ii+1]
            pred = model.predict(prop.iloc[n1:n2].drop(['id_parcel'], axis=1))
            prop_pred[n1:n2] += pred
    
print "Average score without outliers over all folds : " , np.mean(scores1), " ", np.std(scores1)
print "Average score with    outliers over all folds : " , np.mean(scores2), " ", np.std(scores2)



Score without outliers for the  1  fold is  0.0672507936222
Score without outliers for the  2  fold is  0.0655911345031
Score without outliers for the  3  fold is  0.0659643245308
Score without outliers for the  4  fold is  0.0647205998296
Score without outliers for the  5  fold is  0.0641033918102
Score without outliers for the  6  fold is  0.0629652857769
Score without outliers for the  7  fold is  0.0642480338151
Score without outliers for the  8  fold is  0.0637069929711
Score without outliers for the  9  fold is  0.0635517238108
Score without outliers for the  10  fold is  0.0628664202224
Score without outliers for the  11  fold is  0.0670981526585
Score without outliers for the  12  fold is  0.0657553239103
Score without outliers for the  13  fold is  0.0658620694301
Score without outliers for the  14  fold is  0.0645993872566
Score without outliers for the  15  fold is  0.0642916278935
Score without outliers for the  16  fold is  0.0630057312579
Score without outliers for the  1

In [20]:
out = pd.DataFrame()
out["ParcelId"] = prop["id_parcel"]
months = ["201610", "201611", "201612", "201710", "201711", "201712"]
for col in months:
    out[col] = map(lambda x: x/(N*10.0), prop_pred)
    
out_train = pd.DataFrame()
out_train["ParcelId"] = train["id_parcel"]
for col in months:
    out_train[col] = train_pred #+ 0.02 #IMPORTANT POINT: I add a constant to train prediction


print("Read the missing")
miss = store["miss"]

med = train.logerror.median()
for col in months:
    miss[col] = med
    
miss = miss[["id_parcel"]+months]
miss.columns = ["ParcelId"] + months

out = pd.concat([out, out_train, miss], axis=0)

from datetime import datetime
out.to_csv('NN.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False, float_format='%.4f')

Read the missing


In [22]:
out.describe()

Unnamed: 0,ParcelId,201610,201611,201612,201710,201711,201712
count,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0,2985217.0
mean,13325860.0,0.005789151,0.005789151,0.005789151,0.005789151,0.005789151,0.005789151
std,7909966.0,0.02293646,0.02293646,0.02293646,0.02293646,0.02293646,0.02293646
min,10711720.0,-0.5784375,-0.5784375,-0.5784375,-0.5784375,-0.5784375,-0.5784375
25%,11643710.0,-0.0001000214,-0.0001000214,-0.0001000214,-0.0001000214,-0.0001000214,-0.0001000214
50%,12545090.0,0.004934082,0.004934082,0.004934082,0.004934082,0.004934082,0.004934082
75%,14097120.0,0.01026367,0.01026367,0.01026367,0.01026367,0.01026367,0.01026367
max,169601900.0,11.4,11.4,11.4,11.4,11.4,11.4


In [31]:
print score_3months, np.mean(scores1)

0.443211479551 0.448271219867
