In [3]:
## import libraries
import os
import subprocess

import numpy as np
np.random.seed(123)

import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU

In [4]:
## Batch generator

def batch_generator(X, y, batch_size, shuffle):
    """Endlessly yield dense ``(X_batch, y_batch)`` mini-batches.

    Parameters
    ----------
    X : sparse matrix
        Feature matrix; must support ``X[rows, :].toarray()``.
    y : array-like
        Targets, indexable with an integer index array of row positions.
    batch_size : int
        Maximum number of rows per batch; the last batch of a pass may be
        shorter when the row count is not a multiple of ``batch_size``.
    shuffle : bool
        When true, the row order is permuted with ``np.random.shuffle``
        once up front and again after every full pass over the data.

    Yields
    ------
    tuple
        ``(X_batch, y_batch)`` where ``X_batch`` is the densified slice of
        ``X`` and ``y_batch`` the matching slice of ``y``.
    """
    n_rows = X.shape[0]
    # np.ceil returns a float; the integer step counter compares equal to it
    # exactly when a full pass has been completed.
    batches_per_pass = np.ceil(n_rows / batch_size)
    order = np.arange(n_rows)
    if shuffle:
        np.random.shuffle(order)

    step = 0
    while True:
        start = batch_size * step
        rows = order[start:start + batch_size]
        features = X[rows, :].toarray()
        targets = y[rows]
        step += 1
        yield features, targets
        if step == batches_per_pass:
            # Pass finished: optionally reshuffle, then start over.
            if shuffle:
                np.random.shuffle(order)
            step = 0

def batch_generatorp(X, batch_size, shuffle):
    """Endlessly yield dense feature batches in row order (for prediction).

    Parameters
    ----------
    X : sparse matrix
        Feature matrix; must support ``X[rows, :].toarray()``.
    batch_size : int
        Maximum number of rows per batch; the last batch of a pass may be
        shorter when the row count is not a multiple of ``batch_size``.
    shuffle : bool
        Accepted so the signature mirrors ``batch_generator``; it is not
        used, rows are always served in their original order.

    Yields
    ------
    numpy.ndarray
        The densified slice of ``X`` for the current batch.
    """
    n_rows = X.shape[0]
    # Number of batches in one pass. The previous expression,
    # n_rows / ceil(n_rows / batch_size), evaluated to roughly the batch
    # size instead, so the wrap-around either never triggered (empty batches
    # after the data ran out) or triggered too early (rows repeated).
    number_of_batches = int(np.ceil(n_rows / batch_size))
    sample_index = np.arange(n_rows)
    counter = 0
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        # ">=" also covers an empty X, where number_of_batches is 0.
        if counter >= number_of_batches:
            counter = 0

In [5]:
# Load the raw training and test tables from the local input folder
# (training file first, then the test file).
train, test = (pd.read_csv(csv_path)
               for csv_path in ('input/train.csv', 'input/test.csv'))

# Keep the training row labels as a plain list; a later cell uses it to
# select the training rows.
index = train.index.tolist()

In [6]:
# Display the (rows, columns) dimensions of the training frame as a sanity check.
train.shape

(188318, 132)

In [7]:
# Select the training rows by the positions stored in `index`
# (built from the training frame's own index when the data was loaded).
train = train.iloc[index]

# The test set has no target; add a NaN `loss` column so both frames
# share the same columns before they are concatenated.
test['loss'] = np.nan

# Offset added to the loss before taking the log. Predictions are mapped
# back with exp(pred) - shift, so the transform below must use this same
# constant rather than a separately hard-coded value.
shift = 200

y = np.log(train['loss'].values + shift)
id_train = train['id'].values
id_test = test['id'].values

# Remember how many rows belong to train so the merged frame can be split
# again after preprocessing.
ntrain = train.shape[0]

# Stack train on top of test so categorical encoding and scaling see both.
train_test = pd.concat((train, test), axis = 0)

## Preprocessing and transforming to sparse data

In [8]:
# Categorical features: every column whose name contains 'cat' is one-hot
# encoded and stored as its own sparse block.
f_cat = [f for f in train_test.columns if 'cat' in f]
sparse_data = [
    csr_matrix(pd.get_dummies(train_test[name].astype('category')))
    for name in f_cat
]

# Numeric features: every column whose name contains 'cont' is standardised
# (scaler fitted on train and test rows together) and appended as a final
# sparse block.
f_num = [f for f in train_test.columns if 'cont' in f]

scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(train_test[f_num])
sparse_data.append(csr_matrix(scaled_numeric))

# The raw frames are no longer needed once the sparse blocks exist.
del train_test, train, test

# Sparse train and test data: glue the blocks together column-wise, then
# split back into train / test by row position.
xtrain_test = hstack(sparse_data, format='csr')
xtrain, xtest = xtrain_test[:ntrain, :], xtrain_test[ntrain:, :]
print('Dim train: ', xtrain.shape)
print('Dim test:', xtest.shape)

Dim train:  (188318, 1190)
Dim test: (125546, 1190)


## Define a neural network

In [9]:
def nn_model():
    """Build and compile the feed-forward regression network.

    The network is a ``Sequential`` stack of three hidden blocks with
    400, 200 and 50 units. Each block is ``Dense`` (``he_normal`` init)
    followed by ``PReLU``, ``BatchNormalization`` and ``Dropout``
    (rates 0.4, 0.2 and 0.2 respectively). A single-unit ``Dense`` layer
    with no explicit activation produces the output.

    The input width is read from the module-level ``xtrain`` matrix, so
    that must exist before this function is called.

    Returns
    -------
    keras.models.Sequential
        The model, compiled with ``'mae'`` loss and the ``'adadelta'``
        optimizer.
    """
    # (units, dropout rate) for each hidden block, from input to output.
    hidden_blocks = ((400, 0.4), (200, 0.2), (50, 0.2))

    net = Sequential()
    for depth, (units, drop_rate) in enumerate(hidden_blocks):
        if depth == 0:
            # Only the first layer is told the input width.
            net.add(Dense(units, input_dim=xtrain.shape[1], init='he_normal'))
        else:
            net.add(Dense(units, init='he_normal'))
        net.add(PReLU())
        net.add(BatchNormalization())
        net.add(Dropout(drop_rate))

    net.add(Dense(1, init='he_normal'))
    net.compile(loss='mae', optimizer='adadelta')
    return net

In [10]:
# Cross-validation setup: number of folds used for the out-of-fold predictions.
nfolds = 5

# Shuffled K-fold splitter over all training targets; the fixed random_state
# keeps the split the same from run to run.
folds = KFold(y.shape[0], n_folds=nfolds, shuffle=True, random_state=111)

## Train models

In [11]:
nbags = 10      # models trained per fold; their predictions are averaged
nepochs = 55    # training epochs per model
pred_oob = np.zeros(xtrain.shape[0])    # out-of-fold predictions on train
pred_test = np.zeros(xtest.shape[0])    # running sum of test predictions

# Create the submission folder before the long training run starts, so the
# final to_csv cannot fail on a missing directory after all the work is done.
os.makedirs('submissions', exist_ok=True)

for i, (train_index, test_index) in enumerate(folds, start=1):
    xtr = xtrain[train_index]
    ytr = y[train_index]
    xval = xtrain[test_index]
    yval = y[test_index]

    # Loop-invariant per fold: densify the validation features once instead
    # of once per bag, and undo the log/shift transform on the targets once.
    xval_dense = xval.toarray()
    yval_loss = np.exp(yval) - shift

    pred = np.zeros(xval.shape[0])
    for j in range(nbags):
        model = nn_model()
        fit = model.fit_generator(generator = batch_generator(xtr, ytr, 128, True),
                                  nb_epoch = nepochs,
                                  samples_per_epoch = xtr.shape[0],
                                  validation_data = (xval_dense, yval),
                                  verbose = 0)

        # The model predicts log(loss + shift); map back to the loss scale.
        temp = np.exp(model.predict_generator(generator = batch_generatorp(xval, 800, False),
                                              val_samples = xval.shape[0])[:,0]) - shift
        pred += temp
        print('Fold val bagging score after ', j + 1, "rounds is: ", mean_absolute_error(yval_loss, pred/(j+1)))

        pred_test += np.exp(model.predict_generator(generator = batch_generatorp(xtest, 800, False),
                                                    val_samples = xtest.shape[0])[:,0]) - shift

    # Average the bags and store them at this fold's held-out positions.
    pred /= nbags
    pred_oob[test_index] = pred
    score = mean_absolute_error(yval_loss, pred)
    print('Fold ', i, '- MAE: ', score)

print('Total - MAE: ', mean_absolute_error(np.exp(y)-shift, pred_oob))

# save train predictions
df = pd.DataFrame({'id':id_train, 'loss': pred_oob})
df.to_csv('preds_oob.csv', index = False)

# Save test predictions: pred_test accumulated one prediction per
# (fold, bag) pair, so divide by their product to get the mean.
pred_test /= (nfolds*nbags)
df = pd.DataFrame({'id': id_test, 'loss': pred_test})
df.to_csv('submissions/keras_submission.csv', index = False)

Fold val bagging score after  1 rounds is:  1130.91903976
Fold val bagging score after  2 rounds is:  1125.77560597
Fold val bagging score after  3 rounds is:  1126.25250405
Fold val bagging score after  4 rounds is:  1124.61573231
Fold val bagging score after  5 rounds is:  1124.22136505
Fold val bagging score after  6 rounds is:  1124.5312355
Fold val bagging score after  7 rounds is:  1124.18946832
Fold val bagging score after  8 rounds is:  1123.7941781
Fold val bagging score after  9 rounds is:  1123.1772043
Fold val bagging score after  10 rounds is:  1123.05611612
Fold  1 - MAE:  1123.05611612
Fold val bagging score after  1 rounds is:  1138.14570468
Fold val bagging score after  2 rounds is:  1133.32164624
Fold val bagging score after  3 rounds is:  1130.65234603
Fold val bagging score after  4 rounds is:  1129.56675365
Fold val bagging score after  5 rounds is:  1129.81951986
Fold val bagging score after  6 rounds is:  1129.52800859
Fold val bagging score after  7 rounds is:  