In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras import backend as k_back
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


In [2]:
# define the AUC metric being used in the competition
# this function can be used in Keras' Sequential Model compilation only
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    k_back.get_session().run(tf.local_variables_initializer())
    return auc

In [3]:
def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

In [14]:
train = load_data('data/train_2008.csv', 1)
test = load_data('data/test_2008.csv', 1)

In [15]:
x_train = train[:, :-1]
y_train = train[:, -1]

x_test = test[:,:]

In [16]:
# look at the shapes
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(64667, 382)
(16000, 382)
(64667,)


In [17]:
merged = np.concatenate((x_train, x_test), axis=0)

In [18]:
merged.shape

(80667, 382)

In [19]:
mmscaler = MinMaxScaler()
merged = mmscaler.fit_transform(merged)

In [20]:
# break up into test and train again
x_train = merged[:64667,:]
x_test = merged[64667:,:]

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(64667, 382)
(16000, 382)
(64667,)


In [21]:
# define base model
# NOTE: this is the most complex model, but without softmax - retrying after kerasregressor_adv seemed to work
# removed softmax because i don't want everything to sum to 1
def easy_model():
    model = Sequential()
    model.add(Dense(400, input_dim=x_train.shape[1], kernel_initializer='normal')) # train with 400 units
    model.add(Activation('relu'))
    model.add(Dropout(0.3)) # regularization - throw out 30% of weights
    model.add(Dense(250, kernel_initializer='normal')) # train with 300 units
    model.add(Activation('relu'))
    model.add(Dropout(0.15)) # regularization - throw out 15% of weights
    model.add(Dense(350, kernel_initializer='normal')) # train with 300 units
    model.add(Activation('relu'))
    # Dense(10) in order to conform with the binary encoding
    model.add(Dense(1, kernel_initializer='normal'))    

    # print summary of layers and weights
    model.summary()
    
    # want a regression, so loss is mean squared error instead of categorical_crossentropy
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    return model

In [22]:
# evaluate model with standardized dataset
easy_estimator = KerasRegressor(build_fn=easy_model, epochs=10, batch_size=32, verbose=0)

In [24]:
# perform cross validation
easy_kfold = KFold(n_splits=10)
easy_results = cross_val_score(easy_estimator, x_train, y_train, cv=easy_kfold)
print("Results: %.2f (%.2f) MSE" % (easy_results.mean(), easy_results.std()))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 400)               153200    
_________________________________________________________________
activation_4 (Activation)    (None, 400)               0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 250)               100250    
_________________________________________________________________
activation_5 (Activation)    (None, 250)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 350)               87850     
__________

In [25]:
# fit to train, predict both train and test
easy_estimator.fit(x_train, y_train)
easy_y_train = easy_estimator.predict(x_train)
easy_y_test = easy_estimator.predict(x_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_45 (Dense)             (None, 400)               153200    
_________________________________________________________________
activation_34 (Activation)   (None, 400)               0         
_________________________________________________________________
dropout_23 (Dropout)         (None, 400)               0         
_________________________________________________________________
dense_46 (Dense)             (None, 250)               100250    
_________________________________________________________________
activation_35 (Activation)   (None, 250)               0         
_________________________________________________________________
dropout_24 (Dropout)         (None, 250)               0         
_________________________________________________________________
dense_47 (Dense)             (None, 350)               87850     
__________

In [26]:
print (easy_y_train.shape)
print (easy_y_test.shape)

(64667,)
(16000,)


In [27]:
# look at numbers of predictions out of bounds
print (np.sum(np.array(easy_y_train) < 0))
print (np.sum(np.array(easy_y_train) > 1))
print (np.sum(np.array(easy_y_test) < 0))
print (np.sum(np.array(easy_y_test) > 1))

1515
31
405
5


In [28]:
# reshape to (x, 1) and normalize to (0,1)
norm_y_train = easy_y_train.reshape(64667, 1)
norm_y_test = easy_y_test.reshape(16000, 1)

# normalize output data
norm_y_train = mmscaler.fit_transform(norm_y_train)
norm_y_test = mmscaler.fit_transform(norm_y_test)

# check shapes
print (norm_y_train.shape)
print (norm_y_test.shape)

(64667, 1)
(16000, 1)


In [29]:
roc_auc_score(y_train, norm_y_train)

0.7832419006982481

In [29]:
# output to file
ids = test[:,:1]
ids = ids.astype(int)
predictions = norm_y_test.astype(float)
out = np.concatenate((ids, predictions), axis=1)

np.savetxt('adv_submission.csv', out, delimiter=',', fmt='%1.10f', header = 'id,target')
# NOTE: REMEMBER TO REMOVE THE # SYMBOL FROM THE ID HEADER BEFORE SUBMITTING