In [1]:
# imports
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout

Using TensorFlow backend.


In [2]:
# check tensorflow version
tf.VERSION

'1.12.0'

In [3]:
# check keras version
keras.__version__

'2.2.4'

In [10]:
from keras import backend as k_back

In [11]:
# define the AUC metric being used in the competition
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    k_back.get_session().run(tf.local_variables_initializer())
    return auc

In [4]:
def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

In [56]:
train = load_data('data/train_2008.csv', 1)
test = load_data('data/test_2008.csv', 1)

In [83]:
x_train = train[:, :-1]
y_train = train[:, -1]

x_test = test[:,:]

In [84]:
# look at the shapes
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(64667, 382)
(16000, 382)
(64667,)


In [85]:
# clean up data, removing unnecessary columns
x_train = np.delete(x_train, [0,1,2], 1)
x_test = np.delete(x_test, [0,1,2], 1)

In [86]:
# look at the shapes
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(64667, 379)
(16000, 379)
(64667,)


In [87]:
# check for nans
print (np.isnan(x_train).any())
print (np.isnan(x_test).any())

False
False


In [88]:
# make list for deletion based on non-unique vals
deletion = []
#tdeletion = []

# normalize everything by iterating over columns
for i in range(x_train.shape[1]):
    # first, check that this column has >1 unique values
    if np.unique(x_train[:,i]).shape[0] == 1:
        # throw away this column for now
        deletion.append(i)
    else:
        low = np.min(x_train[:,i])
        high = np.max(x_train[:,i])
        x_train[:,i] = (x_train[:,i] - low) / (high - low)
        x_test[:,i] = (x_test[:,i] - low) / (high - low)
        
    '''
    if np.unique(x_test[:,i]).shape[0] == 1:
        tdeletion.append(i)
    else:
        tlow = np.min(x_test[:,i])
        thigh = np.max(x_test[:,i])
    '''

# clean up data, removing unnecessary columns
x_train = np.delete(x_train, deletion, 1)
x_test = np.delete(x_test, deletion, 1)

In [89]:
deletion

[9, 11, 13, 44, 55, 126, 127, 128, 132, 133, 134, 251, 255]

In [90]:
#tdeletion

In [91]:
x_train.shape

(64667, 366)

In [92]:
x_test.shape

(16000, 366)

In [95]:
# set up a model
model = Sequential()
model.add(Dense(400, input_dim=x_train.shape[1])) # train with 400 units
model.add(Activation('relu'))
model.add(Dropout(0.3)) # regularization - throw out 30% of weights
model.add(Dense(250)) # train with 300 units
model.add(Activation('relu'))
model.add(Dropout(0.15)) # regularization - throw out 15% of weights
model.add(Dense(350)) # train with 300 units
model.add(Activation('relu'))
model.add(Dense(1)) # dense(1) in order to output one probability value
model.add(Activation('softmax')) # strictly speaking softmax might not be needed? unless value isn't a probability

# print summary of layers and weights
model.summary()

# want a regression, so loss is mean squared error instead of categorical_crossentropy
model.compile(loss='mse', optimizer='rmsprop', metrics=['accuracy', 'mse', auc])

fit = model.fit(x_train, y_train, batch_size=32, epochs=10, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 400)               146800    
_________________________________________________________________
activation_13 (Activation)   (None, 400)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 400)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 250)               100250    
_________________________________________________________________
activation_14 (Activation)   (None, 250)               0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 350)               87850     
__________

In [96]:
results = model.predict(x_test)

In [97]:
results.shape

(16000, 1)

In [98]:
results

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]], dtype=float32)