In [1]:
from keras.layers import Activation, Dense
from keras.models import Sequential
from keras.optimizers import SGD, Adam

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

In [3]:
# Column labels are supplied explicitly as the strings '0'..'55'.
train = pd.read_csv('data/train.csv', names=[str(i) for i in range(56)])

In [4]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
0,447095,3452,111,22,433,214,3677,252,210,74,...,0,0,0,0,0,0,0,1,0,0
1,113427,3093,95,9,124,7,4115,234,227,124,...,0,0,0,0,0,0,0,0,0,1
2,66435,2551,61,17,90,5,726,231,202,98,...,0,0,0,0,0,0,0,0,0,1
3,8957,2944,135,3,430,13,1868,224,238,149,...,0,0,0,0,0,0,0,0,0,1
4,434631,3030,327,34,277,101,1973,120,181,190,...,0,1,0,0,0,0,0,0,0,1


In [5]:
def normalize(df, cols):
    """Standardise the given columns of ``df`` with one ``StandardScaler`` each.

    Every column named in ``cols`` is overwritten on ``df`` itself (the frame
    passed in is modified, not copied) with the output of a scaler fitted on
    that same column.

    Args:
        df: DataFrame containing the columns to scale.
        cols: Iterable of column labels to standardise.

    Returns:
        A ``(df, norms)`` tuple: the same frame object, and a dict mapping
        each column label to its fitted scaler so the identical transform can
        be applied to other data later.
    """
    norms = {}
    for col in cols:
        # The scaler is given a single-column 2-D array, hence the reshape.
        column_values = np.array(df[col]).reshape(-1, 1)
        scaler = StandardScaler().fit(column_values)
        df[col] = scaler.transform(column_values)
        norms[col] = scaler
    return df, norms

In [6]:
def dump_csv(model, test_data, ids, filename):
    """Score ``test_data`` with ``model`` and write the results to a CSV file.

    Args:
        model: Object exposing ``predict(test_data)``.
        test_data: Input handed unchanged to ``model.predict``.
        ids: One identifier per row; written to the ``key`` column.
        filename: Destination path of the CSV file.

    The file has two columns, ``key`` then ``score``, and no index column.
    A confirmation message is printed once the file has been written.
    """
    preds = model.predict(test_data)
    # Predictions are flattened to 1-D so they line up with ``ids`` row by row.
    scores = np.asarray(preds).ravel()
    # DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0;
    # a dict plus an explicit ``columns`` list keeps the same column order.
    df = pd.DataFrame({'key': ids, 'score': scores}, columns=['key', 'score'])
    df.to_csv(filename, index=False)
    print('CSV saved successfully')

In [7]:
# Standardise columns '1'..'10' on a copy so the raw `train` frame stays untouched;
# `norms` keeps the fitted scalers for reuse on other data.
train_normed, norms = normalize(train.copy(), [str(i) for i in range(1, 11)])



In [8]:
# Columns '1'..'54' are the model inputs; column '55' is the target.
feat_cols = [str(i) for i in range(1, 55)]
X, y = train_normed[feat_cols], train_normed['55']

In [9]:
# Preview the feature matrix taken from the scaled training frame.
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,45,46,47,48,49,50,51,52,53,54
0,1.760238,-0.400681,1.056245,0.767481,2.868112,0.84906,1.490781,-0.67521,-1.792759,1.008587,...,0,0,0,0,0,0,0,0,1,0
1,0.477017,-0.543573,-0.681745,-0.685561,-0.676803,1.129632,0.817984,0.185496,-0.485918,0.717097,...,0,0,0,0,0,0,0,0,0,0
2,-1.460326,-0.847217,0.387787,-0.845442,-0.711054,-1.041279,0.705851,-1.080248,-1.165475,-0.928386,...,0,0,0,0,0,0,0,0,0,0
3,-0.055573,-0.186344,-1.483895,0.753374,-0.574052,-0.309741,0.444208,0.742424,0.167502,-0.590077,...,0,0,0,0,0,0,0,0,0,0
4,0.251828,1.528354,2.660543,0.033907,0.932965,-0.242481,-3.44306,-2.143474,1.239112,-0.21703,...,0,0,1,0,0,0,0,0,0,0


In [10]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

In [11]:
# Shape of the training split as (rows, columns).
x_train.shape

(325367, 54)

In [12]:
# Fully connected network: 54 inputs -> 100 -> 50 -> 25 tanh units -> 1 sigmoid output.
model = Sequential([
    Dense(100, input_dim=54),
    Activation('tanh'),
    Dense(50),
    Activation('tanh'),
    Dense(25),
    Activation('tanh'),
    Dense(1),
    Activation('sigmoid'),
])

In [13]:
# Print the layer-by-layer summary of the network built above.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 100)               5500      
_________________________________________________________________
activation_1 (Activation)    (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
activation_2 (Activation)    (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 25)                1275      
_________________________________________________________________
activation_3 (Activation)    (None, 25)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 26        
__________

In [14]:
# Binary cross-entropy loss with a default-configured Adam optimizer;
# accuracy is tracked as an additional metric.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

In [23]:
# Train for 50 epochs in batches of 3000 rows, evaluating on the held-out split.
# `.values` replaces `as_matrix()`, which pandas deprecated in 0.23 and removed in 1.0.
# Both target vectors are reshaped to a single column so the training and
# validation targets have the same shape.
model.fit(
    x_train.values,
    y_train.values.reshape(-1, 1),
    epochs=50,
    verbose=1,
    batch_size=3000,
    validation_data=(x_val.values, y_val.values.reshape(-1, 1)),
)

Train on 325367 samples, validate on 81342 samples
Epoch 1/50
 27000/325367 [=>............................] - ETA: 0s - loss: 0.1785 - acc: 0.9281

  """Entry point for launching an IPython kernel.
  


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f6bd4b442b0>

In [24]:
# Persist the trained model to disk.
# NOTE(review): '.h5py' is not one of the usual HDF5 suffixes ('.h5' / '.hdf5');
# some Keras versions pick the file format from the suffix — confirm this loads back.
model.save('FeedforwardNet-03-0.93.h5py')

In [17]:
# Column labels are supplied explicitly as the strings '0'..'54'.
test = pd.read_csv('data/test.csv', names=[str(i) for i in range(55)])

In [19]:
# Reuse the scalers fitted on the training data so the test columns get the same transform.
# This overwrites `test` in place, so running the cell a second time scales the columns again.
for col, scaler in norms.items():
    column_values = np.array(test[col]).reshape(-1, 1)
    test[col] = scaler.transform(column_values)



In [20]:
# Preview the test frame after the training scalers were applied.
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,258918,-2.171638,1.22471,1.189936,-0.064844,-0.026046,-1.377581,-2.284355,0.236126,1.892532,...,0,0,0,0,0,0,0,0,0,0
1,258134,-0.606036,-0.606088,-0.815437,0.283134,-0.197298,-0.609531,0.743229,0.185496,-0.381371,...,0,0,0,0,0,0,0,0,0,0
2,165265,-0.477357,-0.364958,-0.949128,-0.506869,-0.385675,-0.852949,0.743229,0.489275,-0.22455,...,0,0,0,0,0,0,0,0,0,0
3,522141,0.981012,-0.168483,0.254095,-0.563298,-0.454176,-0.886259,1.154382,0.438645,-0.79956,...,0,0,0,0,0,0,0,0,0,0
4,569662,-0.534547,-0.87401,1.056245,-0.685561,-0.36855,0.603079,0.668474,-1.789065,-1.635938,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Select the same feature columns that were used to build the training matrix.
x_test = test[feat_cols]

In [25]:
# Score the test features and write (key, score) rows, using column '0' as the key.
# `.values` replaces `as_matrix()`, which pandas deprecated in 0.23 and removed in 1.0.
dump_csv(model, x_test.values, test['0'].values, 'Experiment-05-FFN.csv')

  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


CSV saved successfully


In [22]:
# Persist the model to disk under a second filename.
# NOTE(review): this writes the same `model` object that is also saved as
# 'FeedforwardNet-03-0.93.h5py' — confirm both files are intended.
model.save('FeedforwardNet-01.h5py')

In [23]:
# Keep the raw test-set scores in memory for inspection.
# `.values` replaces `as_matrix()`, which pandas deprecated in 0.23 and removed in 1.0.
predictions = model.predict(x_test.values)

  """Entry point for launching an IPython kernel.


array([0.02121277, 0.9571678 , 0.8251506 , ..., 0.5365085 , 0.93746066,
       0.02533947], dtype=float32)