In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
from keras import backend as k_back

In [3]:
# define the AUC metric being used in the competition
def auc(y_true, y_pred):
    auc = tf.metrics.auc(y_true, y_pred)[1]
    k_back.get_session().run(tf.local_variables_initializer())
    return auc

In [4]:
def load_data(filename, skiprows = 1):
    """
    Function loads data stored in the file filename and returns it as a numpy ndarray.
    
    Inputs:
        filename: given as a string.
        
    Outputs:
        Data contained in the file, returned as a numpy ndarray
    """
    return np.loadtxt(filename, skiprows=skiprows, delimiter=',')

In [36]:
train = load_data('data/train_2008.csv', 1)
test = load_data('data/test_2008.csv', 1)

In [6]:
x_train = train[:, :-1]
y_train = train[:, -1]

x_test = test[:,:]

In [7]:
# look at the shapes
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(64667, 382)
(16000, 382)
(64667,)


In [9]:
merged = np.concatenate((x_train, x_test), axis=0)

In [10]:
merged.shape

(80667, 382)

In [11]:
# scale x vals, y vals already in range [0,1]
sc_X = StandardScaler()
merged = sc_X.fit_transform(merged)

'''
sc_Y = StandardScaler()
Y_train = sc_Y.fit_transform(Y)
'''
merged.shape

(80667, 382)

In [12]:
# break up into test and train again
x_train = merged[:64667,:]
x_test = merged[64667:,:]

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)

(64667, 382)
(16000, 382)
(64667,)


In [20]:
# define base model
def easy_model():
    # create model
    model = Sequential()
    model.add(Dense(13, input_dim=x_train.shape[1], kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [21]:
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=easy_model, epochs=10, batch_size=32, verbose=0)

In [22]:
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, x_train, y_train, cv=kfold)
print("Results: %.2f (%.2f) MSE" % (results.mean(), results.std()))

Results: -0.18 (0.04) MSE


In [23]:
estimator.fit(x_train, y_train)
prediction = estimator.predict(x_test)

In [24]:
prediction.shape

(16000,)

In [25]:
prediction

array([0.20077398, 0.03065132, 0.13829222, ..., 0.23666885, 0.11164367,
       0.22964296], dtype=float32)

In [26]:
prediction = prediction.reshape(16000, 1)
ids = test[:,:1]
ids = ids.astype(int)
prediction = prediction.astype(float)
test = np.concatenate((ids, prediction), axis=1)

np.savetxt('submission.csv', test, delimiter=',', fmt='%1.10f', header = 'id,target')

In [27]:
# normalize output data
sc_y = StandardScaler()
norm_predict = sc_y.fit_transform(prediction)
norm_predict.shape

(16000, 1)

In [28]:
norm_predict = norm_predict.reshape(16000, 1)
ids = test[:,:1]
ids = ids.astype(int)
norm_predict = norm_predict.astype(float)
out = np.concatenate((ids, norm_predict), axis=1)

np.savetxt('norm_submission.csv', out, delimiter=',', fmt='%1.10f', header = 'id,target')

In [30]:
# try minmaxscaling
from sklearn.preprocessing import MinMaxScaler

In [37]:
x_train2 = train[:, :-1]
y_train2 = train[:, -1]

x_test2 = test[:,:]

In [38]:
merged2 = np.concatenate((x_train2, x_test2), axis=0)
merged2.shape

(80667, 382)

In [40]:
mm_x = MinMaxScaler()
merged2 = mm_x.fit_transform(merged2)

In [41]:
# break up into test and train again
x_train2 = merged2[:64667,:]
x_test2 = merged2[64667:,:]

print(x_train2.shape)
print(x_test2.shape)
print(y_train2.shape)

(64667, 382)
(16000, 382)
(64667,)


In [42]:
# evaluate model with standardized dataset
estimator2 = KerasRegressor(build_fn=easy_model, epochs=10, batch_size=32, verbose=0)

In [43]:
kfold2 = KFold(n_splits=10)
results2 = cross_val_score(estimator2, x_train2, y_train2, cv=kfold2)
print("Results: %.2f (%.2f) MSE" % (results2.mean(), results2.std()))

Results: -0.16 (0.00) MSE


In [44]:
estimator2.fit(x_train2, y_train2)
prediction2 = estimator2.predict(x_test2)

In [45]:
prediction2.shape

(16000,)

In [46]:
prediction2 = prediction2.reshape(16000, 1)
ids2 = test[:,:1]
ids2 = ids2.astype(int)
prediction2 = prediction2.astype(float)
out2 = np.concatenate((ids2, prediction2), axis=1)

np.savetxt('submission.csv', out2, delimiter=',', fmt='%1.10f', header = 'id,target')

In [56]:
np.sum(np.array(prediction2) > 1)

10

In [58]:
# normalize output data
sc_y2 = MinMaxScaler()
norm_predict2 = sc_y2.fit_transform(prediction2)
norm_predict2.shape

(16000, 1)

In [59]:
norm_predict2 = norm_predict2.reshape(16000, 1)
ids3 = test[:,:1]
ids3 = ids3.astype(int)
norm_predict2 = norm_predict2.astype(float)
out3 = np.concatenate((ids3, norm_predict2), axis=1)

np.savetxt('norm_submission.csv', out3, delimiter=',', fmt='%1.10f', header = 'id,target')

In [61]:
training_prediction2 = estimator2.predict(x_train2)

In [62]:
test_prediction2 = estimator2.predict(x_test2)

In [63]:
auc_score = auc(training_prediction2, y_train)

In [64]:
auc_score

<tf.Tensor 'auc/update_op:0' shape=() dtype=float32>

In [65]:
from sklearn.metrics import roc_auc_score

In [67]:
np.sum(np.array(training_prediction2) < 0)

1487

In [69]:
training_prediction2.shape

(64667,)

In [75]:
norm_tr_predict2 = training_prediction2.reshape(64667, 1)

In [76]:
# normalize output data
sc_y2 = MinMaxScaler()
norm_tr_predict2 = sc_y2.fit_transform(norm_tr_predict2)
norm_tr_predict2.shape

(64667, 1)

In [77]:
roc_auc_score(y_train2, norm_tr_predict2)

0.7823736804891283