# Helper functions

In [31]:
import base64
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn import preprocessing

def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Replace column ``name`` of ``df`` with its z-score, in place.

    Args:
        df: DataFrame holding the column; it is modified in place.
        name: Label of the column to standardise.
        mean: Centre to subtract. When ``None`` the column's own
            ``.mean()`` is used.
        sd: Scale to divide by. When ``None`` the column's own
            ``.std()`` is used.

    Returns:
        None. The result is written back to ``df[name]``.
    """
    column = df[name]
    # Only ``None`` triggers the fallback, so an explicit 0 is honoured.
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread

def to_xy(df, target):
    """Split ``df`` into a feature matrix and a target array, both float32.

    Every column other than ``target`` becomes a feature. If the target
    column's dtype is int64 or int32 it is treated as a class label and
    one-hot encoded with ``pd.get_dummies``; any other dtype is treated as
    a regression target and returned as a single-column 2-D array.

    Args:
        df: Source DataFrame.
        target: Label of the target column.

    Returns:
        Tuple ``(x, y)`` of ``np.float32`` arrays.
    """
    feature_columns = [col for col in df.columns if col != target]

    # ``dtypes`` is a single dtype for a Series; if it is iterable instead,
    # take its first entry.
    target_type = df[target].dtypes
    if hasattr(target_type, '__iter__'):
        target_type = target_type[0]

    # Integer targets -> classification (one-hot); otherwise regression.
    is_classification = target_type in (np.int64, np.int32)
    encoded_target = pd.get_dummies(df[target]) if is_classification else df[[target]]

    # Both outputs are cast to 32-bit floats.
    features = df[feature_columns].values.astype(np.float32)
    return features, encoded_target.values.astype(np.float32)

# Experiment

In [32]:
import pandas as pd
import numpy as np

In [90]:
# Load the experiment table; the path is relative to the working directory.
data = pd.read_csv("data/experiment_2.csv")

In [96]:
# Drop (near-)constant columns: a standard deviation below 1e-10 would make
# the later z-score division blow up. The per-column std is computed once
# for the whole frame and all offending columns are removed in one drop,
# rather than rebuilding the frame in place for every column.
# Columns whose std is NaN compare False against the threshold and are kept.
column_std = data.std()
constant_columns = column_std.index[column_std < 1e-10]
data = data.drop(columns=constant_columns)

In [97]:
# Sanity check: (rows, columns) of the frame at this point in the notebook.
data.shape

(23788, 1107)

In [98]:
# Columns to standardise: everything except the first column.
# NOTE(review): this assumes the target ("score") is the first column, so it
# is left unscaled -- confirm against the CSV header.
x_var = data.columns[1:]

In [99]:
# Standardise every selected column in place, using that column's own mean
# and standard deviation over the full dataset.
# NOTE(review): the statistics are computed before the K-fold split, so
# held-out rows contribute to the scaling -- confirm this is acceptable.
for feature_name in x_var:
    encode_numeric_zscore(data, feature_name)

In [100]:
# Force the target to a float dtype so that to_xy() takes its regression
# branch (an int64/int32 target would be one-hot encoded as class labels).
# astype() does the conversion in one vectorised step instead of calling
# float() on each element through apply().
data["score"] = data["score"].astype(float)

In [101]:
from sklearn.model_selection import train_test_split
# Split the frame into a float32 feature matrix and the "score" target array.
x, y = to_xy(data, "score")
# Single hold-out split, disabled; K-fold cross-validation is used instead.
#x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.25, random_state=66)

In [102]:
import os
from scipy.stats import zscore
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
import io
import requests
from sklearn import metrics
from sklearn.model_selection import KFold
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.layers.advanced_activations import LeakyReLU

In [108]:
# 5-fold cross-validation of a dense regression network on (x, y).
# Each fold trains a fresh model, predicts its held-out rows, and the
# held-out targets/predictions are pooled for one overall RMSE at the end.
kf = KFold(5)
oos_y = []     # held-out targets, one array per fold
oos_pred = []  # held-out predictions, one array per fold

for fold, (train, test) in enumerate(kf.split(x), start=1):
    print("Fold #{}".format(fold))

    x_train, y_train = x[train], y[train]
    x_test, y_test = x[test], y[test]

    # NOTE(review): only the first layer is given an activation; the 400/200/100
    # layers pass no `activation` argument -- confirm this is intended.
    model = Sequential([
        Dense(800, input_dim=x.shape[1], activation="relu"),
        Dense(400),
        Dense(200),
        Dense(100),
        Dense(1),
    ])
    model.compile(loss='mean_squared_error', optimizer='adam')

    # NOTE(review): the held-out fold is also the validation set that drives
    # early stopping, so the fold scores below are not fully independent of
    # training -- confirm this is acceptable.
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(
        x_train, y_train,
        validation_data=(x_test, y_test),
        callbacks=[monitor],
        verbose=0,
        epochs=500,
        batch_size=128,
    )

    pred = model.predict(x_test)

    oos_y.append(y_test)
    oos_pred.append(pred)

    # Measure this fold's RMSE (arguments in sklearn's y_true, y_pred order;
    # the squared error itself is symmetric).
    score = np.sqrt(metrics.mean_squared_error(y_test, pred))
    print("Fold score (RMSE): {}".format(score))

# Pool all held-out rows and report a single out-of-sample RMSE.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_y, oos_pred))
print("Final, out of sample score (RMSE): {}".format(score))

Fold #1
Epoch 00010: early stopping
Fold score (RMSE): 6.604328632354736
Fold #2
Epoch 00030: early stopping
Fold score (RMSE): 6.8514604568481445
Fold #3
Epoch 00017: early stopping
Fold score (RMSE): 6.611143112182617
Fold #4
Epoch 00026: early stopping
Fold score (RMSE): 5.917877197265625
Fold #5
Epoch 00020: early stopping
Fold score (RMSE): 6.540044784545898
Final, out of sample score (RMSE): 6.512477397918701
