In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from __future__ import absolute_import, division, print_function
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.regularizers import l2
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from keras.layers.advanced_activations import SReLU
print('The scikit-learn version is {}.'.format(sklearn.__version__))

Using TensorFlow backend.


The scikit-learn version is 0.18.1.


In [2]:
#Load the training and test files
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')
print('training: ', df_train.shape)
print('test: ', df_test.shape)

training:  (188318, 132)
test:  (125546, 131)


In [3]:
# Capture column names in variables
cols = df_train.columns
cols_test = df_test.columns 

In [4]:
cols

Index(['id', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9',
       ...
       'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12',
       'cont13', 'cont14', 'loss'],
      dtype='object', length=132)

In [5]:
# Separate features and target within training set
# Drop ID column from training data
X_train = df_train
y_train = X_train[cols[-1]]
X_train = X_train[cols[1:-1]]

# Drop ID column from test data
X_test = df_test
X_test = X_test[cols_test[1:]]

# Reset column variables
cols = X_train.columns
cols_test = X_test.columns 


In [6]:
# First 116 features look to be categorical in nature; Ran into errors just attempting to one hot encode
# training data, fitting model and applying to test set.  Test set has categories that are not in training set.

# Need to one hot encode across training and test sets because some categorical data is unique to either the 
# training or test data sets

# Variable to hold the list of labels for all category features in the train and test data
#
labels = []

# Loop through all the data to find all unique categories in both test and training data and append 
# to list of labels
for i in range(0,116):
    train = X_train[cols[i]].unique()
    test = X_test[cols[i]].unique()
    labels.append(list(set(train) | set(test)))    


In [7]:
#Import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder



In [8]:


#One hot encode all categorical features - training set
cats = []
for i in range(0, 116):
    # Standardize labels by label encoding category data from 0 to n_classes-1
    label_encoder = LabelEncoder()
    label_encoder.fit(labels[i])
    feature = label_encoder.transform(X_train.iloc[:,i])
    feature = feature.reshape(X_train.shape[0], 1)
    # One hot encode
    onehot_encoder = OneHotEncoder(sparse=False,n_values=len(labels[i]))
    feature = onehot_encoder.fit_transform(feature)
    cats.append(feature)

# Make a 2D array from a list of 1D arrays
encoded_cats = np.column_stack(cats)

# Print the shape of the encoded data
print(encoded_cats.shape)

# Concatenate continuous features to newly encoded attributes 
X_train = np.concatenate((encoded_cats,X_train.iloc[:,116:].values),axis=1)
del cats
del feature

del encoded_cats
print(X_train.shape)

(188318, 1176)
(188318, 1190)


In [9]:
# As above...One hot encode all categorical features - test set
cats = []
for i in range(0, 116):
    #Label encode
    label_encoder = LabelEncoder()
    label_encoder.fit(labels[i])
    feature = label_encoder.transform(X_test.iloc[:,i])
    feature = feature.reshape(X_test.shape[0], 1)
    #One hot encode
    onehot_encoder = OneHotEncoder(sparse=False,n_values=len(labels[i]))
    feature = onehot_encoder.fit_transform(feature)
    cats.append(feature)

# Make a 2D array from a list of 1D arrays
encoded_cats = np.column_stack(cats)

# Print the shape of the encoded data
print(encoded_cats.shape)

# Concatenate encoded attributes with continuous attributes
X_test = np.concatenate((encoded_cats,X_test.iloc[:,116:].values),axis=1)
del cats
del feature

del encoded_cats
print(X_test.shape)

(125546, 1176)
(125546, 1190)


In [10]:
r, c = X_train.shape

# split into training & validation data
from sklearn import cross_validation
X_train, X_val, y_train, y_val = cross_validation.train_test_split(X_train, y_train, test_size=0.2, random_state=0)



In [11]:
X_val.shape

(37664, 1190)

In [12]:

def FFNN_model():
    model = Sequential()
    model.add(Dense(400, input_dim = c, W_regularizer=l2(0.1), init = 'he_normal'))
    model.add(SReLU())
    model.add(Dropout(0.4))
    
    model.add(Dense(200, W_regularizer=l2(0.1), init = 'he_normal'))
    model.add(SReLU())
    model.add(Dropout(0.2))
    
    model.add(Dense(1, init = 'he_normal'))
    
    # Use Mean Absolute Error as loss function per Kaggle
    model.compile(loss = 'mae', optimizer = 'adadelta')

    return model

In [13]:
np.random.seed(0)

# Create scikit-learn Pipeline
# 1. Perform standard scaling of continuous data features (won't affect categorical features)
# 2. Fit model
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasRegressor(build_fn=FFNN_model, nb_epoch=35, batch_size=40, verbose=2)))
pipeline = Pipeline(estimators)


pipeline.fit(X_train, y_train)


Epoch 1/35
32s - loss: 1325.8625
Epoch 2/35
29s - loss: 1243.0040
Epoch 3/35
29s - loss: 1226.2859
Epoch 4/35
29s - loss: 1215.3820
Epoch 5/35
29s - loss: 1206.0604
Epoch 6/35
29s - loss: 1199.8269
Epoch 7/35
30s - loss: 1193.4209
Epoch 8/35
33s - loss: 1188.2144
Epoch 9/35
27s - loss: 1185.1932
Epoch 10/35
30s - loss: 1180.8084
Epoch 11/35
30s - loss: 1177.4782
Epoch 12/35
30s - loss: 1174.8147
Epoch 13/35
30s - loss: 1171.0135
Epoch 14/35
30s - loss: 1167.6923
Epoch 15/35
30s - loss: 1164.9393
Epoch 16/35
30s - loss: 1161.7153
Epoch 17/35
30s - loss: 1157.9290
Epoch 18/35
30s - loss: 1156.1660
Epoch 19/35
30s - loss: 1152.0563
Epoch 20/35
29s - loss: 1150.0706
Epoch 21/35
30s - loss: 1147.7119
Epoch 22/35
31s - loss: 1145.5323
Epoch 23/35
31s - loss: 1144.1343
Epoch 24/35
31s - loss: 1140.6247
Epoch 25/35
31s - loss: 1138.2110
Epoch 26/35
37s - loss: 1136.1244
Epoch 27/35
43s - loss: 1132.9514
Epoch 28/35
29s - loss: 1133.5916
Epoch 29/35
29s - loss: 1130.6008
Epoch 30/35
29s - loss:

Pipeline(steps=[('standardize', StandardScaler(copy=True, with_mean=True, with_std=True)), ('mlp', <keras.wrappers.scikit_learn.KerasRegressor object at 0x7fc1b3b9d128>)])

In [14]:
pred_val = pipeline.predict(X_val)

result = mean_absolute_error(y_val, pred_val)
result                      
# Activation ..> val score

# SRelU(10) -> 1173.5
# SReLU(20) -> 1166.9
# SReLU(23) -> 1167.8
# SReLU(25) -> 1163.9 --> 1143
# SRelu(30) -> 1166.4 ->> 1146
# SReLU(60) -> 1173.0 --> 1155 (on kaggle)

# Reg(0.01), Epoch(25) -> 1161.7
# Reg(0.01), Epoch(30) -> 1164.6
# Reg(0.10), Epoch(30) -> 1158.9 --> 1143
# Reg(0.10), Epoch(35) 1118 -> 1159.6



1159.9875111092931

In [15]:
pred_targets = pipeline.predict(X_test)

In [16]:
# pred_targets

In [17]:

output = pd.DataFrame({
        "ID": df_test["id"],
        "loss": pred_targets[:]
    })
output.to_csv("../input/output.csv", index=False)



PermissionError: [Errno 13] Permission denied: '../input/output.csv'