In [1]:
%matplotlib inline
import numpy as np
from sklearn.metrics import classification_report
from sklearn import preprocessing
import time
import theano 
from theano import tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

def binarizeY(data):
    binarized_data = np.zeros((data.size,8))
    for j in range(0,data.size):
        feature = data[j:j+1]
        i = feature.astype(np.int64) 
        binarized_data[j,i]=1
    return binarized_data

train_labels_b = binarizeY(train_data.Cover_Type)

train_data_scaled = train_data.copy()
test_data_scaled = test_data.copy()

del train_data_scaled['Cover_Type']
del train_data_scaled['Id']

min_max_scaler = preprocessing.MinMaxScaler()

for col in train_data_scaled.keys():
    x = train_data_scaled[col].values.astype(float)
    train_data_scaled[col] = preprocessing.scale(min_max_scaler.fit_transform(x))

In [3]:
train_data_scaled.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40
0,-0.367095,-0.95998,-1.597132,0.146639,-0.834074,-0.908681,0.271454,0.571653,0.281259,4.334805,...,-0.149835,-0.218671,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939
1,-0.381461,-0.914559,-1.715424,-0.072337,-0.932054,-0.999246,0.238732,0.703225,0.346627,4.28571,...,-0.149835,-0.218671,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939
2,0.130912,-0.160577,-0.887379,0.194243,0.227369,1.106379,0.696843,0.834797,-0.002005,4.191156,...,-0.149835,-0.218671,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939
3,0.085421,-0.015231,0.17725,0.070474,1.092853,1.038455,0.827731,0.834797,-0.285268,4.272981,...,-0.149835,-0.218671,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939
4,-0.369489,-1.014485,-1.715424,-0.353198,-0.850404,-0.998491,0.238732,0.659368,0.324838,4.237524,...,-0.149835,-0.218671,-0.206085,-0.038173,-0.082413,-0.025726,-0.047474,-0.224908,-0.213134,-0.176939


In [4]:
test_data_scaled = test_data.copy()

del test_data_scaled['Id']

for col in test_data_scaled.keys():
    x = test_data_scaled[col].values.astype(float)
    test_data_scaled[col] = preprocessing.scale(min_max_scaler.fit_transform(x))


In [7]:
numFeatures = train_data_scaled.shape[1]
numClasses = train_labels_b[1].size
numHiddenNodes = 600 
w_1 = theano.shared(np.asarray((np.random.randn(*(numFeatures, numHiddenNodes))*.01)))
w_2 = theano.shared(np.asarray((np.random.randn(*(numHiddenNodes, numClasses))*.01)))
params = [w_1, w_2]


## (2) Model
X = T.matrix()
Y = T.matrix()
# Two notes:
# First, feed forward is the composition of layers (dot product + activation function)
# Second, activation on the hidden layer still uses sigmoid
def model(X, w_1, w_2):
    return T.nnet.softmax(T.dot(T.nnet.sigmoid(T.dot(X, w_1)), w_2))
y_hat = model(X, w_1, w_2)


## (3) Cost...same as logistic regression
cost = T.mean(T.nnet.categorical_crossentropy(y_hat, Y))


## (4) Minimization.  Update rule changes to backpropagation.
alpha = 0.01
def backprop(cost, w):
    grads = T.grad(cost=cost, wrt=w)
    updates = []
    for w1, grad in zip(w, grads):
        updates.append([w1, w1 - grad * alpha])
    return updates
update = backprop(cost, params)
train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
y_pred = T.argmax(y_hat, axis=1)
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)

miniBatchSize = 1 
def gradientDescentStochastic(epochs):
    trainTime = 0.0
    predictTime = 0.0
    start_time = time.time()
    for i in range(epochs):
        for start, end in zip(range(0, len(train_data_scaled), miniBatchSize), range(miniBatchSize, len(train_data), miniBatchSize)):
            cost = train(train_data_scaled[start:end], train_labels_b[start:end])
        trainTime =  trainTime + (time.time() - start_time)
        pred = predict(test_data_scaled)
    print 'train time = %.2f' %(trainTime)
    return pred

pred = gradientDescentStochastic(200)

start_time = time.time()
predict(test_data_scaled)   
print 'predict time = %.2f' %(time.time() - start_time)

train time = 538179.53
predict time = 11.55


In [8]:
pred[:10]

array([4, 4, 4, 4, 4, 4, 4, 4, 4, 4])

In [10]:
test_data.head()
output = pd.DataFrame(test_data['Id'], columns=['Id'])
output['Cover_Type'] = pred

In [11]:
output.to_csv('nn.csv', index = False, index_label = False)

In [2]:
1024*16

16384