In [37]:
import pandas as pd
import pandas_ml as pdml
import numpy as np
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.metrics import roc_auc_score as AUC

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

pd.set_option('display.max_rows', 8)
pd.set_option('display.max_rows', 6)

train_df = pd.read_csv('data3.csv',header=0, index_col=0)
test_df = pd.read_csv('quiz.csv',header=0, index_col=0)
print(train_df.shape)
# convert to pdml.ModelFrame
#train_df = pdml.ModelFrame(train_df,target='label')
#train_df=train_df.dropna()
#test_df = pdml.ModelFrame(test_df)
#test_df=test_df.dropna()
#print (train_df)
print(test_df.shape)

(126837, 52)
(31709, 51)


In [38]:
import numpy as np

def tanh(x):
    return np.tanh(x)

def tanh_deriv(x):
    return 1.0 - np.tanh(x)**2

def logistic(x):
    return 1/(1 + np.exp(-x))

def logistic_derivative(x):
    return logistic(x)*(1-logistic(x))

In [42]:
class NeuralNetwork:
    def __init__(self, layers, activation='tanh'):
        """
        :param layers: A list containing the number of units in each layer.
        Should be at least two values
        :param activation: The activation function to be used. Can be
        "logistic" or "tanh"
        """
        if activation == 'logistic':
            self.activation = logistic
            self.activation_deriv = logistic_derivative
        elif activation == 'tanh':
            self.activation = tanh
            self.activation_deriv = tanh_deriv
        self.weights = []
        for i in range(1, len(layers) - 1):
            self.weights.append((2*np.random.random((layers[i - 1] + 1, layers[i]
                                + 1))-1)*0.25)
        self.weights.append((2*np.random.random((layers[i] + 1, layers[i + 1]))-1)*0.25)
    
    def fit(self, X, y, learning_rate=0.2, epochs=10000):
        # Add column of ones to X
        # This is to add the bias unit to the input layer
        ones = np.atleast_2d(np.ones(X.shape[0]))
        X = np.concatenate((ones.T, X), axis=1)
        X = np.atleast_2d(X)
        y = np.array(y)

        for k in range(epochs):
            i = np.random.randint(X.shape[0])
            a = [X[i]]

            for l in range(len(self.weights)):
                dot_value = np.dot(a[l], self.weights[l])
                activation = self.activation(dot_value)
                a.append(activation)
            error = y[i] - a[-1]
            deltas = [error * self.activation_deriv(a[-1])]

            for l in range(len(a) - 2, 0, -1): # we need to begin at the second to last layer
                deltas.append(deltas[-1].dot(self.weights[l].T)*self.activation_deriv(a[l]))
            deltas.reverse()
            for i in range(len(self.weights)):
                layer = np.atleast_2d(a[i])
                delta = np.atleast_2d(deltas[i])
                self.weights[i] += learning_rate * layer.T.dot(delta)
    
    def predict(self, x):
        x = np.array(x)
        temp = np.ones(x.shape[0]+1)
        temp[0:-1] = x
        a = temp
        for l in range(0, len(self.weights)):
            a = self.activation(np.dot(a, self.weights[l]))
        return a

In [43]:
from sklearn.base import TransformerMixin
class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') else X[c].median() for c in X],
            index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.fill)

feature_columns_to_use = ([x for x in train_df.columns])
feature_columns_to_use.pop(-1)

numeric_cols = ['59','60']
nonnumeric_columns = feature_columns_to_use[0:46]+feature_columns_to_use[48:]

# Join the features from train and test together before imputing missing values,
# in case their distribution is slightly different
big_X = train_df[feature_columns_to_use].append(test_df[feature_columns_to_use])
big_X_imputed = DataFrameImputer().fit_transform(big_X)
#print(big_X_imputed)

In [44]:
# To handle categorical features, we need to change
# them to columns of integer values.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for feature in nonnumeric_columns:
    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])
    
# Scale them to [0, 1] range using Label binarizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelBinarizer

# Prepare the inputs for the model
train_X = big_X_imputed[0:train_df.shape[0]].as_matrix()
test_X = big_X_imputed[test_df.shape[0]::].as_matrix()
train_y = train_df['label']
#print (train_X)

In [45]:
train_X -= train_X.min() # normalize the values to bring them into the range 0-1
train_X /= train_X.max()

X_train, X_test, y_train, y_test = train_test_split(train_X, train_y)
labels_train = LabelBinarizer().fit_transform(y_train)
labels_test = LabelBinarizer().fit_transform(y_test)

nn = NeuralNetwork([64,100,10],'tanh')

nn.fit(X_train,labels_train,epochs=30000)
predictions = []
for i in range(X_test.shape[0]):
    o = nn.predict(X_test[i] )
    predictions.append(np.argmax(o))
print (confusion_matrix(y_test,predictions))
print (classification_report(y_test,predictions))

ValueError: shapes (52,) and (65,101) not aligned: 52 (dim 0) != 65 (dim 0)

In [36]:
indx=[x for x in range(1,31710)]
submission = pd.DataFrame({'Id': indx,
                           'Prediction': predictions })
submission.to_csv("submission.csv", index=False, sep=",")

NameError: name 'predictions' is not defined