## Assignment 3

In [2]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import random
np.random.seed(1)

#### load the data

In [3]:
train_data = pd.read_csv('./ex3_train.csv')
test_data = pd.read_csv('./ex3_test.csv')

#### shuffle the dataframe

In [4]:
train_data_shuffle = shuffle(train_data, random_state = 1)
test_data_shuffle = shuffle(test_data, random_state = 1)

#### split the data into X and y

In [5]:
X_train = train_data.iloc[:, train_data.columns != 'y']
y_train = train_data["y"]
X_test = test_data.iloc[:, test_data.columns != 'y']
y_test = test_data["y"]

X_train_array = np.asarray(X_train)
y_train_array = np.asarray(y_train)
X_test_array = np.asarray(X_test)
y_test_array = np.asarray(y_test)

#### One-hot encoding 

In [6]:
def one_hot_encoding(y, numOfClasses):
    encoded = np.zeros((y.shape[0],numOfClasses))
    for i in range(y.shape[0]):
        encoded[i,y[i]] = 1.0
    return encoded

In [7]:
y_train_encoded = one_hot_encoding(y_train_array, 10)

#### sigmoid function

In [8]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [9]:
input_layer_size = 400
hidden_layer_size = 25
output_layer_size = 10

#### Initialize Parameters

Initialize weights

In [10]:
np.random.seed(1)
def initialize_weights(input_layer_size,hidden_layer_size,output_layer_size):
    w1 = np.random.uniform(-1.0, 1.0, size=hidden_layer_size*input_layer_size).reshape(hidden_layer_size,\
                                                                                       input_layer_size)
    w2 = np.random.uniform(-1.0, 1.0, size=output_layer_size*hidden_layer_size).reshape(output_layer_size,\
                                                                                       hidden_layer_size)
    return (w1, w2)

In [11]:
w1, w2 = initialize_weights(input_layer_size, hidden_layer_size, output_layer_size)

Initializing bias weights to 0

In [12]:
bias1 = np.zeros((hidden_layer_size, 1))
bias2 = np.zeros((output_layer_size, 1))

In [13]:
def forward_propagation(X, w1, w2, b1, b2):
    a1 = X_train_array
    z1 = (np.dot(a1,w1.T).T + b1).T
    a2 = sigmoid(z1)
    z2 = (np.dot(a2,w2.T).T + b2).T
    a3 = sigmoid(z2)
    return a1,a2,a3,z1,z2

In [14]:
a1, a2, a3, z1, z2 = forward_propagation(X_train_array, w1, w2, bias1, bias2)

#### cost function

In [15]:
def compute_cost(prediction, actual):
    return (1/actual.shape[0]) * -np.sum(np.multiply(actual, np.log(prediction)) + np.multiply((1-actual), \
                                                                                               np.log(1-prediction)))

In [16]:
compute_cost(a3, y_train_encoded)

13.245271623870703

#### Backward Propagation

In [17]:
def backward_propagation(a2,a3,X,y,w1,w2,b1,b2):
    m = y.shape[0]
    dw2 = (1/m) * np.dot((a3-y).T, a2)
    db2 = np.sum((a3-y), axis=0, keepdims=True).T
    dw1 = (1/m) * np.dot(np.multiply(np.multiply(np.dot((a3-y),w2),a2), (1-a2)).T, X)
    db1 = (1/m) * np.sum(np.multiply(np.multiply(np.dot((a3-y),w2),a2), (1-a2)).T, axis=1, keepdims=True)
    return dw1, dw2, db1, db2

In [18]:
dw1, dw2, db1, db2 = backward_propagation(a2, a3, X_train_array, y_train_encoded, w1, w2, bias1, bias2)

In [50]:
def gradient_descent(X, y, w1, w2, b1, b2, alpha,ep):
    cost_history = []
    epochs = 0
    a1, a2, a3, z1, z2 = forward_propagation(X, w1, w2, b1, b2)
    newCost = compute_cost(a3, y)
    oldCost = float('inf')
    cost_history.append(newCost)
    while epochs < ep:
#     while newCost < oldCost and abs(newCost - oldCost) > 1e-5 or epochs < ep:
        dw1, dw2, db1, db2 = backward_propagation(a2, a3, X, y, w1, w2, b1, b2)
    
        # update weights
        w1 = w1 - (alpha * dw1)
        w2 = w2 - (alpha * dw2)
        b1 = b1 - (alpha * db1)
        b2 = b2 - (alpha * db2)
        
        oldCost = newCost
        a1, a2, a3, z1, z2 = forward_propagation(X, w1, w2, b1, b2)
        newCost = compute_cost(a3, y)
        cost_history.append(newCost)
        epochs = epochs + 1
        
    min_cost_iter = cost_history.index(min(cost_history))
    print ("Minimum cost at iteration: ", min_cost_iter)
    print ("Minimum cost achieved: ", min(cost_history))
    return cost_history, w1, w2, b1, b2     

In [62]:
def softmax(x):
    mat = np.exp(x)
    return mat / mat.sum(0)

def predict(w1, w2, b1, b2, X):
    m = X.shape[0]
    num_labels = w2.shape[0]
    p = np.zeros((m,1))
    h1 = sigmoid((np.dot(X,w1.T).T + b1).T)
    h2 = sigmoid((np.dot(h1,w2.T).T + b2).T)
    p = softmax(h2)
#     print (p)
    actual_pred = []
    for item in p:
        actual_pred.append(np.argmax(item))
    return actual_pred

def accuracy(actual, predicted):
    count = 0
    for x,y in zip(actual, predicted):
        if x == y:
            count = count + 1
    return count/actual.shape[0]

In [52]:
cost_history01, wl_result01, w2_result01, b1_res01, b2_res01 = gradient_descent(X_train_array,y_train_encoded,w1,w2,\
                                                                                bias1,bias2,0.01,70000)

Minimum cost at iteration:  70000
Minimum cost achieved:  0.408195472028


In [63]:
predictions01 = predict(wl_result01, w2_result01, b1_res01, b2_res01, X_test_array)
accuracy01 = accuracy(y_test_array, predictions01)
print (accuracy01)

0.9106666666666666


In [54]:
cost_history001, wl_result001, w2_result001, b1_res001, b2_res001 = gradient_descent(X_train_array, y_train_encoded, w1, w2, bias1,\
                                                                      bias2, 0.001, 300000)

Minimum cost at iteration:  300000
Minimum cost achieved:  0.660254560303


In [56]:
cost_history1, wl_result1, w2_result1, b1_res1, b2_res1 = gradient_descent(X_train_array,y_train_encoded,w1,w2,\
                                                                                bias1,bias2,0.1,1000)

Minimum cost at iteration:  0
Minimum cost achieved:  13.2452716239


#### Softmax Function

#### Prediction Function

In [40]:
predictions = predict(wl_result01, w2_result01, b1_res01, b2_res01, X_test_array)

In [317]:
predictions1 = predict(wl_result1, w2_result1, b1_res1, b2_res1, X_test_array)

In [332]:
predictions2 = predict(wl_result2, w2_result2, b1_res2, b2_res2, X_test_array)

In [36]:
def accuracy(actual, predicted):
    count = 0
    for x,y in zip(actual, predicted):
        if x == y:
            count = count + 1
    return count/actual.shape[0]

In [37]:
accuracy(y_test_array, predictions)

0.9106666666666666