**Dataset**<hr>
Fashion-MNIST is a dataset of Zalando's article images, consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes.

Dataset information:
* Each training and test example is assigned to one of the following labels:
    - 0 t-shirt/top
    - 1 trouser
    - 2 pullover
    - 3 dress
    - 4 coat
    - 5 sandal
    - 6 shirt
    - 7 sneaker
    - 8 bag
    - 9 ankle boot
* Each row is a separate image. Column 1 is the class label. Remaining columns are pixel numbers (784 total). Each value is the darkness of the pixel (1 to 255).

**Import libraries**

In [1]:
# Numpy for numerical computing
import numpy as np

# Pandas for Dataframes
import pandas as pd
pd.set_option('display.max_columns',100)

# Matplolib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline

# Seaborn for easier visualization
import seaborn as sns

**Functions**

In [41]:
def SigmoidFunction(z):
    # Apply sigmoid activation function
    return 1/(1 + np.exp(-z))

def SigmoidPrime(z):
    # Derivative of sigmoid function
    return SigmoidFunction(z)*(1-SigmoidFunction(z))

def TanhFunction(x):
    return 2/(1 + np.exp(-2*x)) - 1

def ReLUFunction(x):
    return max(0, x)

def LeakyReLUFunction(x):
    if x > 0:
        LeakyReLU = x
    else:
        LeakyReLU = 0.01*x
    return LeakyReLU

def SoftmaxClassification(x):
    return np.exp(x)/np.exp(x).sum()

def CostFunction(X, y, Theta1, Theta2, beta):
    n = X.shape[0]
    m = X.shape[1]

# Feedfoward: propagate inputs through network
    a1 = np.c_[np.ones(n), X]                      # Include the bias unit
    z2 = np.matmul(a1, Theta1.T)
    a2 = np.c_[np.ones(n), SigmoidFunction(z2)]    # Include the bias unit and activate the layer 2
    z3 = np.matmul(a2, Theta2.T)
    a3 = SoftmaxClassification(z3)                       # Activate the layer 3
    
    # Cost function without regularization
    J = -1/m*np.trace(np.matmul(y.T, np.log(a3)) + np.matmul((1 - y.T), np.log(1 - a3)))

    # Cost function with regularization
    J = J + beta/(2*m)*(np.square(Theta1[:,1:]).sum() + np.square(Theta2[:,1:]).sum())
    
# Backpropagation
    d3 = np.multiply(-(a3 - y), SigmoidPrime(z3))
    d2 = np.multiply(np.matmul(d3, Theta2[:,1:]), SigmoidPrime(z2))

    D1 = np.matmul(d2.T, a1)
    D2 = np.matmul(d3.T, a2)
    
    Theta1_grad = D1/m + beta/m*Theta1
    Theta2_grad = D2/m + beta/m*Theta2
    return J, Theta1_grad, Theta2_grad

def UpdateGradient(Theta1, Theta2, Theta1_grad, Theta2_grad, alpha):
    new_Theta1 = Theta1 - alpha*Theta1_grad
    new_Theta2 = Theta2 - alpha*Theta2_grad
    return new_Theta1, new_Theta2

**Load dataset**

In [3]:
df = pd.read_csv('fashion-mnist_train.csv')

In [4]:
# Dataframe dimensions
df.shape

(60000, 785)

In [5]:
# First 5 rows of data
df.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,pixel11,pixel12,pixel13,pixel14,pixel15,pixel16,pixel17,pixel18,pixel19,pixel20,pixel21,pixel22,pixel23,pixel24,pixel25,pixel26,pixel27,pixel28,pixel29,pixel30,pixel31,pixel32,pixel33,pixel34,pixel35,pixel36,pixel37,pixel38,pixel39,pixel40,pixel41,pixel42,pixel43,pixel44,pixel45,pixel46,pixel47,pixel48,pixel49,...,pixel735,pixel736,pixel737,pixel738,pixel739,pixel740,pixel741,pixel742,pixel743,pixel744,pixel745,pixel746,pixel747,pixel748,pixel749,pixel750,pixel751,pixel752,pixel753,pixel754,pixel755,pixel756,pixel757,pixel758,pixel759,pixel760,pixel761,pixel762,pixel763,pixel764,pixel765,pixel766,pixel767,pixel768,pixel769,pixel770,pixel771,pixel772,pixel773,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,0,0,105,92,101,107,100,132,0,0,2,4,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,150,227,211,171,191,236,228,172,0,0,0,...,253,0,112,255,217,211,227,245,195,194,211,220,214,74,0,255,222,128,0,0,0,0,0,0,0,0,0,44,12,0,0,40,134,162,191,214,163,146,165,79,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,0,114,183,112,55,23,72,102,165,160,28,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,24,188,163,93,136,153,168,252,174,136,166,130,123,131,...,0,5,194,193,204,104,116,241,217,196,171,249,207,197,202,45,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,22,21,25,69,52,45,74,39,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,0,0,0,46,0,21,68,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,25,187,189,0,119,173,239,112,0,189,125,0,0,...,3,0,0,214,232,229,249,245,248,252,230,237,229,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,68,116,112,136,147,144,121,102,63,0,0,0,0,0,0,0,0,0,0


In [6]:
# Create separate object for each target variable
y = df.label
y = y.astype(str)
y = pd.get_dummies(y, columns = ['label']).astype(int)

# Create separate object for input features
X = df.drop('label', axis = 1)
X.shape

(60000, 784)

In [36]:
# Random initialization
units = 28
EPSILON = 1
np.random.seed(123)
Theta1_init = np.random.rand(units, X.shape[1] + 1)*(2*EPSILON) - EPSILON
Theta2_init = np.random.rand(y.shape[1], units + 1)*(2*EPSILON) - EPSILON

# Print Theta1 and Theta2 shapes
print(Theta1_init.shape)
print(Theta2_init.shape)

(28, 785)
(10, 29)


In [30]:
# Dividing the training set into 6 groups
X_dic = {'set_1': X[:10000],
         'set_2': X[10000:20000],
         'set_3': X[20000:30000],
         'set_4': X[30000:40000],
         'set_5': X[40000:50000],
         'set_6': X[50000:]}
y_dic = {'set_1': y[0:10000],
         'set_2': y[10000:20000],
         'set_3': y[20000:30000],
         'set_4': y[30000:40000],
         'set_5': y[40000:50000],
         'set_6': y[50000:]}

Feedfoward the neural network<br>
We first run a "forward pass" to compute all the activations throughout the network, including the output value of the hypothesis h(x). <br>
Then, each node j in layer l, we would like to compute and "error term" deltaj that measures how much that node was "responsible" for any errors in the output.

In [42]:
beta = 0
alpha = 0

Theta1 = Theta1_init
Theta2 = Theta2_init

for key in  X_dic.keys():
    J, Theta1_grad, Theta2_grad = CostFunction(X_dic[key], y_dic[key], Theta1, Theta2, beta)
    Theta1, Theta2 = UpdateGradient(Theta1, Theta2, Theta1_grad, Theta2_grad, alpha)
    print(key, J)

set_1 160.51180837573182
set_2 160.18122380342663
set_3 160.01673142464978
set_4 160.6920440179072
set_5 160.4985426757443
set_6 160.4099110667912
