In [1]:
import numpy as np
from numpy.matlib import randn
from numpy.random import choice

In [46]:
# ALL CLASSES DEFINED HERE

class Sigmoid:
    
    def forward(self, x):
        self.current_sigma = 1/(1+np.exp(-x))
        return self.current_sigma
    
    def backward(self, gradOutput):
        _g_v = np.multiply(np.multiply(self.current_sigma, 1-self.current_sigma), gradOutput)
        return _g_v
    
    def reset(self):
        self.current_sigma = None
    
    
class Linear:
    
    def __init__(self, inputDim, outputDim, _var = 0.01):
        self.weight = randn(inputDim, outputDim) * _var
        self.bias = randn((1, outputDim)) * _var
        
        self.gradWeight = np.zeros_like(self.weight)
        self.gradBias = np.zeros_like(self.bias)
        
    def forward(self, x):
        return np.dot(x, self.weight) + self.bias
    
    def backward(self, x, gradOutput):
        self.gradWeight = np.dot(x.T, gradOutput)
        self.gradBias = np.dot(np.ones((1,gradOutput.shape[0])),gradOutput)
        return np.dot(gradOutput, self.weight.T)

    def get_parameters(self):
        return (self.weight, self.bias), (self.gradWeight, self.gradBias)
    
    
class MeanSquareError:
            
    def forward(self, predictions, labels):
        self.current_preds = predictions
        self.current_labels = labels
        return np.sum(np.square(predictions - labels))
    
    def backward(self, predictions=0, labels=0):
        if predictions==0:
            predictions = self.current_preds
            labels = self.current_labels
            
        _numberOfSamples = len(labels)
        _f_u = 2*_numberOfSamples*(predictions-labels)
        return _f_u        

class SimpleModel:
    
    def __init__(self, customFunction):
        print "Creating PyDeepTensorNet Model"
        self.layerStack = {}
        self.customFunction = customFunction
        
    def add(self, layerObject, layerName):
        self.layerStack[layerName] = layerObject
        print layerName,'Added'
        
    def run(self, inputX=None, outputY=None, 
            batchSize=1, epochs=30,
            learningRate = 0.001,
            verbose=True, printStep=10, shuffle=False):
        numberOfSamples = inputX.shape[0]
        if verbose:
            print 'Running{0} Stochastic Gradient Descent on {1} samples'.format([' Mini-batch', ' '][batchSize==1], 
                                                                                  numberOfSamples)
        for epoch in range(epochs):
            loss = 0

            for iteration in range(numberOfSamples/batchSize):
                if shuffle:
                    batchRandomIndices = choice(range(numberOfSamples), batchSize)
                    batchInput = inputX[batchRandomIndices]
                    batchOutput = outputY[batchRandomIndices]
                else:
                    batchInput = inputX[iteration*(batchSize):(1+iteration)*batchSize]
                    batchOutput = outputY[iteration*(batchSize):(1+iteration)*batchSize]
                self.layerStack, loss = self.customFunction(self.layerStack, batchInput, batchOutput, learningRate, loss)
            if epoch%printStep==0 and verbose:
                print "Epoch ",epoch,
                print "loss: ",loss/(batchSize*(iteration+1))

In [3]:
x1 = np.array([[1, 2, 2, 3]])
a1 = Sigmoid().forward(Linear(4,3).forward(x1))

x2 = np.array([[4, 5, 2, 1]])
a2 = Sigmoid().forward(Linear(4,3).forward(x2))

x = np.concatenate((x1, x2), axis=0)
a = Sigmoid().forward(Linear(4,3).forward(x))

print 'a1:',a1
print 'a2:',a2
print 'a:',a

a1: [[ 0.4934456   0.49069751  0.49622148]]
a2: [[ 0.50694103  0.49441347  0.47743266]]
a: [[ 0.50801006  0.50408115  0.50736725]
 [ 0.5109484   0.50906305  0.51214163]]


In [4]:
y_pred = np.array([[0.23, 0.25, 0.33], [0.23, 0.25, 0.33], [0.23, 0.25, 0.33], [0.23, 0.25, 0.33]])
y_true = np.array([[0.25, 0.25, 0.25], [0.33, 0.33, 0.33], [0.77, 0.77, 0.77], [0.80, 0.80, 0.80]])
print MeanSquareError().forward(y_pred, y_true)

1.6271


In [5]:
x1 = np.array([[1, 2, 2, 3]])
y1 = np.array([[0.25, 0.25, 0.25]])

linear = Linear(4,3) # Weights initialization does matter.
sigmoid = Sigmoid()
loss = MeanSquareError()

#hacky-abstraction: loss_val = loss.forward(sigmoid.forward(linear.forward(x1)), y1)

a0 = linear.forward(x1)
a1 = sigmoid.forward(a0)
loss_val = loss.forward(a1, y1)

# Backprop
#hacky-abstraction: dx1 = linear.backward(x1, sigmoid.backward(loss.backward()))

da1 = loss.backward()
da0 = sigmoid.backward(da1)
dx1 = linear.backward(x1, da0)

In [6]:
x = np.array([[2.34, 3.8, 34.44, 5.33]])
y = np.array([[3.2, 4.2, 5.3]])

linear = Linear(4, 3)
sigmoid = Sigmoid()
mseLoss = MeanSquareError()

#forward
_ = mseLoss.forward(sigmoid.forward(linear.forward(x)), y)

# backprop
_ = linear.backward(x, sigmoid.backward(mseLoss.backward()))

gradWeight = linear.gradWeight
gradBias = linear.gradBias

approxGradWeight = np.zeros_like(linear.weight)
approxGradBias = np.zeros_like(linear.bias)

EPSILON = 1e-4
updatedLinear = Linear(4, 3)

for i in range(linear.weight.shape[0]):
    for j in range(linear.weight.shape[1]):
        fw = mseLoss.forward(sigmoid.forward(linear.forward(x)), y)
        updatedWeight = np.copy(linear.weight)
        updatedWeight[i, j] = updatedWeight[i, j] + EPSILON
        updatedLinear.bias = linear.bias
        updatedLinear.weight = updatedWeight
        fw_epsilon = mseLoss.forward(sigmoid.forward(updatedLinear.forward(x)), y) # Loss function
        approxGradWeight[i, j] = (fw_epsilon - fw) / EPSILON

# These two outputs should be similar up to some precision.
print 'gradWeight: \n' , gradWeight
print 'approxGradWeight: \n' , approxGradWeight

gradWeight: 
[[ -2.92769612  -3.81950586  -4.91710397]
 [ -4.75437831  -6.20261636  -7.98504063]
 [-43.08968132 -56.21529139 -72.36968401]
 [ -6.66864116  -8.69998557 -11.20007014]]
approxGradWeight: 
[[ -2.92779454  -3.81934894  -4.9168983 ]
 [ -4.75463784  -6.20220251  -7.98449823]
 [-43.11096996 -56.181266   -72.3250914 ]
 [ -6.66915171  -8.69917134 -11.199003  ]]


In [20]:
numberOfSamples = 1000

x = np.random.uniform(0, 6, (numberOfSamples, 4))

# y1 = sin(x0 + x1 + x2 + x3)
y1 = np.sin(x.sum(axis = 1))

# y2 = sin(6*x1)
y2 = np.sin(6*x[:, 1])

# y3 = sin(x1 + x3)
y3 = np.sin(x[:, 1] + x[:, 3])

y = np.array([y1, y2, y3]).T

print x.shape
print y.shape

(1000, 4)
(1000, 3)


In [5]:
def trainModel(layerStack, batchInput, batchOutput, learningRate, loss):
    
    loss += layerStack['mse_loss'].forward(layerStack['sigmoid'].forward(layerStack['linear'].forward(batchInput)), batchOutput)
    _ = layerStack['linear'].backward(batchInput, layerStack['sigmoid'].backward(layerStack['mse_loss'].backward()))

    layerStack['linear'].weight -= layerStack['linear'].gradWeight*learningRate
    layerStack['linear'].bias -= layerStack['linear'].gradBias*learningRate
    
    return layerStack, loss

In [31]:
model = SimpleModel(trainModel)
model.add(Linear(4,3), 'linear')
model.add(Sigmoid(), 'sigmoid')
model.add(MeanSquareError(), 'mse_loss')

model.run(x, y, epochs=400, batchSize=1, learningRate=0.001, shuffle=False)

Creating PyDeepTensorNet Model
linear Added
sigmoid Added
mse_loss Added
Running  Stochastic Gradient Descent on 1000 samples
Epoch  0 loss:  1.61197321373
Epoch  10 loss:  1.47270157282
Epoch  20 loss:  1.46629027538
Epoch  30 loss:  1.45896955882
Epoch  40 loss:  1.44927455945
Epoch  50 loss:  1.44184369877
Epoch  60 loss:  1.43686463925
Epoch  70 loss:  1.43342912556
Epoch  80 loss:  1.43091725141
Epoch  90 loss:  1.42897907133
Epoch  100 loss:  1.4274151241
Epoch  110 loss:  1.42610629622
Epoch  120 loss:  1.42497785987
Epoch  130 loss:  1.42398083866
Epoch  140 loss:  1.42308195808
Epoch  150 loss:  1.42225797761
Epoch  160 loss:  1.4214923412
Epoch  170 loss:  1.42077308711
Epoch  180 loss:  1.42009145389
Epoch  190 loss:  1.41944089137
Epoch  200 loss:  1.41881634758
Epoch  210 loss:  1.41821378539
Epoch  220 loss:  1.41762989658
Epoch  230 loss:  1.41706195805
Epoch  240 loss:  1.41650776042
Epoch  250 loss:  1.41596555446
Epoch  260 loss:  1.4154339919
Epoch  270 loss:  1.4149

## Exercises

# 1. Two-layer model

In [41]:
def trainNewModel(layerStack, batchInput, batchOutput, learningRate, loss):
    linear_1_out = layerStack['linear_1'].forward(batchInput)
    sigmoid_1_out = layerStack['sigmoid_1'].forward(linear_1_out)
    linear_2_out = layerStack['linear_2'].forward(sigmoid_1_out)
    sigmoid_2_out = layerStack['sigmoid_2'].forward(linear_2_out)
    loss_out = layerStack['mse_loss'].forward(sigmoid_2_out, batchOutput)
    
    loss += loss_out
    _ = layerStack['linear_1'].backward(batchInput, layerStack['sigmoid_1'].backward(layerStack['linear_2'].backward(sigmoid_1_out, layerStack['sigmoid_2'].backward(layerStack['mse_loss'].backward()))))
    layerStack['linear_2'].weight -= layerStack['linear_2'].gradWeight*learningRate
    layerStack['linear_2'].bias -= layerStack['linear_2'].gradBias*learningRate
    layerStack['linear_1'].weight -= layerStack['linear_1'].gradWeight*learningRate
    layerStack['linear_1'].bias -= layerStack['linear_1'].gradBias*learningRate
    
    return layerStack, loss

In [5]:
newModel = SimpleModel(trainNewModel)
newModel.add(Linear(4,5),'linear_1')
newModel.add(Sigmoid(),'sigmoid_1')
newModel.add(Linear(5,3), 'linear_2')
newModel.add(Sigmoid(), 'sigmoid_2')
newModel.add(MeanSquareError(), 'mse_loss')

newModel.run(inputX=x, outputY=y, epochs=400, batchSize=1, learningRate=0.01) ## LR of 0.01 to make it converge faster

Creating PyDeepTensorNet Model
linear_1 Added
sigmoid_1 Added
linear_2 Added
sigmoid_2 Added
mse_loss Added
Running  Stochastic Gradient Descent on 1000 samples
Epoch  0 loss:  1.66351357342
Epoch  10 loss:  1.5310219077
Epoch  20 loss:  1.53009423097
Epoch  30 loss:  1.52926828795
Epoch  40 loss:  1.5275236162
Epoch  50 loss:  1.52493683009
Epoch  60 loss:  1.52192270214
Epoch  70 loss:  1.51856136203
Epoch  80 loss:  1.51500364775
Epoch  90 loss:  1.51099631982
Epoch  100 loss:  1.50622008412
Epoch  110 loss:  1.50114796352
Epoch  120 loss:  1.49671717312
Epoch  130 loss:  1.49317121684
Epoch  140 loss:  1.49015863506
Epoch  150 loss:  1.48716978463
Epoch  160 loss:  1.48349514386
Epoch  170 loss:  1.47847453381
Epoch  180 loss:  1.47353366588
Epoch  190 loss:  1.46947229771
Epoch  200 loss:  1.46569469638
Epoch  210 loss:  1.46193635497
Epoch  220 loss:  1.45802448461
Epoch  230 loss:  1.4538261433
Epoch  240 loss:  1.44983177156
Epoch  250 loss:  1.44634464506
Epoch  260 loss:  1.4

# 2. Weight Check

<h3>For x,y given in section 2</h3>

In [62]:
x = np.array([[2.34, 3.8, 34.44, 5.33]])
y = np.array([[3.2, 4.2, 5.3]])

linear_1 = Linear(4, 5)
sigmoid_1 = Sigmoid()
linear_2 = Linear(5, 3)
sigmoid_2 = Sigmoid()
mseLoss = MeanSquareError()

#forward
linear_1_out = linear_1.forward(x)
sigmoid_1_out = sigmoid_1.forward(linear_1_out)
linear_2_out = linear_2.forward(sigmoid_1_out)
sigmoid_2_out = sigmoid_2.forward(linear_2_out)
loss_out = mseLoss.forward(sigmoid_2_out, y)

# backprop

_ = linear_1.backward(x, sigmoid_1.backward(linear_2.backward(sigmoid_1_out, sigmoid_2.backward(mseLoss.backward()))))

gradWeight = linear_2.gradWeight
gradBias = linear_2.gradBias

approxGradWeight = np.zeros_like(linear_2.weight)
approxGradBias = np.zeros_like(linear_2.bias)

EPSILON = 1e-4

updatedWeight = Linear(4, 5)
updatedLinear = Linear(5, 3)

for i in range(linear_2.weight.shape[0]):
    for j in range(linear_2.weight.shape[1]):
        fw = mseLoss.forward(sigmoid_2.forward(linear_2.forward(sigmoid_1_out)), y)
        updatedWeight = np.copy(linear_2.weight)
        updatedWeight[i, j] = updatedWeight[i, j] + EPSILON
        updatedLinear.bias = linear_2.bias
        updatedLinear.weight = updatedWeight
        fw_epsilon = mseLoss.forward(sigmoid_2.forward(updatedLinear.forward(sigmoid_1_out)), y) # Loss function
        approxGradWeight[i, j] = (fw_epsilon - fw) / EPSILON

print 'Weight check for W2:'
print 'gradWeight: \n' , gradWeight
print 'approxGradWeight: \n' , approxGradWeight


gradWeight = linear_1.gradWeight
gradBias = linear_1.gradBias

approxGradWeight = np.zeros_like(linear_1.weight)
approxGradBias = np.zeros_like(linear_1.bias)

updatedLinear = Linear(4, 5)

for i in range(linear_1.weight.shape[0]):
    for j in range(linear_1.weight.shape[1]):
        fw = mseLoss.forward(sigmoid_2.forward(linear_2.forward(sigmoid_1.forward(linear_1.forward(x)))),y)
        updatedWeight = np.copy(linear_1.weight)
        updatedWeight[i, j] = updatedWeight[i, j] + EPSILON
        updatedLinear.weight = updatedWeight
        updatedLinear.bias = linear_1.bias
        fw_epsilon = mseLoss.forward(sigmoid_2.forward(linear_2.forward(sigmoid_1.forward(updatedLinear.forward(x)))),y)
        approxGradWeight[i, j] = (fw_epsilon - fw) / EPSILON


print 'Weight check for W1:'
print 'gradWeight: \n' , gradWeight
print 'approxGradWeight: \n' , approxGradWeight

Weight check for W2:
gradWeight: 
[[-0.82372958 -1.12723936 -1.46004099]
 [-0.68844974 -0.94211457 -1.2202607 ]
 [-0.66832853 -0.91457954 -1.18459634]
 [-0.66740003 -0.91330893 -1.1829506 ]
 [-0.6992864  -0.95694408 -1.23946842]]
approxGradWeight: 
[[-0.8237277  -1.12723747 -1.46003865]
 [-0.68844843 -0.94211325 -1.22025906]
 [-0.66832729 -0.9145783  -1.1845948 ]
 [-0.6673988  -0.91330769 -1.18294906]
 [-0.69928505 -0.95694272 -1.23946673]]
Weight check for W1:
gradWeight: 
[[ 0.01836199 -0.01218004 -0.04033382 -0.01765361  0.03599819]
 [ 0.02981861 -0.01977955 -0.06549936 -0.02866825  0.05845859]
 [ 0.27025078 -0.17926521 -0.59363103 -0.25982488  0.52981946]
 [ 0.04182453 -0.02774342 -0.09187147 -0.04021099  0.08199587]]
approxGradWeight: 
[[ 0.01836152 -0.01218002 -0.04033387 -0.01765363  0.03599805]
 [ 0.02981739 -0.01977949 -0.06549951 -0.02866832  0.05845823]
 [ 0.27014992 -0.17925976 -0.59364326 -0.25983068  0.52978902]
 [ 0.04182211 -0.0277433  -0.09187177 -0.04021114  0.0819951

# 3. Other Activations


In [24]:
#ReLU Activation
class ReLU:
    
    def forward(self, x):
        self.current_x = x
        return np.maximum(x,0)

    def backward(self, gradOutput):
        return np.multiply(gradOutput, self.current_x > 0)

# TanH Activation
class TanH:
    
    def forward(self, x):
#        self.current_tanh = np.tanh(x)
        self.current_tanh = np.divide(np.exp(x) - np.exp(-x), np.exp(x) + np.exp(-x))
        return self.current_tanh
    
    def backward(self, gradOutput):
        return np.multiply(gradOutput, (1.0 - np.power(self.current_tanh, 2)))

<h3>Training with ReLU </h3>

In [23]:
model = SimpleModel(trainModel)
model.add(Linear(4,3), 'linear')
model.add(ReLU(), 'sigmoid') ## Implementing ReLU,, but keeping the name 'sigmoid'
model.add(MeanSquareError(), 'mse_loss')

model.run(x, y, epochs=400, batchSize=1, learningRate=0.001, shuffle=False)

Creating PyDeepTensorNet Model
linear Added
sigmoid Added
mse_loss Added
Running  Stochastic Gradient Descent on 1000 samples
Epoch  0 loss:  1.49054789639
Epoch  10 loss:  1.4880998739
Epoch  20 loss:  1.48713108316
Epoch  30 loss:  1.48700853634
Epoch  40 loss:  1.48698571405
Epoch  50 loss:  1.48675250661
Epoch  60 loss:  1.48660799887
Epoch  70 loss:  1.4865043128
Epoch  80 loss:  1.48649211958
Epoch  90 loss:  1.48645122354
Epoch  100 loss:  1.48643991898
Epoch  110 loss:  1.4864191085
Epoch  120 loss:  1.4864049491
Epoch  130 loss:  1.48639495823
Epoch  140 loss:  1.48638666542
Epoch  150 loss:  1.48639453329
Epoch  160 loss:  1.48631694196
Epoch  170 loss:  1.48631583564
Epoch  180 loss:  1.48631400556
Epoch  190 loss:  1.48637253347
Epoch  200 loss:  1.48637827597
Epoch  210 loss:  1.48629963161
Epoch  220 loss:  1.48637122694
Epoch  230 loss:  1.48632844441
Epoch  240 loss:  1.48636992592
Epoch  250 loss:  1.48633369436
Epoch  260 loss:  1.48636963005
Epoch  270 loss:  1.48635

<h3> Training with TanH</h3>

In [35]:
model = SimpleModel(trainModel)
model.add(Linear(4,3), 'linear')
model.add(TanH(), 'sigmoid') ## Implementing TanH,, but keeping the name 'sigmoid'
model.add(MeanSquareError(), 'mse_loss')

model.run(x, y, epochs=400, batchSize=1, learningRate=0.001, shuffle=False)

Creating PyDeepTensorNet Model
linear Added
sigmoid Added
mse_loss Added
Running  Stochastic Gradient Descent on 1000 samples
Epoch  0 loss:  1.55934265573
Epoch  10 loss:  1.55835561261
Epoch  20 loss:  1.55826034117
Epoch  30 loss:  1.55824631474
Epoch  40 loss:  1.55824336879
Epoch  50 loss:  1.55824261156
Epoch  60 loss:  1.55824240139
Epoch  70 loss:  1.55824234161
Epoch  80 loss:  1.55824232447
Epoch  90 loss:  1.55824231955
Epoch  100 loss:  1.55824231813
Epoch  110 loss:  1.55824231772
Epoch  120 loss:  1.55824231761
Epoch  130 loss:  1.55824231757
Epoch  140 loss:  1.55824231756
Epoch  150 loss:  1.55824231756
Epoch  160 loss:  1.55824231756
Epoch  170 loss:  1.55824231756
Epoch  180 loss:  1.55824231756
Epoch  190 loss:  1.55824231756
Epoch  200 loss:  1.55824231756
Epoch  210 loss:  1.55824231756
Epoch  220 loss:  1.55824231756
Epoch  230 loss:  1.55824231756
Epoch  240 loss:  1.55824231756
Epoch  250 loss:  1.55824231756
Epoch  260 loss:  1.55824231756
Epoch  270 loss:  1.5

<b>Comments:</b> Stuck at a minima after the 140<sup>th</sup> epoch

# 4. Other Loss functions

In [55]:
class AbsoluteError:
    
    def forward(self, predictions, labels):
        self.current_preds = predictions
        self.current_labels = labels
        return np.mean(np.abs(predictions - labels))

    def backward(self):
        # E = abs(a-y)
        # dE/da = {+1, if a-y> 0, -1 if a-y < 0, nan at 0}
        diff = np.array(self.current_preds-self.current_labels)
        diff[diff>0]=1
        diff[diff<0]=-1
        return diff
    
class BinaryCrossEntropy:

    def forward(self, predictions, labels):
        self.current_preds = predictions
        self.current_labels = labels
        self.num_of_samples = labels.shape[0]
        return -(labels*np.log(predictions) + (1-labels)*np.log(1-predictions)).sum()/self.num_of_samples
    
    def backward(self):
        # E = -[y.log(a) + (1-y).log(1-a)]
        # dE/da = -[y/a - (1-y)/(1-a)]
        #       = (a-y)/a(1-a)
        _f_u = self.num_of_samples * np.divide(self.predictions-self.labels, self.predictions*(1-self.predictions))
        return _f_u

<h3>Training using Abs Loss function </h3>

In [52]:
newModel = SimpleModel(trainNewModel)
newModel.add(Linear(4,5),'linear_1')
newModel.add(Sigmoid(),'sigmoid_1')
newModel.add(Linear(5,3), 'linear_2')
newModel.add(Sigmoid(), 'sigmoid_2')
newModel.add(AbsoluteError(), 'mse_loss') # Using Abs error, naming it mse_loss

newModel.run(inputX=x, outputY=y, epochs=400, batchSize=10, learningRate=0.001)

Creating PyDeepTensorNet Model
linear_1 Added
sigmoid_1 Added
linear_2 Added
sigmoid_2 Added
mse_loss Added
Running Mini-batch Stochastic Gradient Descent on 1000 samples
Epoch  0 loss:  0.0708663729028
Epoch  10 loss:  0.0645492414372
Epoch  20 loss:  0.063837252905
Epoch  30 loss:  0.0636777959522
Epoch  40 loss:  0.0636156816463
Epoch  50 loss:  0.0635799829052
Epoch  60 loss:  0.0635599868023
Epoch  70 loss:  0.0635458183301
Epoch  80 loss:  0.0635352080402
Epoch  90 loss:  0.0635270564447
Epoch  100 loss:  0.0635207492342
Epoch  110 loss:  0.0635159228462
Epoch  120 loss:  0.0635117959841
Epoch  130 loss:  0.0635082497265
Epoch  140 loss:  0.0635052861077
Epoch  150 loss:  0.063502686206
Epoch  160 loss:  0.0635004237996
Epoch  170 loss:  0.0634984752512
Epoch  180 loss:  0.0634967354473
Epoch  190 loss:  0.0634952688075
Epoch  200 loss:  0.0634939880562
Epoch  210 loss:  0.0634928219829
Epoch  220 loss:  0.0634917286177
Epoch  230 loss:  0.0634906993648
Epoch  240 loss:  0.063489

# Optional:  Implement Batch Gradient Descent

### 2-Layer model, trained in batches of size 10 

In [13]:
newModel.run(inputX=x, outputY=y, epochs=400, batchSize=10, learningRate=0.001)

Creating PyDeepTensorNet Model
linear_1 Added
sigmoid_1 Added
linear_2 Added
sigmoid_2 Added
mse_loss Added
Running Mini-batch Stochastic Gradient Descent on 1000 samples
Epoch  0 loss:  1.63068731843
Epoch  10 loss:  1.48897432407
Epoch  20 loss:  1.48825396977
Epoch  30 loss:  1.48800106844
Epoch  40 loss:  1.48785425671
Epoch  50 loss:  1.48774060207
Epoch  60 loss:  1.48763066566
Epoch  70 loss:  1.48750077955
Epoch  80 loss:  1.48731269671
Epoch  90 loss:  1.48696574464
Epoch  100 loss:  1.48609689547
Epoch  110 loss:  1.48380015017
Epoch  120 loss:  1.47970468194
Epoch  130 loss:  1.47365082915
Epoch  140 loss:  1.46662785471
Epoch  150 loss:  1.46046235155
Epoch  160 loss:  1.45584529642
Epoch  170 loss:  1.45245722439
Epoch  180 loss:  1.44985073664
Epoch  190 loss:  1.44770304345
Epoch  200 loss:  1.44580957711
Epoch  210 loss:  1.44404715311
Epoch  220 loss:  1.44234869508
Epoch  230 loss:  1.44068875442
Epoch  240 loss:  1.43907301747
Epoch  250 loss:  1.43752727877
Epoch  2

### Example single-layer model, trained in batches of 10

In [30]:
model.run(x, y, epochs=400, batchSize=10, learningRate=0.001, shuffle=False)

Running Mini-batch Stochastic Gradient Descent on 1000 samples
Epoch  0 loss:  1.40991962388
Epoch  10 loss:  1.40613441236
Epoch  20 loss:  1.4027722298
Epoch  30 loss:  1.39984658997
Epoch  40 loss:  1.39728410455
Epoch  50 loss:  1.39503306352
Epoch  60 loss:  1.39305256118
Epoch  70 loss:  1.39130704313
Epoch  80 loss:  1.3897647086
Epoch  90 loss:  1.38839730088
Epoch  100 loss:  1.38718012932
Epoch  110 loss:  1.38609195369
Epoch  120 loss:  1.38511470407
Epoch  130 loss:  1.38423310761
Epoch  140 loss:  1.38343429221
Epoch  150 loss:  1.3827074113
Epoch  160 loss:  1.38204331062
Epoch  170 loss:  1.38143424281
Epoch  180 loss:  1.38087362841
Epoch  190 loss:  1.38035585801
Epoch  200 loss:  1.37987612946
Epoch  210 loss:  1.37943031448
Epoch  220 loss:  1.37901484919
Epoch  230 loss:  1.37862664427
Epoch  240 loss:  1.37826301106
Epoch  250 loss:  1.37792160043
Epoch  260 loss:  1.37760035227
Epoch  270 loss:  1.37729745339
Epoch  280 loss:  1.37701130242
Epoch  290 loss:  1.376

<p><b>Comments:</b> It can be seen that using batches speeds up the convergence of the training.</p>

### Trying out random-sampling for training
### Single Sample at a time

In [32]:
model.run(x, y, epochs=400, batchSize=1, learningRate=0.001, shuffle=True)

Running  Stochastic Gradient Descent on 1000 samples
Epoch  0 loss:  1.38761800322
Epoch  10 loss:  1.42942156931
Epoch  20 loss:  1.4272957911
Epoch  30 loss:  1.38622805864
Epoch  40 loss:  1.41415761276
Epoch  50 loss:  1.38676261014
Epoch  60 loss:  1.41321475101
Epoch  70 loss:  1.39395877606
Epoch  80 loss:  1.39836963994
Epoch  90 loss:  1.40333077509
Epoch  100 loss:  1.42287634218
Epoch  110 loss:  1.4283698032
Epoch  120 loss:  1.39631036789
Epoch  130 loss:  1.42875055558
Epoch  140 loss:  1.39602671236
Epoch  150 loss:  1.38502227728
Epoch  160 loss:  1.39991312008
Epoch  170 loss:  1.4069977729
Epoch  180 loss:  1.40141082978
Epoch  190 loss:  1.39612246034
Epoch  200 loss:  1.40856251505
Epoch  210 loss:  1.41626785692
Epoch  220 loss:  1.36902450558
Epoch  230 loss:  1.44089866394
Epoch  240 loss:  1.39521069621
Epoch  250 loss:  1.41724248797
Epoch  260 loss:  1.36194033613
Epoch  270 loss:  1.42208954534
Epoch  280 loss:  1.41621584672
Epoch  290 loss:  1.39080190325
E

### Taking 10 random samples at a time (random batch of size 10)

In [33]:
model.run(x, y, epochs=400, batchSize=10, learningRate=0.001, shuffle=True)

Running Mini-batch Stochastic Gradient Descent on 1000 samples
Epoch  0 loss:  1.40725059097
Epoch  10 loss:  1.41112266304
Epoch  20 loss:  1.38854570903
Epoch  30 loss:  1.37914962743
Epoch  40 loss:  1.40715187718
Epoch  50 loss:  1.37637128226
Epoch  60 loss:  1.40719156427
Epoch  70 loss:  1.4101033901
Epoch  80 loss:  1.39226346443
Epoch  90 loss:  1.40471213107
Epoch  100 loss:  1.3766723526
Epoch  110 loss:  1.40190021814
Epoch  120 loss:  1.38769168943
Epoch  130 loss:  1.35716558207
Epoch  140 loss:  1.36816255405
Epoch  150 loss:  1.37079817963
Epoch  160 loss:  1.35048676313
Epoch  170 loss:  1.42834480066
Epoch  180 loss:  1.3738737437
Epoch  190 loss:  1.34209541294
Epoch  200 loss:  1.37617470578
Epoch  210 loss:  1.37406859883
Epoch  220 loss:  1.35919854812
Epoch  230 loss:  1.32920991944
Epoch  240 loss:  1.3821722894
Epoch  250 loss:  1.36661350841
Epoch  260 loss:  1.36029899663
Epoch  270 loss:  1.34078426684
Epoch  280 loss:  1.3852981694
Epoch  290 loss:  1.39637

<b>Observation:</b> The training convergence using Random sampling is not really monotonous.

## Gradient Checking for Batch implementation

In [15]:
linear_1 = newModel.layerStack['linear_1']
linear_2= newModel.layerStack['linear_2']
sigmoid_1 = newModel.layerStack['sigmoid_1']
sigmoid_2 = newModel.layerStack['sigmoid_2']
mseLoss = newModel.layerStack['mse_loss']

batchSize = 1000
#'x' and 'y' in this case is a batch of 1000 samples
linear_1_out = linear_1.forward(x) 
sigmoid_1_out = sigmoid_1.forward(linear_1_out)
linear_2_out = linear_2.forward(sigmoid_1_out)
sigmoid_2_out = sigmoid_2.forward(linear_2_out)
loss_out = mseLoss.forward(sigmoid_2_out, y)

# backprop

_ = linear_1.backward(x, sigmoid_1.backward(linear_2.backward(sigmoid_1_out, sigmoid_2.backward(mseLoss.backward()))))

gradWeight = linear_2.gradWeight/batchSize
gradBias = linear_2.gradBias/batchSize

approxGradWeight = np.zeros_like(linear_2.weight)
approxGradBias = np.zeros_like(linear_2.bias)

EPSILON = 1e-4

updatedWeight = Linear(4, 5)
updatedLinear = Linear(5, 3)

for i in range(linear_2.weight.shape[0]):
    for j in range(linear_2.weight.shape[1]):
        fw = mseLoss.forward(sigmoid_2.forward(linear_2.forward(sigmoid_1_out)), y)
        updatedWeight = np.copy(linear_2.weight)
        updatedWeight[i, j] = updatedWeight[i, j] + EPSILON
        updatedLinear.bias = linear_2.bias
        updatedLinear.weight = updatedWeight
        fw_epsilon = mseLoss.forward(sigmoid_2.forward(updatedLinear.forward(sigmoid_1_out)), y) # Loss function
        approxGradWeight[i, j] = (fw_epsilon - fw) / EPSILON

print 'Weight check for W2:\n'
print 'gradWeight: \n' , gradWeight
print 'approxGradWeight: \n' , approxGradWeight

gradWeight = linear_1.gradWeight/batchSize
gradBias = linear_1.gradBias/batchSize

approxGradWeight = np.zeros_like(linear_1.weight)
approxGradBias = np.zeros_like(linear_1.bias)

updatedLinear = Linear(4, 5)

for i in range(linear_1.weight.shape[0]):
    for j in range(linear_1.weight.shape[1]):
        fw = mseLoss.forward(sigmoid_2.forward(linear_2.forward(sigmoid_1.forward(linear_1.forward(x)))),y)
        updatedWeight = np.copy(linear_1.weight)
        updatedWeight[i, j] = updatedWeight[i, j] + EPSILON
        updatedLinear.weight = updatedWeight
        updatedLinear.bias = linear_1.bias
        fw_epsilon = mseLoss.forward(sigmoid_2.forward(linear_2.forward(sigmoid_1.forward(updatedLinear.forward(x)))),y)
        approxGradWeight[i, j] = (fw_epsilon - fw) / EPSILON


print '\nWeight check for W1:\n'
print 'gradWeight: \n' , gradWeight
print 'approxGradWeight: \n' , approxGradWeight

Weight check for W2:

gradWeight: 
[[ 0.05271095  0.36303113  0.08622958]
 [ 0.02351996  0.2878376   0.36617409]
 [ 0.03457675  0.30030316  0.27429531]
 [ 0.02623346  0.29003086  0.34441089]
 [ 0.08507576  1.23521372 -0.05303475]]
approxGradWeight: 
[[ 0.05271167  0.36312971  0.08633201]
 [ 0.02351967  0.28811941  0.36624498]
 [ 0.03457664  0.30057402  0.2743752 ]
 [ 0.02623321  0.29031064  0.34448307]
 [ 0.08507863  1.2353177  -0.0527599 ]]

Weight check for W1:

gradWeight: 
[[-0.29137329  0.46947352  0.17792248  0.40948762 -2.20020529]
 [ 0.1260089   0.03470236  0.09857441  0.03926862 -1.37334271]
 [-0.61088509  0.34243408  0.06516932  0.28271924 -2.50210997]
 [-0.57915531  0.04761378 -0.14647811  0.01072872 -2.31327458]]
approxGradWeight: 
[[-0.28891018  0.47329209  0.18037311  0.41293373 -2.18378859]
 [ 0.12633085  0.03556904  0.09912678  0.04005361 -1.3730511 ]
 [-0.60807715  0.34678807  0.06792259  0.28659999 -2.48648126]
 [-0.57749569  0.04862766 -0.1458042   0.01163421 -2.2990

<p>The weights are indeed correctly updated even for batch implementation</p>