In [53]:
import torch, torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda
import matplotlib.pyplot as plt

import numpy as np
import matplotlib.pylab as plt
from sklearn.preprocessing import OneHotEncoder


In [7]:
train_dataset = torchvision.datasets.MNIST('./data', train=True, download=True, transform=ToTensor())
test_dataset = torchvision.datasets.MNIST('./data', train=False, download=True, transform=ToTensor())

In [64]:
def fetch(fp):
    with open(fp, "rb") as f:
        data = f.read()
    return np.frombuffer(data, dtype=np.uint8).copy()

In [68]:
X_train = fetch("./mnist/train-images-idx3-ubyte")[0x10:].reshape((-1, 28 * 28))
Y_train = fetch("./mnist/train-labels-idx1-ubyte")[8:]
X_test = fetch("./mnist/t10k-images-idx3-ubyte")[0x10:].reshape((-1, 28 * 28))
Y_test = fetch("./mnist/t10k-labels-idx1-ubyte")[8:]

In [69]:
def normalize_oneHotEncoding(X_train, Y_train, X_test, Y_test):
    
    oneHotEncoder = OneHotEncoder(sparse=False, categories='auto')
    Y_train = oneHotEncoder.fit_transform(Y_train.reshape(len(Y_train), -1))
    Y_test = oneHotEncoder.transform(Y_test.reshape(len(Y_test), -1))
    
    # Normalize data by diving by 255. All values are in range 0-255
    X_train = X_train / 255.
    X_test = X_test / 255.

    return X_train, Y_train, X_test, Y_test


In [71]:
X_train, Y_train, X_test, Y_test = normalize_oneHotEncoding(X_train, Y_train, X_test, Y_test)



In [72]:
def init_weights():
    W1 = np.random.randn(300, 784) / np.sqrt(784)
    W2 = np.random.randn(10, 300) / np.sqrt(300)
    return W1, W2

In [73]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    val = np.exp(x - np.max(x))
    return val / val.sum(axis=0)

def de_sigmoid(x):
    return sigmoid(x) * (1 - sigmoid(x))

def forward(X, w1, w2):
    z1 = w1.dot(X.T)
    i1 = sigmoid(z1)
    z2 = w2.dot(i1)
    i2 = softmax(z2)
    return i1, w1, z1, i2, w2, z2



def backward(X, Y, i1, w1, z1, i2, w2, z2, n=60000):
    dz2 = i2 - Y.T
    dw2 = dz2.dot(i1.T) / n
    dz1 = w2.T.dot(dz2) * de_sigmoid(z1)
    dw1 = dz1.dot(X) / n
    return dw1, dw2

def predict(w1, w2, X, Y):
    i1, w1, z1, i2, w2, z2 = forward(X, w1, w2)
    y_hat = np.argmax(i2, axis=0)
    Y = np.argmax(Y, axis = 1)
    accuracy = (y_hat == Y).mean()
    return accuracy * 100



In [77]:
costs = []
accus = []


def train_model(X, Y, alpha=0.1, epochs=400):
    w1, w2 = init_weights()
    for epoch in range(epochs):
        i1, w1, z1, i2, w2, z2 = forward(X, w1, w2)
        cost = -np.mean(Y*np.log(i2.T))
        dw1, dw2 = backward(X, Y, i1, w1, z1, i2, w2, z2)
        w1 = w1 - alpha * dw1
        w2 = w2 - alpha * dw2
        acc = predict(w1, w2, X, Y)
        if epoch % 100 == 0:
            print("Cost: ", cost, "Train Accuracy:", acc)
        if epoch % 10 == 0:
            costs.append(cost)
            accus.append(acc)
    return w1, w2

In [78]:
w1, w2= train_model(X_train, Y_train)

Cost:  0.23737103272958993 Train Accuracy: 9.098333333333333
Cost:  0.1416979255432799 Train Accuracy: 76.77833333333334
Cost:  0.08834103193720771 Train Accuracy: 82.67833333333333
Cost:  0.06702208291292015 Train Accuracy: 85.10666666666667


In [79]:
test_error = 100 - predict(w1, w2, X_test, Y_test)

In [80]:
test_error


12.620000000000005

In [75]:
train_dataloader = DataLoader(train_dataset, batch_size=32)
test_dataloader = DataLoader(test_dataset, batch_size=32)

In [26]:
# GPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print('GPU State:', device)


GPU State: cpu


In [12]:
train_dataset[0][0].squeeze().shape

torch.Size([28, 28])

In [46]:
for i, data in enumerate(train_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        print(inputs)

tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]],


        ...,


        [[[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0.

In [39]:
class FNN(nn.Module):
    def __init__(self, d1, d2):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_layers = nn.Sequential(
            nn.Linear(28*28, d1),
            nn.Sigmoid(),
            nn.Linear(d1, d2),
            nn.Softmax(),
            nn.Linear(d2, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_layers(x)
        return logits

In [40]:
model = FNN(300, 400)

In [51]:
lr = 1e-5
bs = 64
epochs = 100
loss_fn = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

In [52]:
for epoch in range(epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_dataloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 100 == 99:    # print every 100 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0

print('Finished Training')


[1,   100] loss: -0.946
[1,   200] loss: -0.948
[1,   300] loss: -0.948
[1,   400] loss: -0.947
[1,   500] loss: -0.947
[1,   600] loss: -0.949
[1,   700] loss: -0.946
[1,   800] loss: -0.947
[1,   900] loss: -0.946
[1,  1000] loss: -0.948
[1,  1100] loss: -0.947
[1,  1200] loss: -0.946
[1,  1300] loss: -0.948
[1,  1400] loss: -0.949
[1,  1500] loss: -0.948
[1,  1600] loss: -0.946
[1,  1700] loss: -0.946
[1,  1800] loss: -0.946
[2,   100] loss: -0.948
[2,   200] loss: -0.950
[2,   300] loss: -0.950
[2,   400] loss: -0.949
[2,   500] loss: -0.949
[2,   600] loss: -0.951
[2,   700] loss: -0.948
[2,   800] loss: -0.949
[2,   900] loss: -0.948
[2,  1000] loss: -0.950
[2,  1100] loss: -0.949
[2,  1200] loss: -0.948
[2,  1300] loss: -0.950
[2,  1400] loss: -0.950
[2,  1500] loss: -0.949
[2,  1600] loss: -0.948
[2,  1700] loss: -0.948
[2,  1800] loss: -0.948
[3,   100] loss: -0.950
[3,   200] loss: -0.952
[3,   300] loss: -0.952
[3,   400] loss: -0.950
[3,   500] loss: -0.950
[3,   600] loss:

KeyboardInterrupt: 

In [49]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in test_dataloader:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = model(images)
        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the {len(images)} test images: {100 * correct // total} %')

Accuracy of the network on the 16 test images: 11 %


In [50]:
classes = list(range(10))
# prepare to count predictions for each class
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

# again no gradients needed
with torch.no_grad():
    for data in test_dataloader:
        images, labels = data
        outputs = model(images)
        _, predictions = torch.max(outputs, 1)
        # collect the correct predictions for each class
        for label, prediction in zip(labels, predictions):
            if label == prediction:
                correct_pred[classes[label]] += 1
            total_pred[classes[label]] += 1


# print accuracy for each class
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print(f'Accuracy for class: {classname:5f} is {accuracy:.1f} %')

Accuracy for class: 0.000000 is 0.0 %
Accuracy for class: 1.000000 is 100.0 %
Accuracy for class: 2.000000 is 0.0 %
Accuracy for class: 3.000000 is 0.0 %
Accuracy for class: 4.000000 is 0.0 %
Accuracy for class: 5.000000 is 0.0 %
Accuracy for class: 6.000000 is 0.0 %
Accuracy for class: 7.000000 is 0.0 %
Accuracy for class: 8.000000 is 0.0 %
Accuracy for class: 9.000000 is 0.0 %
