In [10]:
import numpy as np
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from torchvision.datasets import MNIST
from abc import ABC, abstractmethod

Nesse notebook, será construído uma rede neural à mão que consegue reconhecer as imagens dos números do conjunto de dados [MNIST](https://en.wikipedia.org/wiki/MNIST_database). Foram utilizadas as mesmas classes e funções do notebook [XOR à mão](xor_from_scratch.ipynb).

In [11]:
class Function(ABC):
    """
    Classe abstrata para as funções de ativação e de loss.
    Exige que suas subclasses possuam uma função f e a derivada dessa função f_prime
    """
    @abstractmethod
    def f(self):
        pass
    @abstractmethod
    def f_prime(self):
        pass

In [12]:
class Sigmoid(Function):
    def f(self, x):
        return 1 / (1 + np.exp(-x))
    def f_prime(self, x):
        return self.f(x) * (1-self.f(x))

In [13]:
class ReLU(Function):
    def f(self, x):
        return (x > 0) * x
    def f_prime(self, x):
        return (x > 0) * 1

In [14]:
class MSELoss(Function):
    def f(self, y_hat, y):
        return np.sum((y_hat - y)**2)/len(y_hat)
    def f_prime(self, y_hat, y):
        return 2*(y_hat - y)

In [15]:
class Layer:
    def __init__(self, n_of_inputs: int, n_of_neurons: int , activation: Function, bias: float=0.0):
        self.n_of_inputs = n_of_inputs
        self.n_of_neurons = n_of_neurons
        self.activation = activation
        self.bias = np.ones((1, n_of_neurons)) * bias # bias, inicializado como 0 por padrão
        self.weights = np.random.uniform(-1, 1, (n_of_inputs, n_of_neurons)) # matriz de pesos 
        
        # As variáveis abaixo são necessárias para o backward
        self.weight_gradient = None  # vetor de gradiente dos pesos
        self.bias_gradient = None # vetor de gradiente do bias
        self.layer_inputs = None # output da camada anterior, ou as entradas da rede caso for a primeira camada
        self.linear_output = None # resultado antes de ser aplicada a função de ativação -> linear_output = a @ w + b

    def forward(self, x):
        """
        Forward propagation da camada
        """
        # Shapes:
        # Primeira para a segunda camada: (1, 2) @ (2, 2) = (1, 2)
        # Segunda para a terceira camada: (1, 2) @ (2, 1) = (1, 1)
        self.layer_inputs = x 
        dot_product = self.layer_inputs @ self.weights 
        self.linear_output = dot_product + self.bias
        output = self.activation.f(self.linear_output)
        return output

    def backward(self, chain_rule_derivatives):
        """
        Cálculo dos gradientes da camada. 
        É calculada as derivadas em relação a matriz de pesos e o bias da camada (dC_dw e dC_db), e a 
        derivada em relação ao linear_output (dC_da), para que possa mandar essa derivada para trás para calcular
        o gradiente dos pesos das camadas anteriores, conforme o diagrama
        Parâmetros:
        chain_rule_derivatives - derivada calculada através da regra da cadeia, que foi mandada da camada seguinte (dC_da1)
        Retorno:
        chain_rule_derivatives - derivada calculada através da regra da cadeia, para ser mandada para a camada anterior (dc_da0)
        """
        da1_dz = self.activation.f_prime(self.linear_output) 
        dz_dw = self.layer_inputs
        dz_da0 = self.weights
        
        dC_dw = dz_dw.T @ (da1_dz * chain_rule_derivatives) 
        dC_db = 1 * da1_dz * chain_rule_derivatives
        dC_da0 = (chain_rule_derivatives * da1_dz) @ dz_da0.T
        
        chain_rule_derivatives = dC_da0
        self.weight_gradient = dC_dw
        self.bias_gradient = dC_db
        
        return chain_rule_derivatives

In [16]:
class NeuralNetwork:
    def __init__(self, input_size, lr):
        self.layers = []
        self.input_size = input_size
        self.lr = lr

    def forward(self, x):
        """
        Forward propagation da rede
        """
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, loss_derivative):
        """
        Backward propagation da rede.
        Calcula os gradientes e aplica o algoritmo de gradiente descendente para atualizar os pesos e os bias
        """
        # Cálculo dos gradientes
        chain_rule_derivatives = loss_derivative
        for layer in reversed(self.layers):
            chain_rule_derivatives = layer.backward(chain_rule_derivatives)
        
        # Gradiente descendente
        for layer in self.layers:
            layer.weights -= layer.weight_gradient * self.lr
            layer.bias -= layer.bias_gradient * self.lr

    # Faz o forward chamando o objeto, passando os inputs como parâmetro, da mesma forma que o PyTorch faz
    def __call__(self, inputs):
        return self.forward(inputs)

    def append_layer(self, output_number: int, activation, bias: float=0.0):
        """
        Dado um número de saída adiciona uma camada ao fim da rede neural
        Ex: nn = NeuralNetwork(...)
          nn.append_layer(...)
          nn.append_layer(...)
          ...
        """
        # Caso seja a primeira camada
        if len(self.layers) == 0:
            new_layer_input = self.input_size
        else:
            new_layer_input = self.layers[-1].n_of_neurons

        self.layers.append(Layer(new_layer_input, output_number, activation, bias))

In [17]:
# Cria vetor one hot mnist
def one_hot(value: int):
    one_hot_vec = np.zeros((1, 10))
    one_hot_vec[0][value] = 1
    return one_hot_vec

In [19]:
# Downloading MNIST dataset
data_path='/data/mnist'
mnist_train = MNIST(data_path, train=True, transform=transforms.ToTensor(),)
mnist_test = MNIST(data_path, train=False, transform=transforms.ToTensor(),)
num_classes = 10  # MNIST has 10 output classes
train_set = mnist_train.data.numpy()/255
train_targets = [one_hot(t) for t in mnist_train.targets]
test_set = mnist_test.data.numpy()/255
test_targets = [one_hot(t) for t in mnist_test.targets]
print(f"Size of train set is {len(train_set)}")
print(f"Size of test set is {len(test_set)}")
lr = 0.003
nn = NeuralNetwork(28*28, lr)
nn.append_layer(64, activation=Sigmoid(), bias=1)
nn.append_layer(64, activation=Sigmoid(), bias=1)
nn.append_layer(10, activation=Sigmoid(), bias=1)
criterion = MSELoss()

for epoch in range(10):
    total_loss = 0
    for i, (x, y) in enumerate(zip(train_set, train_targets)):
        y_hat = nn(x.flatten().reshape(1,-1))
        loss = criterion.f(y_hat, y)
        loss_derivative = criterion.f_prime(y_hat, y)
        nn.backward(loss_derivative)
        total_loss += loss
        if i % 20000 == 0 and i > 0:
            print(f"[{epoch}, {i+1:5d}] Accumulated Loss: {total_loss/(20000 * i)}")

    print(f"Loss: {total_loss / len(train_set)} - Epoch: {epoch + 1}")

    # validate
    hits = 0
    for x, y in zip(test_set, test_targets):
        y_hat = nn(x.flatten().reshape(1, -1))
        if np.argmax(y_hat) == np.argmax(y):
            hits += 1
    print(f'\nEpoch Accuracy: {hits/len(test_set)* 100}%\n')


def guess():
    n = np.random.randint(0, len(test_set))
    predicted = nn.forward(test_set[n].flatten().reshape(1, -1))
    actual = test_targets[n]
    print(f"Actual number: {np.argmax(actual)} - Predicted number: {np.argmax(predicted)}")

Size of train set is 60000
Size of test set is 10000
[0, 20001] Accumulated Loss: 8.173645930216533e-05
[0, 40001] Accumulated Loss: 7.590054531862076e-05
Loss: 1.2482032712511677 - Epoch: 1

Epoch Accuracy: 71.85000000000001%

[1, 20001] Accumulated Loss: 1.9677267203508734e-05
[1, 40001] Accumulated Loss: 1.847541873891639e-05
Loss: 0.3444905154554732 - Epoch: 2

Epoch Accuracy: 83.93%

[2, 20001] Accumulated Loss: 1.373712253907718e-05
[2, 40001] Accumulated Loss: 1.3424061316576966e-05
Loss: 0.2567657099762351 - Epoch: 3

Epoch Accuracy: 86.61999999999999%

[3, 20001] Accumulated Loss: 1.1251232598180684e-05
[3, 40001] Accumulated Loss: 1.1181469221829684e-05
Loss: 0.21610109058853308 - Epoch: 4

Epoch Accuracy: 88.53999999999999%

[4, 20001] Accumulated Loss: 9.831764285701646e-06
[4, 40001] Accumulated Loss: 9.862222120517024e-06
Loss: 0.19167733233027642 - Epoch: 5

Epoch Accuracy: 89.4%

[5, 20001] Accumulated Loss: 8.888930773724612e-06
[5, 40001] Accumulated Loss: 8.968867443

In [51]:
guess()

Actual number: 5 - Predicted number: 3
