In [None]:
from __future__ import print_function
import numpy as np
# Seed NumPy's global RNG so that code drawing from np.random (weight initialisation,
# mini-batch shuffling) produces the same numbers on every fresh run.
np.random.seed(42)

In [None]:
# Load the MNIST dataset object from a local pickle dump.
import pickle
# NOTE(review): pickle.load can execute arbitrary code embedded in the file — only load a dump you trust.
# encoding="latin1" is forwarded to the unpickler; presumably the dump was written by Python 2 — confirm.
with open("mnist.dump","rb") as f:
    mnist = pickle.load(f, encoding = "latin1")

In [None]:
# Pull the feature matrix and the label vector out of the loaded dataset object.
# X_train.shape[1] is later used as the first layer's input size, so X is presumably
# (n_samples, n_features) — confirm against the dump.
X = mnist.data
# Labels are later one-hot encoded into 10 classes.
y = mnist.target

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples as the test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [None]:
# One-hot encode the labels: row i of each result is the length-10 indicator vector
# of the i-th label converted to an integer class index.
# A single integer-array lookup into the identity matrix replaces the per-sample Python
# loop, which rebuilt np.eye(10) on every iteration. The result always has shape
# (n_samples, 10), including for an empty label array.
y_train_transformed = np.eye(10)[np.asarray(y_train).astype(int)]
y_test_transformed = np.eye(10)[np.asarray(y_test).astype(int)]

In [None]:
#Abstract Class
class Layer:
    """Base building block of the network; on its own it behaves as an identity layer.

    Subclasses override ``forward`` to compute their output and ``backward`` to
    propagate the loss gradient back to their input.
    """

    def __init__ (self):
        pass

    def forward(self, input):
        """Return the layer output for ``input``; the base layer passes it through unchanged."""
        return input

    def backward(self, input, gradient_output):
        """Return the gradient of the loss with respect to ``input``.

        Parameters
        ----------
        input : the value that was passed to ``forward``.
        gradient_output : gradient of the loss with respect to this layer's output.

        For the identity layer d(output)/d(input) is the identity matrix, so the
        chain rule leaves the incoming gradient unchanged. (The previous body
        multiplied by ``np.eye`` but referenced an undefined name ``grad_output``
        and raised NameError.)
        """
        return gradient_output

In [None]:
class Sigmoid(Layer):
    """Element-wise logistic sigmoid activation; the layer has no trainable parameters."""

    def __init__(self):
        pass

    def forward(self, input):
        """Apply ``1 / (1 + exp(-x))`` to every element of ``input``."""
        return 1. / (1. + np.exp(-input))

    def backward(self, input, gradient_output):
        """Return the loss gradient with respect to ``input`` via the chain rule.

        Parameters
        ----------
        input : the array that was passed to ``forward``.
        gradient_output : gradient of the loss with respect to this layer's output
            (same shape as ``input``).

        The local derivative of the sigmoid is ``s * (1 - s)`` with ``s = forward(input)``.
        It must be multiplied by ``gradient_output``; returning the local derivative
        alone (as before) discards the loss signal for every layer below this one.
        """
        sigmoid = self.forward(input)  # computed once and reused
        return gradient_output * sigmoid * (1 - sigmoid)

In [None]:
class Dense(Layer):
    """Fully connected layer computing ``input @ weights + biases``.

    ``backward`` both propagates the gradient to the layer input and applies one
    plain gradient-descent step to the layer's own parameters, scaled by
    ``learning_rate``.
    """

    def __init__(self, input_units, output_units, learning_rate=0.1):
        """Create a layer mapping ``input_units`` features to ``output_units`` features."""
        self.learning_rate = learning_rate
        self.input_units = input_units
        self.output_units = output_units
        # Weights are drawn uniformly from [-50, 50) (not from a normal distribution,
        # as the earlier comment said); biases start at one.
        # NOTE(review): this range is very wide for layers feeding a sigmoid —
        # presumably intentional together with the tiny learning rates; confirm.
        self.weights = np.random.uniform(-50, 50, (input_units, output_units))
        self.biases = np.ones(output_units)

    def forward(self,input):
        """Apply the affine map ``input @ weights + biases`` to a batch of row vectors."""
        return np.dot(input, self.weights) + self.biases

    def backward(self, input, gradient_output):
        """Propagate the gradient and update the parameters in one step.

        Parameters
        ----------
        input : batch that was passed to ``forward``; ``input.T @ gradient_output``
            must have the shape of ``weights``.
        gradient_output : gradient of the loss with respect to this layer's output.

        Returns
        -------
        Gradient of the loss with respect to ``input``.
        """
        # d(loss)/d(input) = d(loss)/d(output) @ W^T. This has to be computed
        # before the weights are updated below.
        gradient_input = np.dot(gradient_output, self.weights.T)

        # d(loss)/d(W) = input^T @ d(loss)/d(output)
        gradient_weights = np.dot(input.T, gradient_output)
        # d(loss)/d(b): each bias contributes to every sample, so sum over the batch axis.
        gradient_biases = np.sum(gradient_output, axis=0)

        assert gradient_weights.shape == self.weights.shape and gradient_biases.shape == self.biases.shape

        # Plain gradient-descent step.
        self.weights = self.weights - self.learning_rate * gradient_weights
        self.biases = self.biases - self.learning_rate * gradient_biases
        return gradient_input

In [None]:
def softmax_crossentropy_with_logits(logits, reference_answers):
    """Per-sample softmax cross-entropy computed directly from logits.

    Parameters
    ----------
    logits : 2-D array-like, one row of class scores per sample.
    reference_answers : one-hot rows aligned with ``logits``; the true class of a
        row is taken as the position of its largest entry.

    Returns
    -------
    1-D array with one loss per sample:
    ``-logit[true_class] + log(sum(exp(logits)))``.

    The log-sum-exp term is evaluated after subtracting each row's maximum, so
    large logits no longer overflow ``exp`` and turn the loss into ``inf``.
    """
    logits = np.asarray(logits)
    answer_index = np.argmax(reference_answers, axis=-1)
    logits_for_answers = logits[np.arange(len(logits)), answer_index]

    # log(sum(exp(z))) == m + log(sum(exp(z - m))) for any m; m = row maximum keeps
    # every exponent <= 0.
    row_max = logits.max(axis=-1)
    shifted = logits - row_max[:, np.newaxis]
    log_sum_exp = row_max + np.log(np.sum(np.exp(shifted), axis=-1))

    return log_sum_exp - logits_for_answers

def grad_softmax_crossentropy_with_logits(logits,reference_answers):
    """Gradient of the per-sample softmax cross-entropy with respect to ``logits``.

    Parameters
    ----------
    logits : 2-D array-like, one row of class scores per sample.
    reference_answers : one-hot rows aligned with ``logits``; the true class of a
        row is taken as the position of its largest entry.

    Returns
    -------
    Array shaped like ``logits`` holding ``softmax(logits) - one_hot(true_class)``.
    The result is not divided by the batch size.

    The softmax is evaluated on logits shifted by each row's maximum, which leaves
    the result unchanged but prevents ``exp`` from overflowing into ``inf / inf = nan``.
    """
    logits = np.asarray(logits)
    answer_index = np.argmax(reference_answers, axis=-1)

    shifted = logits - logits.max(axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    gradient = exp_shifted / exp_shifted.sum(axis=-1, keepdims=True)

    # Subtract the one-hot target: exactly one entry per row is touched.
    gradient[np.arange(len(logits)), answer_index] -= 1
    return gradient

In [None]:
# The model is an ordered list of layers: three Dense layers with a Sigmoid after
# each of the first two. Every Dense layer gets its own learning rate (third argument).
network = [
    Dense(X_train.shape[1], 64, 0.0001),
    Sigmoid(),
    Dense(64, 128, 0.0000001),
    Sigmoid(),
    Dense(128, 10, 0.1),
]

In [None]:
def forward(network, X):
    """Feed ``X`` through every layer in order and collect each layer's output.

    Returns a list with one entry per layer; the last entry is the output of the
    final layer. ``network`` must contain at least one layer.
    """
    current = network[0].forward(X)
    outputs = [current]

    # Each remaining layer consumes the output of the layer before it.
    for layer in network[1:]:
        current = layer.forward(current)
        outputs.append(current)

    assert len(outputs) == len(network)
    return outputs

def predict(network, X):
    """Return, for each row of ``X``, the index of the largest output of the last layer."""
    final_scores = forward(network, X)[-1]
    predicted_classes = final_scores.argmax(axis=-1)
    return predicted_classes

def train(network, X, y):
    """Run one forward and one backward pass over a batch; return the mean loss.

    ``X`` is the input batch and ``y`` the matching reference answers. Each layer's
    ``backward`` is called from the last layer to the first with the input that
    layer received during the forward pass.
    """
    activations = forward(network, X)
    logits = activations[-1]
    # inputs_per_layer[k] is what network[k] received in the forward pass.
    inputs_per_layer = [X] + activations

    # Loss and its gradient with respect to the logits.
    batch_loss = softmax_crossentropy_with_logits(logits, y)
    gradient = grad_softmax_crossentropy_with_logits(logits, y)

    # Backpropagation: walk the layers in reverse, handing each one its own input
    # (the final activation is nobody's input, hence the [:-1]).
    for layer, layer_input in zip(reversed(network), reversed(inputs_per_layer[:-1])):
        gradient = layer.backward(layer_input, gradient)

    return np.mean(batch_loss)

In [None]:
from tqdm import trange
from tqdm import tqdm_notebook
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    """Yield ``(inputs_batch, targets_batch)`` pairs of exactly ``batchsize`` samples.

    With ``shuffle`` set, samples are visited in a random order drawn once from
    NumPy's global RNG; otherwise contiguous slices are returned. Trailing samples
    that do not fill a whole batch are not yielded.
    """
    n_samples = len(inputs)
    assert n_samples == len(targets)

    order = np.random.permutation(n_samples) if shuffle else None

    for begin in range(0, n_samples - batchsize + 1, batchsize):
        end = begin + batchsize
        selector = order[begin:end] if shuffle else slice(begin, end)
        yield inputs[selector], targets[selector]

In [None]:
from sklearn.metrics import accuracy_score
# Train for 3 passes over the training set, each in shuffled mini-batches of 50 samples,
# and report accuracy on the full train and test sets after every pass.
for i in range(3):
    for X_batch, y_batch in iterate_minibatches(X_train, y_train_transformed, 50, True):
        # train() returns the mean batch loss, which is not used here.
        train(network, X_batch, y_batch)
    y_mnist_train_pred = predict(network, X_train)
    y_mnist_test_pred = predict(network, X_test)
    # Printed as "<train accuracy>, <test accuracy>".
    print(str(accuracy_score(y_train, y_mnist_train_pred)) + ',', accuracy_score(y_test, y_mnist_test_pred))