In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
warnings.simplefilter('ignore')

In [None]:
import torch
import torchvision

# Практика: Погружение в глубокое обучение
В семинаре будем использовать набор данных `fashion_mnist`. Загрузим его.

In [None]:
from torchvision import datasets,transforms

In [None]:
# Training split of Fashion-MNIST; download=True fetches the files into the
# given root directory when they are not there yet.
trainset = datasets.FashionMNIST('~/.pytorch/F_MNIST_data/', train = True, download=True)

In [None]:
# Test split of Fashion-MNIST (train=False), stored under the same root directory.
testset = datasets.FashionMNIST('~/.pytorch/F_MNIST_data/', train = False, download = True)

In [None]:
# Number of target classes, taken from the dataset's list of class names.
num_classes = len(trainset.classes)

In [None]:
# `train_data` / `train_labels` are deprecated aliases in torchvision; `data` and
# `targets` are the supported attributes and carry the same names on both the
# training and the test split, so the test tensors no longer look like train data.
x_train = trainset.data
y_train = trainset.targets

x_test = testset.data
y_test = testset.targets

In [None]:
# Show one randomly chosen training image per class in a 2 x 5 grid
# (the grid size assumes there are at most 10 classes).
fig = plt.figure(figsize=(15, 5))
for class_id, class_name in enumerate(trainset.classes):
    ax = fig.add_subplot(2, 5, class_id + 1, xticks=[], yticks=[])
    # Row indices of all training samples that belong to the current class.
    class_rows = np.where(y_train == class_id)[0]
    class_images = x_train[class_rows]
    picked = np.random.randint(class_images.shape[0])
    ax.set_title(class_name)
    # The freshly added subplot is the current axes, so draw on it directly.
    ax.imshow(class_images[picked], cmap='gray_r')
plt.show()

### Проведем небольшую предобработку

In [None]:
# Collapse every dimension after the first into a single feature axis and cast
# to float so the tensors can be fed to Linear layers.
x_train_flat = x_train.reshape(x_train.shape[0], -1).float()
x_test_flat = x_test.reshape(x_test.shape[0], -1).float()

# Report the shape before and after flattening for both splits.
for original, flattened in ((x_train, x_train_flat), (x_test, x_test_flat)):
    print(f'Была размерность: {original.shape}, стала: {flattened.shape}')

In [None]:
# Network dimensions: one output per class, one input per flattened feature.
D_out = num_classes
D_in = x_train_flat.shape[1]

In [None]:
# Fully connected classifier: D_in -> 128 -> 10 -> D_out with sigmoid activations.
# The network ends with a plain Linear layer and therefore returns raw logits:
# torch.nn.CrossEntropyLoss (used as the loss in this notebook) applies
# log-softmax itself, so a trailing Softmax layer would normalise the outputs
# twice and weaken the gradients. argmax over logits gives the same predicted
# class; use torch.softmax(model(x), dim=1) when probabilities are needed.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, 128),
    torch.nn.Sigmoid(),
    torch.nn.Linear(128, 10),
    torch.nn.Sigmoid(),
    torch.nn.Linear(10, D_out),
)

In [None]:
# import torch.nn.functional as F

In [None]:
# Disabled alternative: a classifier written as a torch.nn.Module subclass.
# The code sits inside a string literal, so nothing below is executed; enabling
# it would also require the commented-out `import torch.nn.functional as F`.
"""class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.fc1 = torch.nn.Linear(28 * 28, 200)
        self.fc2 = torch.nn.Linear(200, 200)
        self.fc3 = torch.nn.Linear(200, 10)
    def forward(self, x):
        x = F.sigmoid(self.fc1(x))
        x = F.sigmoid(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x)
model = Model()
   """

In [None]:
# Bare expression: the notebook shows the model's repr as the cell output.
model

In [None]:
# Forward pass over the whole flattened training set with the untrained model.
y_pred = model(x_train_flat)

In [None]:
# Loss object shared as a module-level name by the training code in later cells.
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# One full-batch gradient-descent step done by hand, with loss and accuracy
# measured before and after the update.
learning_rate = 1e-2

# Forward pass: Module objects override __call__, so the model is called like a
# function on a Tensor of inputs and returns a Tensor of outputs.
# (x_train_flat is already a float tensor, so no extra cast is needed.)
y_pred = model(x_train_flat)

# Loss and accuracy before the update. The loss function takes the predicted
# and the true values and returns a Tensor holding the loss.
loss_old = loss_fn(y_pred, y_train)
acc_old = accuracy_score(y_train.numpy(), y_pred.argmax(dim=1).numpy())

# Zero the gradients before running the backward pass.
model.zero_grad()

# Backward pass: compute the gradient of the loss with respect to every
# learnable parameter (the tensors with requires_grad=True inside the modules).
loss_old.backward()

with torch.no_grad():
    # Plain gradient-descent update of each parameter tensor.
    for param in model.parameters():
        param -= learning_rate * param.grad

    # Re-evaluate on the same data. This is measurement only, so it stays under
    # no_grad to avoid building an autograd graph over the whole training set.
    y_pred = model(x_train_flat)
    loss_new = loss_fn(y_pred, y_train)

# Change of the loss caused by the single update (negative means improvement).
step = loss_new.item() - loss_old.item()
acc_new = accuracy_score(y_train.numpy(), y_pred.argmax(dim=1).numpy())

print(f'Лосс: {loss_old.item()} -> {loss_new.item()}. Step {step} ')
print(f'Accuracy: {acc_old} -> {acc_new}')

In [None]:
def batch_train(model, learning_rate, x, y):
    """Perform one manual gradient-descent step on a single mini-batch.

    The loss is computed with the module-level ``loss_fn``.

    Args:
        model: torch module whose parameters are updated in place.
        learning_rate: step size multiplied with each parameter gradient.
        x: batch of inputs passed to ``model``.
        y: batch of targets passed to ``loss_fn``.

    Returns:
        The batch loss as a Python float, measured before the update.
    """
    batch_loss = loss_fn(model(x), y)

    # Clear stale gradients, then back-propagate the fresh loss.
    model.zero_grad()
    batch_loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for weight in model.parameters():
            weight.sub_(learning_rate * weight.grad)

    return batch_loss.item()

In [None]:
def train(model, n_epochs, batch_size, learning_rate,  X, y, X_test, y_test):
    """Train ``model`` with shuffled mini-batches and record per-epoch metrics.

    Relies on the module-level ``batch_train`` (called as
    ``batch_train(model, learning_rate, batch_x, batch_y)``), ``loss_fn``,
    ``accuracy_score`` and ``tqdm``.

    Args:
        model: torch module updated in place by ``batch_train``.
        n_epochs: number of passes over ``X``.
        batch_size: number of samples per mini-batch (the last one may be smaller).
        learning_rate: step size forwarded to ``batch_train``.
        X, y: training inputs and integer class targets (CPU tensors).
        X_test, y_test: held-out inputs and targets evaluated after every epoch.

    Returns:
        Tuple ``(acc_train_all, loss_train_all, acc_test_all, loss_test_all)``
        of 1-D float NumPy arrays with one entry per epoch.
    """
    acc_train_all, loss_train_all = [], []
    acc_test_all, loss_test_all = [], []
    n_samples = X.size(0)

    for epoch in range(n_epochs):
        # A fresh random order of the samples for every epoch.
        permutation = torch.randperm(n_samples)

        for start in tqdm(range(0, n_samples, batch_size)):
            indices = permutation[start:start + batch_size]
            batch_train(model, learning_rate, X[indices], y[indices])

        # Evaluation only: no_grad avoids building an autograd graph over the
        # full train and test sets after every epoch.
        with torch.no_grad():
            y_test_pred = model(X_test)
            y_train_pred = model(X)
            loss_train = loss_fn(y_train_pred, y).numpy()
            loss_test = loss_fn(y_test_pred, y_test).numpy()

        acc_train = accuracy_score(y.numpy(), y_train_pred.argmax(dim=1).numpy())
        acc_test = accuracy_score(y_test.numpy(), y_test_pred.argmax(dim=1).numpy())

        # Plain list appends; the arrays are built once when returning.
        acc_train_all.append(acc_train)
        loss_train_all.append(loss_train)
        acc_test_all.append(acc_test)
        loss_test_all.append(loss_test)

        print(f'Epoch {epoch}: \n Accuracy - train: {acc_train} | test: {acc_test} \n Loss - train: {loss_train} | test: {loss_test}')

    return (
        np.asarray(acc_train_all, dtype=float),
        np.asarray(loss_train_all, dtype=float),
        np.asarray(acc_test_all, dtype=float),
        np.asarray(loss_test_all, dtype=float),
    )

In [None]:
# Hyper-parameters for training the sigmoid network on the flattened,
# un-normalised features.
n_epochs = 100
batch_size = 1000 
learning_rate = 1e-1

# train() returns the per-epoch accuracy/loss histories for both splits.
acc_train_all, loss_train_all, acc_test_all, loss_test_all = train(model, n_epochs, batch_size, learning_rate, x_train_flat, y_train, x_test_flat, y_test)


In [None]:
def vis_history(acc_train_all, loss_train_all, acc_test_all, loss_test_all):
    """Draw the training history as two side-by-side panels.

    The left panel shows the loss curves, the right panel the accuracy curves.
    In each panel the train series is labelled 'loss'/'acc' and the test series
    'val_loss'/'val_acc'. The figure is shown immediately; nothing is returned.
    """
    plt.figure(figsize=(16, 4))

    # (subplot position, train series, train label, test series, test label)
    panels = (
        (1, loss_train_all, 'loss', loss_test_all, 'val_loss'),
        (2, acc_train_all, 'acc', acc_test_all, 'val_acc'),
    )
    for position, train_series, train_label, test_series, test_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(train_series, label=train_label)
        plt.plot(test_series, label=test_label)
        plt.legend()

    plt.show()

In [None]:
vis_history(acc_train_all, loss_train_all, acc_test_all, loss_test_all)

#### Задание №1:  
Измените сеть выше, добавив еще один скрытый слой `Linear()` с 32 нейронами и перезапустите обучение

In [None]:
# Ваш код здесь

## Что мы можем улучшить? 
- Отнормировать признаки
- Заменить сигмоиды на ReLU
- Задать правила инициализации весов

### Нормирование
<img src='normalize.png'>

In [None]:
# Your code here
# Linear rescaling x -> (x / 255) * 2 - 1; assuming raw pixel values lie in
# [0, 255], the result lies in [-1, 1] (checked in the next cell).
x_train_norm = (x_train_flat/255)*2-1
x_test_norm = (x_test_flat/255)*2-1

In [None]:
# Show the largest and smallest value of the normalised training features.
x_train_norm.max(), x_train_norm.min()

### Функции активации
<img src='activations.png'>

### Инициализация весов
__Случайно__  
$w = a \cdot random$, но тогда если $a \gg 1$, то на выходе $b \gg 1$, а если $a \ll 1$, то $b \approx 0$  

__Xavier__  
$a = \frac{1}{\sqrt{n}}$, где $n$ - кол-во нейронов на входе

__He__  
$a = \frac{1}{\sqrt{\frac{n}{2}}}$, где $n$ - кол-во нейронов на входе

In [None]:
def init_weights(m):
    """Initialise a Linear layer; meant to be passed to ``Module.apply``.

    Weights are drawn with Xavier (Glorot) uniform initialisation and the bias,
    when the layer has one, is filled with the constant 0.01. Modules that are
    not ``torch.nn.Linear`` are left untouched.

    Args:
        m: a module visited by ``Module.apply``.
    """
    if isinstance(m, torch.nn.Linear):
        # The in-place variant with the trailing underscore replaces the
        # deprecated torch.nn.init.xavier_uniform.
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers created with bias=False have m.bias set to None.
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0.01)

#### Задание №2:   

1. Создайте новую сеть `model_2`:
 - 128 нейронов с функцией активации ReLU: 
 - 64 нейрона  с функцией активации ReLU:
 - 32 нейрона с функцией активации ReLU:
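 - Выходной слой на `D_out` классов (отдельный слой Softmax добавлять не нужно: `CrossEntropyLoss` применяет его внутри)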

2. Примените к модели функцию инициализации весов с помощью метода `.apply()`

In [None]:
# Task 2 reference solution (the bare `model_2 =` placeholder was a SyntaxError
# that stopped the notebook on Run All).
# Hidden layers: 128 -> 64 -> 32 units, each followed by ReLU. The last Linear
# layer returns raw logits because torch.nn.CrossEntropyLoss applies
# log-softmax itself; use torch.softmax(model_2(x), dim=1) for probabilities.
model_2 = torch.nn.Sequential(
    torch.nn.Linear(D_in, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, 32),
    torch.nn.ReLU(),
    torch.nn.Linear(32, D_out),
)

# Module.apply calls init_weights on every sub-module and returns the model;
# the trailing semicolon keeps the notebook from echoing its repr.
model_2.apply(init_weights);

In [None]:
# Hyper-parameters for training model_2 on the normalised features.
# NOTE(review): batch_size 2028 is unusual; possibly 2048 was intended - confirm.
n_epochs = 100
batch_size = 2028 
learning_rate = 1e-2

# Overwrites the history variables produced by the earlier training run.
acc_train_all, loss_train_all, acc_test_all, loss_test_all = train(model_2, n_epochs, batch_size, learning_rate,
                                                                   x_train_norm, y_train, x_test_norm, y_test)


In [None]:
vis_history(acc_train_all, loss_train_all, acc_test_all, loss_test_all)

## Влияние скорости обучения
Посмотрим, как влияет параметр `learning_rate` на качество нашей модели на обучающей выборке

In [None]:
# Learning rates compared in the experiment in the following cell.
learning_rates = [1e+1, 1e-2, 1e-3, 1e-5, 1e-10] 

In [None]:
# Train model_2 from a fresh initialisation once per learning rate and keep the
# per-epoch training-loss history of every run, keyed by the learning rate.
voc_loss_train = {}
batch_size = 5000

for lr in learning_rates:
    # Re-initialise the weights so the runs do not continue from each other.
    model_2.apply(init_weights)
    # Use the normalised features, as in every other model_2 experiment in this
    # notebook; the raw features would confound the effect of the learning rate
    # with the effect of un-normalised inputs.
    acc_train_all, loss_train_all, acc_test_all, loss_test_all = train(
        model_2, 25, batch_size, lr,
        x_train_norm, y_train, x_test_norm, y_test,
    )
    voc_loss_train[lr] = loss_train_all

In [None]:
# One training-loss curve per learning rate; the legend shows the rate.
fig = plt.figure(figsize=(16, 4))

for rate, loss_curve in voc_loss_train.items():
    plt.plot(loss_curve, label=f'{rate}')

plt.legend()
plt.show()

## Влияние метода оптимизации градиентного спуска

<img src='optimizers7.gif'>

#### Momentum
Вместо того, чтобы использовать только градиент текущего шага, мы будем накапливать импульс градиента прошлых шагов для определения направления движения. 
В связи со стохастической природой, обновления градиента происходят "зигзагообразно", с помощью момента мы усиливаем движение вдоль основного направления. На практике коэффициент у момента инициализируется на уровне 0,5 и постепенно увеличивается до 0,9 в течение нескольких эпох. 
  
#### RMSProp (Root Mean Square Propagation)   
Мы слабее обновляем веса, которые слишком часто обновляются, и для этого используем усреднённый по истории квадрат градиента.

#### Adam (Adaptive moment estimation)
Сочетает в себе и идею накопления движения, и идею более слабого обновления весов для типичных признаков.

In [None]:
# Module-level optimizer used by the optimizer-based batch_train defined in the
# next cell; it is re-created before each experiment further below.
optimizer = torch.optim.SGD(model_2.parameters(), lr=0.001, momentum=0.0)

In [None]:
def batch_train(model, x, y):
    """Perform one optimizer step on a single mini-batch.

    Uses the module-level ``loss_fn`` and ``optimizer``. The optimizer must
    have been built from the parameters of ``model``.

    Note: this definition replaces the earlier four-argument ``batch_train``,
    so ``train()`` must not be called after this cell has been executed.

    Args:
        model: torch module to evaluate and update.
        x: batch of inputs passed to ``model``.
        y: batch of targets passed to ``loss_fn``.

    Returns:
        The batch loss as a Python float, measured before the update.
    """
    batch_loss = loss_fn(model(x), y)

    # Clear stale gradients, back-propagate, then let the optimizer update.
    model.zero_grad()
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

In [None]:
# Compare optimizers: train model_2 from a fresh initialisation with plain SGD,
# SGD with momentum and Adam, recording the test-set loss after every epoch.
n_epochs = 100
batch_size = 1000

# (result key, factory that builds the optimizer for the given parameters)
optimizer_factories = [
    ('sgd', lambda params: torch.optim.SGD(params, lr=0.001, momentum=0.0)),
    ('sgd_moment', lambda params: torch.optim.SGD(params, lr=0.001, momentum=0.9)),
    ('adam', lambda params: torch.optim.Adam(params, lr=0.001)),
]

n_samples = x_train_norm.size(0)
test_loss_by_optimizer = {}

for optimizer_name, make_optimizer in optimizer_factories:
    # Every optimizer starts from newly initialised weights.
    model_2.apply(init_weights)
    # batch_train() reads the module-level name `optimizer`, so rebind it here.
    optimizer = make_optimizer(model_2.parameters())
    epoch_losses = []

    for epoch in range(n_epochs):
        permutation = torch.randperm(n_samples)

        for start in tqdm(range(0, n_samples, batch_size)):
            indices = permutation[start:start + batch_size]
            batch_train(model_2, x_train_norm[indices], y_train[indices])

        # Evaluation only, so no autograd graph is needed.
        with torch.no_grad():
            loss_test = loss_fn(model_2(x_test_norm), y_test).numpy()
        print(f'Epoch: {epoch} loss {loss_test}')
        epoch_losses.append(loss_test)

    test_loss_by_optimizer[optimizer_name] = np.asarray(epoch_losses, dtype=float)

# Names expected by the plotting cell below. Despite the "train" in the names,
# the curves hold the loss on the test set. The Adam curve previously started
# from `loss_train_all` (a leftover of an earlier experiment) instead of its
# own history, so it never contained the Adam results.
loss_train_sgd = test_loss_by_optimizer['sgd']
loss_train_sgd_moment = test_loss_by_optimizer['sgd_moment']
loss_train_adam = test_loss_by_optimizer['adam']

In [None]:
# Overlay the recorded loss curves of the three optimizers on one figure.
fig = plt.figure(figsize=(16, 4))

optimizer_curves = (
    (loss_train_sgd, 'SGD'),
    (loss_train_sgd_moment, 'SGD with momentum'),
    (loss_train_adam, 'Adam'),
)
for loss_curve, curve_label in optimizer_curves:
    plt.plot(loss_curve, label=curve_label)

plt.legend()
plt.show()

# Ссылки
- [Курс "Deep learning на пальцах", лекция 4](https://youtu.be/tnrbx7V9RbA)
- [Статья: Оптимизация градиентного спуска](http://ruder.io/optimizing-gradient-descent/)
- [Статья: Методы оптимизации нейронных сетей](https://habr.com/ru/post/318970/)