**Использование псевдоразметки. ДЗ.**

In [1]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import random
import numpy as np

In [2]:
torch.manual_seed(123)
torch.cuda.manual_seed(123)
np.random.seed(123)
random.seed(123)
torch.backends.cudnn.deterministic = True

Начнем с загрузки датасета. Речевые данные (и модели, обучаемые на них) очень тяжелые, поэтому мы обойдемся чем-нибудь попроще.

In [3]:
train_dataset = \
    datasets.MNIST('./data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
test_dataset = \
    datasets.MNIST('./data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [4]:
len(train_dataset), len(test_dataset)

(60000, 10000)

Итак, трейн состоит из 60000 картинок цифр. Для того, чтобы получше увидеть эффект от псевдолейблов, мы оставим только 100 этих картинок в качестве размеченных данных. Остальные 59900 будут в качестве неразмеченных. 

На масштабах 100 записей могут проявиться неприятные эффекты, если какие-то из классов не будут достаточно хорошо представлены. Чтобы этого избежать, будем аккуратно семплировать. Самый простой вариант - просто случайно разделять, пока не получится удачное разбиение.

Для начала определим удачность разбиения. Будем считать размеченный датасет хорошим, если из 100 примеров в нем есть хотя бы по 8 представителей каждого класса. Напишите функцию, которая делает такую проверку.

In [18]:
from collections import Counter

def check_dataset(dataset):
    labels = Counter([dataset[i][1] for i in range(len(dataset))])
    for value in labels.values():
        if value < 8:
            return False
    return True

In [19]:
sampling_iteration = 0
while True:
    labeled_train_dataset, unlabeled_train_dataset = torch.utils.data.random_split(train_dataset, [100, 59900])
    if check_dataset(labeled_train_dataset):
        break
    sampling_iteration += 1
print(f'Split the dataset after {sampling_iteration} resamplings')

Split the dataset after 32 resamplings


In [20]:
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=64, shuffle=False)
labeled_train_loader = torch.utils.data.DataLoader(
    labeled_train_dataset, batch_size=64, shuffle=True)
unlabeled_train_loader = torch.utils.data.DataLoader(
    unlabeled_train_dataset, batch_size=64, shuffle=False)

Теперь, когда мы получили данные, определим архитектуру сети. Возьмем простую сверточную сетку с droupout'ом.

In [22]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, kernel_size=5)
        self.conv2 = nn.Conv2d(20, 40, kernel_size=5)
        self.dropout = nn.Dropout2d(p=0.5)
        self.fc1 = nn.Linear(640, 150)
        self.fc2 = nn.Linear(150, 10)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = x.view(-1, 1, 28, 28)
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.dropout(self.conv2(x)), 2))
        x = x.view(-1, 640)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.log_softmax(x)
        return x

Опишем вспомогательные функции.

In [23]:
def train(epoch_idx, model, optimizer, train_loader, loss_func=F.nll_loss):
    model.train()
    for batch_idx, (x, target) in enumerate(train_loader):
        x, target = x.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(x)
        loss = loss_func(output, target)
        loss.backward()
        optimizer.step()

In [24]:
def test(epoch_idx, model, test_loader):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for x, target in test_loader:
            x, target = x.cuda(), target.cuda()
            output = model(x)
            test_loss += F.nll_loss(output, target, size_average=False).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).long().cpu().sum()

    test_loss /= len(test_loader.dataset)
    print('Epoch {}: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        epoch_idx, test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [25]:
def predict(model, loader):
    model.eval()
    result = []
    with torch.no_grad():
        for x, _ in loader:
            result.append(model(x.cuda()))
    return torch.cat(result)

Создадим модель и обучим ее на нашем размеченном датасете.

In [26]:
model = Net().cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [27]:
for i in range(400):
    train(i, model, optimizer, labeled_train_loader)
    if i % 10 == 0:
        test(i, model, test_loader)



Epoch 0: Average loss: 2.2969, Accuracy: 985/10000 (10%)
Epoch 10: Average loss: 1.7321, Accuracy: 5573/10000 (56%)
Epoch 20: Average loss: 0.7752, Accuracy: 7474/10000 (75%)
Epoch 30: Average loss: 0.5837, Accuracy: 8025/10000 (80%)
Epoch 40: Average loss: 0.7819, Accuracy: 7384/10000 (74%)
Epoch 50: Average loss: 0.5545, Accuracy: 8304/10000 (83%)
Epoch 60: Average loss: 0.5442, Accuracy: 8506/10000 (85%)
Epoch 70: Average loss: 0.5475, Accuracy: 8538/10000 (85%)
Epoch 80: Average loss: 0.5823, Accuracy: 8570/10000 (86%)
Epoch 90: Average loss: 0.6238, Accuracy: 8607/10000 (86%)
Epoch 100: Average loss: 0.6114, Accuracy: 8562/10000 (86%)
Epoch 110: Average loss: 0.6757, Accuracy: 8571/10000 (86%)
Epoch 120: Average loss: 0.6722, Accuracy: 8563/10000 (86%)
Epoch 130: Average loss: 0.6936, Accuracy: 8601/10000 (86%)
Epoch 140: Average loss: 0.7402, Accuracy: 8525/10000 (85%)
Epoch 150: Average loss: 0.6707, Accuracy: 8667/10000 (87%)
Epoch 160: Average loss: 0.6714, Accuracy: 8514/1000

Теперь попробуем побить этот результат с помощью псевдолейблов. Напишем функцию, которая принимает модель и возращает DataLoader с хард-лейблами, и запустим обучение.

In [28]:
from tqdm import tqdm

In [29]:
def get_pseudo_loader(model):
    dataset = list(unlabeled_train_dataset)
    model.eval()
    for i, (x, _) in tqdm(enumerate(dataset)):
        logit = model(x.cuda())
        dataset[i] = (x, logit.argmax().item())
        
    return torch.utils.data.DataLoader(
        dataset, batch_size=64, shuffle=True)

In [30]:
model_hard = Net().cuda()
model_hard.load_state_dict(model.state_dict())
optimizer_hard = torch.optim.SGD(model_hard.parameters(), lr=0.1)

In [31]:
hard_labeled_loader = get_pseudo_loader(model)
for i in range(10):
    train(i, model_hard, optimizer_hard, hard_labeled_loader)
    train(i, model_hard, optimizer_hard, labeled_train_loader)
    test(i, model_hard, test_loader)

59900it [00:35, 1667.42it/s]


Epoch 0: Average loss: 0.5402, Accuracy: 8588/10000 (86%)
Epoch 1: Average loss: 0.6602, Accuracy: 8535/10000 (85%)
Epoch 2: Average loss: 0.6212, Accuracy: 8631/10000 (86%)
Epoch 3: Average loss: 0.5993, Accuracy: 8639/10000 (86%)
Epoch 4: Average loss: 0.6269, Accuracy: 8620/10000 (86%)
Epoch 5: Average loss: 0.5706, Accuracy: 8662/10000 (87%)
Epoch 6: Average loss: 0.6369, Accuracy: 8590/10000 (86%)
Epoch 7: Average loss: 0.6309, Accuracy: 8585/10000 (86%)
Epoch 8: Average loss: 0.7123, Accuracy: 8574/10000 (86%)
Epoch 9: Average loss: 0.6989, Accuracy: 8542/10000 (85%)


**Итеративная псевдоразметка.**

Мы уже видим небольшое улучшение, но можно пойти дальше.

In [32]:
model_hard_iter = Net().cuda()
model_hard_iter.load_state_dict(model.state_dict())
optimizer_hard_iter = torch.optim.SGD(model_hard_iter.parameters(), lr=0.1)

In [33]:
for i in range(20):
    hard_labeled_loader = get_pseudo_loader(model_hard_iter)
    train(i, model_hard_iter, optimizer_hard_iter, hard_labeled_loader)
    train(i, model_hard_iter, optimizer_hard_iter, labeled_train_loader)
    test(i, model_hard_iter, test_loader)

59900it [00:36, 1659.29it/s]


Epoch 0: Average loss: 0.4662, Accuracy: 8688/10000 (87%)


59900it [00:36, 1637.39it/s]


Epoch 1: Average loss: 0.4809, Accuracy: 8796/10000 (88%)


59900it [00:36, 1642.88it/s]


Epoch 2: Average loss: 0.5236, Accuracy: 8875/10000 (89%)


59900it [00:35, 1669.64it/s]


Epoch 3: Average loss: 0.5514, Accuracy: 8921/10000 (89%)


59900it [00:36, 1647.28it/s]


Epoch 4: Average loss: 0.5144, Accuracy: 8902/10000 (89%)


59900it [00:36, 1647.55it/s]


Epoch 5: Average loss: 0.4742, Accuracy: 9045/10000 (90%)


59900it [00:36, 1645.90it/s]


Epoch 6: Average loss: 0.4765, Accuracy: 9057/10000 (91%)


59900it [00:36, 1641.08it/s]


Epoch 7: Average loss: 0.4821, Accuracy: 9091/10000 (91%)


59900it [00:36, 1659.04it/s]


Epoch 8: Average loss: 0.4470, Accuracy: 9131/10000 (91%)


59900it [00:36, 1658.25it/s]


Epoch 9: Average loss: 0.4813, Accuracy: 9106/10000 (91%)


59900it [00:36, 1633.27it/s]


Epoch 10: Average loss: 0.4224, Accuracy: 9170/10000 (92%)


59900it [00:36, 1651.53it/s]


Epoch 11: Average loss: 0.4229, Accuracy: 9184/10000 (92%)


59900it [00:35, 1667.04it/s]


Epoch 12: Average loss: 0.4177, Accuracy: 9215/10000 (92%)


59900it [00:36, 1643.95it/s]


Epoch 13: Average loss: 0.4284, Accuracy: 9254/10000 (93%)


59900it [00:39, 1522.86it/s]


Epoch 14: Average loss: 0.4634, Accuracy: 9253/10000 (93%)


59900it [00:37, 1583.54it/s]


Epoch 15: Average loss: 0.3997, Accuracy: 9265/10000 (93%)


59900it [00:37, 1615.88it/s]


Epoch 16: Average loss: 0.4365, Accuracy: 9257/10000 (93%)


59900it [00:36, 1657.76it/s]


Epoch 17: Average loss: 0.4220, Accuracy: 9307/10000 (93%)


59900it [00:37, 1611.89it/s]


Epoch 18: Average loss: 0.4229, Accuracy: 9249/10000 (92%)


59900it [00:36, 1633.66it/s]


Epoch 19: Average loss: 0.4403, Accuracy: 9262/10000 (93%)


**Оценивание.**

В предыдущем пункте нужно получить accuracy 91% или выше (5 баллов).

Следующие шаги:

Модифицировать функцию `get_pseudo_loader`, чтобы она могла возвращать софт-лейблы (+1 балл).

Правильно запустить обучение - в качестве лосса используем KL-дивергенцию. Получить accuracy 90% или выше. (+3 балла).

Интуитивно кажется, что модель не должна ничему учиться, т.к. ее выход будет полностью совпадать с софт-лейблами. Напишите (текстом), почему тем не менее удается сильно выиграть относительно бейзлайна. (+1 балл).

In [34]:
model_soft_iter = Net().cuda()
model_soft_iter.load_state_dict(model.state_dict())
optimizer_soft_iter = torch.optim.SGD(model_soft_iter.parameters(), lr=0.1)

In [35]:
def get_pseudo_loader(model, soft_label=False):
    dataset = list(unlabeled_train_dataset)
    model.eval()
    with torch.no_grad():
        if soft_label:
            for i, (x, _) in tqdm(enumerate(dataset)):
                logit = model(x.cuda())
                dataset[i] = (x, logit.squeeze())
        else:
            for i, (x, _) in tqdm(enumerate(dataset)):
                logit = model(x.cuda())
                dataset[i] = (x, logit.argmax().item())
                
    return torch.utils.data.DataLoader(
        dataset, batch_size=64, shuffle=True)

In [37]:
loss_fn = torch.nn.KLDivLoss(reduction='batchmean', log_target=True)

In [None]:
for i in range(10):
    soft_labeled_loader = get_pseudo_loader(model_soft_iter, soft_label=True)
    train(i, model_soft_iter, optimizer_soft_iter, soft_labeled_loader, loss_func=loss_fn)
    train(i, model_soft_iter, optimizer_soft_iter, labeled_train_loader)
    test(i, model_soft_iter, test_loader)

59900it [00:30, 1948.99it/s]


Epoch 0: Average loss: 0.5024, Accuracy: 8624/10000 (86%)


59900it [00:29, 2036.23it/s]


Epoch 1: Average loss: 0.4351, Accuracy: 8752/10000 (88%)


59900it [00:29, 2033.61it/s]


Epoch 2: Average loss: 0.3803, Accuracy: 8830/10000 (88%)


59900it [00:29, 2019.65it/s]


Epoch 3: Average loss: 0.3680, Accuracy: 8857/10000 (89%)


59900it [00:29, 2005.35it/s]


Epoch 4: Average loss: 0.3660, Accuracy: 8869/10000 (89%)


59900it [00:30, 1995.08it/s]


Epoch 5: Average loss: 0.3604, Accuracy: 8919/10000 (89%)


59900it [00:29, 2018.07it/s]


Epoch 6: Average loss: 0.3469, Accuracy: 8969/10000 (90%)


59900it [00:30, 1972.83it/s]


Epoch 7: Average loss: 0.3278, Accuracy: 9046/10000 (90%)


59900it [00:29, 2020.02it/s]


Epoch 8: Average loss: 0.3248, Accuracy: 9080/10000 (91%)
