# Курсовая работа "Активное обучение"

In [2]:
import numpy as np
import tqdm
import tqdm.notebook
import random
import copy
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F

import torchvision
from torchvision import models, datasets, transforms

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# 1. Подготовка

### 1.1 Выбор датасета
В качестве датасета был выбран CIFAR-10

In [4]:
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

In [5]:
df = torchvision.datasets.CIFAR10('./', download=True, train=True, transform=transform)

Files already downloaded and verified


In [6]:
df

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: ./
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
           )

In [7]:
df_test = torchvision.datasets.CIFAR10('./', download=True, train=False, transform=transform)

Files already downloaded and verified


In [8]:
df_test

Dataset CIFAR10
    Number of datapoints: 10000
    Root location: ./
    Split: Test
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
           )

In [9]:
train_loader = DataLoader(df, batch_size=128, shuffle=True, num_workers=2)
test_loader = DataLoader(df_test, batch_size=128, shuffle=False, num_workers=2)

### 1.2 Выбор базовой модели

Будем использовать EfficientNet-b0

In [10]:
def init_model():
    model = torchvision.models.efficientnet_b0(weights='DEFAULT')
    model.classifier = nn.Linear(model.classifier[1].in_features, 10)
    model = model.to(device)
    return model

### 1.3 Пайплайн обучения

In [11]:
def train_model(model, train_loader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    model.train()

    for epoch in tqdm.notebook.tqdm(range(epochs)):
        running_loss = 0
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            output = model(images)
            loss = criterion(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f'epoch: {epoch + 1}, loss: {running_loss / len(train_loader):.3f}')

    return model

In [12]:
def test_model(model, test_loader):
    model.eval()
    preds_full = []
    labels_full = []

    with torch.no_grad():
        for images, labels in tqdm.notebook.tqdm(test_loader):
            images = images.to(device)
            labels = labels.to(device)

            output = model(images)
            _, preds = torch.max(output, 1)

            preds_full.extend(preds.cpu().numpy())
            labels_full.extend(labels.cpu().numpy())

    return f1_score(preds_full, labels_full, average='weighted')


### 1.4 Обучение базовой модели

Без активного обучения

In [22]:
model_full = init_model()
model_full = train_model(model_full, train_loader, epochs=10)

  0%|          | 0/10 [00:00<?, ?it/s]

epoch: 1, loss: 0.998
epoch: 2, loss: 0.588
epoch: 3, loss: 0.453
epoch: 4, loss: 0.367
epoch: 5, loss: 0.307
epoch: 6, loss: 0.269
epoch: 7, loss: 0.229
epoch: 8, loss: 0.199
epoch: 9, loss: 0.177
epoch: 10, loss: 0.153


In [23]:
model_full_score = test_model(model_full, test_loader)
model_full_score

  0%|          | 0/79 [00:00<?, ?it/s]

0.8400574904505016

Скор базовой модели без активного обучения = **0.840**

### 1.5 Обучение модели на подвыборке

Создание подвыборки

In [24]:
def create_truncated_df(df, fraction):
    indices = random.sample(range(len(df)), int(fraction * len(df)))
    return Subset(df, indices)

In [25]:
df_001 = create_truncated_df(df, 0.01)
df_01 = create_truncated_df(df, 0.1)
df_02 = create_truncated_df(df, 0.2)

In [26]:
len(df), len(df_001), len(df_01), len(df_02)

(50000, 500, 5000, 10000)

In [27]:
train_loader_001 = DataLoader(df_001, batch_size=128, shuffle=True, num_workers=2)
train_loader_01 = DataLoader(df_01, batch_size=128, shuffle=True, num_workers=2)
train_loader_02 = DataLoader(df_02, batch_size=128, shuffle=True, num_workers=2)

Обучим модели на подвыборках размером 1/10/20 процентов от исходного датасета.

Валидироать каждую модель будем на **всем тестовом датасете** (поэтому не обязательно прогонять модель 5 раз)

Модель на 1% данных без активного обучения

In [28]:
model_001 = init_model()
model_001 = train_model(model_001, train_loader_001, epochs=10)

  0%|          | 0/10 [00:00<?, ?it/s]

epoch: 1, loss: 2.316
epoch: 2, loss: 1.743
epoch: 3, loss: 1.257
epoch: 4, loss: 0.880
epoch: 5, loss: 0.573
epoch: 6, loss: 0.349
epoch: 7, loss: 0.216
epoch: 8, loss: 0.124
epoch: 9, loss: 0.101
epoch: 10, loss: 0.063


In [29]:
model_001_score = test_model(model_001, test_loader)
model_001_score

  0%|          | 0/79 [00:00<?, ?it/s]

0.4404503720809933

Скор модели на 1% без активного обучения = **0.440**

Модель на 10% данных без активного обучения

In [30]:
model_01 = init_model()
model_01 = train_model(model_01, train_loader_01, epochs=10)

  0%|          | 0/10 [00:00<?, ?it/s]

epoch: 1, loss: 1.808
epoch: 2, loss: 1.139
epoch: 3, loss: 0.834
epoch: 4, loss: 0.633
epoch: 5, loss: 0.566
epoch: 6, loss: 0.440
epoch: 7, loss: 0.333
epoch: 8, loss: 0.360
epoch: 9, loss: 0.315
epoch: 10, loss: 0.311


In [31]:
model_01_score = test_model(model_01, test_loader)
model_01_score

  0%|          | 0/79 [00:00<?, ?it/s]

0.6675478028743642

Скор модели на 10% без активного обучения = **0.668**

Модель на 20% данных без активного обучения

In [32]:
model_02 = init_model()
model_02 = train_model(model_02, train_loader_02, epochs=10)

  0%|          | 0/10 [00:00<?, ?it/s]

epoch: 1, loss: 1.504
epoch: 2, loss: 0.904
epoch: 3, loss: 0.643
epoch: 4, loss: 0.469
epoch: 5, loss: 0.369
epoch: 6, loss: 0.294
epoch: 7, loss: 0.256
epoch: 8, loss: 0.307
epoch: 9, loss: 0.310
epoch: 10, loss: 0.213


In [33]:
model_02_score = test_model(model_02, test_loader)
model_02_score

  0%|          | 0/79 [00:00<?, ?it/s]

0.743301245778547

Скор модели на 20% без активного обучения = **0.743**

# 2. Активное обучение
### 2.1 Пайплайн для активного обучения

Инициализация данных для активного обучения - берем рандомные fraction процентов

In [13]:
def init_labelling(dataset, fraction):
    random.seed(0xDEAD)
    labeled_size = int(len(dataset) * fraction)
    labeled_ids = random.sample(range(len(dataset)), labeled_size)
    unlabeled_ids = list(set(range(len(dataset))) - set(labeled_ids))

    return labeled_ids, unlabeled_ids

Пайплайн для обучения

In [14]:
def active_learning_pipeline(dataset, fraction, test_loader, al_method, al_iters_cnt=5):
    labeled_ids, unlabeled_ids = init_labelling(dataset, fraction)
    labeled_df = Subset(dataset, labeled_ids)
    unlabeled_df = Subset(dataset, unlabeled_ids)
    labeled_loader = DataLoader(labeled_df, batch_size=128, shuffle=True, num_workers=2)
    unlabeled_loader = DataLoader(unlabeled_df, batch_size=128, shuffle=True, num_workers=2)

    # train on labeled data
    model = init_model()
    model = train_model(model, labeled_loader, 2)

    for i in range(al_iters_cnt):
        print(f'Active learning: iter {i + 1}')
        # get new data with active learning algorithm
        al_indices = al_method(model, unlabeled_loader, int(len(dataset) * 0.01))
        al_data_indices = [unlabeled_ids[i] for i in al_indices]

        # add new data to training dataset
        labeled_ids.extend(al_data_indices)
        unlabeled_ids = list(set(unlabeled_ids) - set(al_data_indices))
        labeled_df = Subset(dataset, labeled_ids)
        labeled_loader = DataLoader(labeled_df, batch_size=128, shuffle=True)
        unlabeled_df = Subset(dataset, unlabeled_ids)
        unlabeled_loader = DataLoader(unlabeled_df, batch_size=128, shuffle=True)

        # update model
        model = train_model(model, labeled_loader, epochs=2)

    score = test_model(model, test_loader)
    model = model.cpu()
    return model, score



### 2.2 Методы для активного обучения

#### Алгоритм Least Confidence (LC)

*Алгоритм Least Confidence (LC)* - это метод активного обучения, который выбирает сэмплы для разметки данных на основе уверенности модели в своих предсказаниях. Предположим, у нас есть обученная модель, и мы хотим выбрать несколько сэмплов из неразмеченного набора данных для разметки. Мы будем использовать модель для предсказания вероятностей классов и выбирать те сэмплы, для которых модель имеет наименьшую уверенность.

In [15]:
def least_confidence_selection(model, unlabeled_data, n_samples):
    """
    Выбирает n_samples из unlabeled_data на основе алгоритма Least Confidence.

    :param model: Обученная модель PyTorch
    :param unlabeled_data: Набор данных без меток (например, DataLoader)
    :param n_samples: Количество образцов для выбора
    :return: Индексы выбранных образцов
    """
    model.eval()
    confidences = []
    with torch.no_grad():
        for inputs, _ in unlabeled_data:
            inputs = inputs.to(device)
            outputs = model(inputs)
            probabilities = F.softmax(outputs, dim=1)
            least_confidence = 1 - torch.max(probabilities, dim=1)[0]
            confidences.append(least_confidence)
    confidences = torch.cat(confidences)
    selected_indices = torch.argsort(confidences)[:n_samples]
    return selected_indices.tolist()

#### Алгоритм Coreset

*Алгоритм Coreset (или "координационный набор")* — это метод активного обучения, который выбирает подмножество данных, представляющее весь набор данных, чтобы минимизировать ошибку модели.

In [16]:
def coreset_selection(model, unlabeled_data, n_samples):
    """
    Выбирает n_samples из unlabeled_data на основе алгоритма Coreset.

    :param model: Обученная модель PyTorch
    :param unlabeled_data: Набор данных без меток (например, DataLoader)
    :param n_samples: Количество образцов для выбора
    :return: Индексы выбранных образцов
    """
    model.eval()
    embeddings = []
    labels = []
    with torch.no_grad():
        for inputs, _ in unlabeled_data:
            inputs = inputs.to(device)
            outputs = model(inputs)
            probabilities = F.softmax(outputs, dim=1)
            embeddings.append(outputs)
            labels.append(probabilities)
    embeddings = torch.cat(embeddings).cpu()
    labels = torch.cat(labels).cpu()
    distances = torch.cdist(embeddings, embeddings)
    selected_indices = []
    for _ in range(n_samples):
        if len(selected_indices) == 0:
            idx = np.random.choice(len(embeddings))
        else:
            remaining_indices = list(set(range(len(embeddings))) - set(selected_indices))
            distances_to_selected = distances[remaining_indices][:, selected_indices].min(axis=1)[0]
            idx = remaining_indices[distances_to_selected.argmax()]
        selected_indices.append(idx)
    return selected_indices

#### Алгоритм Maximum Normalized Log-Probability (MNLP)

*Алгоритм Maximum Normalized Log-Probability (MNLP)* — это метод активного обучения, который выбирает образцы для аннотирования на основе нормализованной логарифмической вероятности предсказаний модели.

In [17]:
def mnlp_selection(model, unlabeled_data, n_samples):
    """
    Выбирает n_samples из unlabeled_data на основе алгоритма Maximum Normalized Log-Probability (MNLP).

    :param model: Обученная модель PyTorch
    :param unlabeled_data: Набор данных без меток (например, DataLoader)
    :param n_samples: Количество образцов для выбора
    :return: Индексы выбранных образцов
    """
    model.eval()
    log_probabilities = []
    with torch.no_grad():
        for inputs, _ in unlabeled_data:
            inputs = inputs.to(device)
            outputs = model(inputs)
            probabilities = F.softmax(outputs, dim=1)
            log_probs = torch.log(probabilities + 1e-10)
            log_probabilities.append(log_probs)

    log_probabilities = torch.cat(log_probabilities)
    normalized_log_probs = log_probabilities - log_probabilities.max(dim=1, keepdim=True)[0]
    max_normalized_log_probs = normalized_log_probs.max(dim=1)[0]
    selected_indices = torch.argsort(max_normalized_log_probs, descending=True)[:n_samples]
    return selected_indices.tolist()

### 2.3 Активное обучение моделей

In [18]:
torch.cuda.empty_cache()

In [19]:
al_methods = [
    # least_confidence_selection,
    coreset_selection,
    # mnlp_selection,
]

In [20]:
fractions = [0.01, 0.1] # 0.2]

In [21]:
models = dict()  # models[fraction][method] = model
models['0.01'] = {}
models['0.1'] = {}
models['0.2'] = {}

In [22]:
scores = dict()  # scores[fraction][method] = score
scores['0.01'] = {}
scores['0.1'] = {}
scores['0.2'] = {}

In [192]:
"""
models['0.01']['full_data'] = model_full
models['0.1']['full_data'] = model_full
models['0.2']['full_data'] = model_full

models['0.01']['no_active_learn'] = model_001
models['0.1']['no_active_learn'] = model_01
models['0.2']['no_active_learn'] = model_02
"""

In [193]:
"""
scores['0.01']['full_data'] = model_full_score
scores['0.1']['full_data'] = model_full_score
scores['0.2']['full_data'] = model_full_score

scores['0.01']['no_active_learn'] = model_001_score
scores['0.1']['no_active_learn'] = model_01_score
scores['0.2']['no_active_learn'] = model_02_score
"""

In [None]:
for fraction in fractions:
    for method in al_methods:
        print(f'Method: {method.__name__}, fraction: {fraction}')
        model, score = active_learning_pipeline(
            dataset=df,
            fraction=fraction,
            test_loader=test_loader,
            al_method=method,
            al_iters_cnt=2,
        )

        # models[str(fraction)][method.__name__] = model.cpu()
        scores[str(fraction)][method.__name__] = score

Method: coreset_selection, fraction: 0.01


  0%|          | 0/2 [00:00<?, ?it/s]

epoch: 1, loss: 2.346
epoch: 2, loss: 1.712
Active learning: iter 1


# 3. Сравнение результатов

In [None]:
plt.figure(figsize=(15, 8))

