# Импорты

In [1]:
import os
import random
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, Subset, random_split
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from tqdm.notebook import tqdm

import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10

import pytorch_lightning as pl
from pytorch_lightning.callbacks import TQDMProgressBar, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from torchmetrics import Accuracy, F1Score, MetricCollection

Зафиксируем random_state

In [2]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(42)

Random seed set as 42


# Загрузка данных

Скачаем датасет CIFAR10 в каталог data, разделив его сразу на Train и Test

In [3]:
cifar10_train = datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
cifar10_test = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms.ToTensor())

Files already downloaded and verified
Files already downloaded and verified


# Модель AlexNet и Dataloader

Создадим модель AlexNet через библиотеку pytorch-lithing

In [4]:
class AlexNetModel(pl.LightningModule):
    def __init__(self):
        super(AlexNetModel, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),  # 32x32 -> 32x32
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 32x32 -> 16x16

            nn.Conv2d(64, 192, kernel_size=3, padding=1),           # 16x16 -> 16x16
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),                  # 16x16 -> 8x8

            nn.Conv2d(192, 384, kernel_size=3, padding=1),          # 8x8 -> 8x8
            nn.ReLU(inplace=True),

            nn.Conv2d(384, 256, kernel_size=3, padding=1),          # 8x8 -> 8x8
            nn.ReLU(inplace=True),

            nn.Conv2d(256, 256, kernel_size=3, padding=1),          # 8x8 -> 8x8
            nn.ReLU(inplace=True),

            nn.MaxPool2d(kernel_size=2, stride=2)                   # 8x8 -> 4x4
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 4 * 4, 1024),  # Подгонка под 256*4*4 вместо 256*6*6
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 10)
        )

        self.metrics = MetricCollection([
            Accuracy(task='multiclass', num_classes=10),
            F1Score(task='multiclass', num_classes=10, average='weighted'),
        ])

        self.val_metrics = self.metrics.clone(prefix='val_')
        self.test_metrics = self.metrics.clone(prefix='test_')

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = F.cross_entropy(logits, y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = F.cross_entropy(logits, y)
        self.val_metrics.update(logits, y)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def on_validation_epoch_end(self):
        self.log_dict(self.val_metrics.compute(), prog_bar=True, on_epoch=True)
        self.val_metrics.reset()

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        self.test_metrics.update(logits, y)

    def on_test_epoch_end(self):
        self.log_dict(self.test_metrics.compute(), prog_bar=True, on_epoch=True)
        self.test_metrics.reset()

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=0.001)

Создадим DataModule для работы с CIFAR10

In [5]:
class CIFAR10DataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = "./data", batch_size: int = 128):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

    def prepare_data(self):
        CIFAR10(self.data_dir, train=True, download=True)
        CIFAR10(self.data_dir, train=False, download=True)

    def setup(self, stage: str):
        if stage == "fit":
              data_full = CIFAR10(self.data_dir, train=True, transform=self.transform)
              trainset, self.valset = random_split(data_full, [40000, 10000])
              train_indices = torch.randperm(len(trainset))[:10000]
              self.trainset = Subset(trainset, train_indices)

        if stage == "test":
            #self.testset = CIFAR10(self.data_dir, train=False, transform=self.transform)
            self.testset = CIFAR10(
                root=self.data_dir,
                train=False,
                download=True,
                transform=self.transform,
            )

    def train_dataloader(self):
        return DataLoader(self.trainset, batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(self.valset, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.testset, batch_size=self.batch_size)

Создадим экземпляры DataModule и AlexNetModel

In [6]:
datamodule = CIFAR10DataModule()
cnn_model = AlexNetModel()

# Обучение модели

Обучим модель в течении 25 эпох

In [7]:
trainer = pl.Trainer(
    max_epochs=25,
    log_every_n_steps=10,
    ##########
    #accelerator='gpu',
    #devices=[0],
    ##########
    fast_dev_run=False,
    # limit_train_batches=10,
    # limit_val_batches=5,
    ##########
    callbacks=[TQDMProgressBar(refresh_rate=10)],
    logger=TensorBoardLogger(save_dir='lightning_logs', name='cifar10', version='model_v0.1_simple_2_epochs')

)
trainer.fit(model=cnn_model, datamodule=datamodule)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Files already downloaded and verified
Files already downloaded and verified



  | Name         | Type             | Params | Mode 
----------------------------------------------------------
0 | features     | Sequential       | 2.3 M  | train
1 | classifier   | Sequential       | 4.7 M  | train
2 | metrics      | MetricCollection | 0      | train
3 | val_metrics  | MetricCollection | 0      | train
4 | test_metrics | MetricCollection | 0      | train
----------------------------------------------------------
7.0 M     Trainable params
0         Non-trainable params
7.0 M     Total params
27.907    Total estimated model params size (MB)
31        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=25` reached.


Оценим работу модели на тестовомов наборе данных

In [8]:
trainer.test(datamodule=datamodule)



Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


Restoring states from the checkpoint path at lightning_logs\cifar10\model_v0.1_simple_2_epochs\checkpoints\epoch=24-step=1975.ckpt
Loaded model weights from the checkpoint at lightning_logs\cifar10\model_v0.1_simple_2_epochs\checkpoints\epoch=24-step=1975.ckpt
d:\Documents\ITMO\Semestr_1\dl_course_2024\.venv\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
 test_MulticlassAccuracy    0.6197999715805054
 test_MulticlassF1Score      0.619731068611145
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_MulticlassAccuracy': 0.6197999715805054,
  'test_MulticlassF1Score': 0.619731068611145}]

# Оценка

Посмотрим на логи

In [9]:
%reload_ext tensorboard
%tensorboard --logdir lightning_logs/

По логам видно, что модель обучалась успешно. И с каждой эпохой ошибка уменьшалась, а метрики росли.