In [26]:
from itertools import product
import logging

import numpy as np
from numpy.typing import NDArray
import pandas as pd
import torch
from torch import nn
import torchmetrics
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm.notebook import tqdm as progress
import mlflow

from model import MNISTClassifier

### Конфигурация

In [28]:
# Пути и константы
TRAIN_CSV_PATH = 'data/train_data.csv'
TEST_CSV_PATH = 'data/test_data.csv'
VALIDATION_RATIO = .2
EPOCHS = 3

# Готовим сетку гиперпараметров
LEARNING_RATES = [.05, .1, .2]
BATCH_SIZES = [32, 64]
HP_GRID = list(product(BATCH_SIZES, LEARNING_RATES))

# Назначаем имя эксперименту в MLFlow
mlflow.set_experiment('MNIST')

logger = logging.getLogger("mlflow")
logger.setLevel(logging.ERROR)

%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


### Подготавливаем данные

In [29]:
def preprocess(train_path: str, test_path: str) -> tuple[NDArray, NDArray]:

  train_df = pd.read_csv(train_path, header=None, dtype=float)
  test_df = pd.read_csv(test_path, header=None, dtype=float)

  train_mean = train_df.values[:, 1:].mean()
  train_std = train_df.values[:, 1:].std()

  train_df.values[:, 1:] -= train_mean
  train_df.values[:, 1:] /= train_std
  test_df.values[:, 1:] -= train_mean
  test_df.values[:, 1:] /= train_std

  return train_df.values, test_df.values


class MNISTDatasetCsv(Dataset):
  def __init__(self, data: NDArray):
    self.data = torch.tensor(data[:, 1:]).float().reshape(data.shape[0], 1, 28, 28)
    self.labels = torch.tensor(data[:, 0]).long()

  def __len__(self) -> int:
    return self.labels.shape[0]

  def __getitem__(self, index) -> tuple[torch.Tensor, torch.Tensor]:
    return self.data[index], self.labels[index]

In [30]:
test_data, train_data = preprocess(
  train_path=TRAIN_CSV_PATH,
  test_path=TEST_CSV_PATH
)

# Создаем torch-датасет 
dataset = MNISTDatasetCsv(train_data)

# Делим тренировочную выборку на данные для обучения и для валидации 
train_size = int(VALIDATION_RATIO * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Подготавливаем тестовую выборку
X_test = torch.tensor(test_data[:, 1:]).float().reshape(test_data.shape[0], 1, 28, 28)
y_test = torch.tensor(test_data[:, 0]).long()

### Обучаем модельку

In [32]:
loss_fn = nn.CrossEntropyLoss()
metric_fn = torchmetrics.Accuracy(task='multiclass', num_classes=10)

# Цикл по сетке гиперпараметров
for batch_size, lr in progress(HP_GRID, total=len(HP_GRID)):

  # Стартуем новый "прогон" (run) эксперимента
  with mlflow.start_run():

    # Сохраняем гиперпараметры
    mlflow.log_params({'batch_size': batch_size, 'lr': lr})

    # Создаем соответствующие torch-даталоадеры
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    # Создаем модель и то, что нужно для ее обучения
    model = MNISTClassifier()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    train_step, val_step = 1, 1
    
    # Цикл по количеству эпох
    for _ in progress(range(EPOCHS), total=EPOCHS, desc='Epochs', leave=False):
      
      # Тренировочный цикл 
      model.train()
      for X, y in progress(train_dataloader, total=len(train_dataloader), desc='Training', leave=False):
        pred = model(X)
        loss = loss_fn(pred, y)
        accuracy = metric_fn(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        mlflow.log_metric(f'training/loss', loss, train_step)
        mlflow.log_metric(f'training/accuracy', accuracy, train_step)

        train_step += 1

      # Валидационный цикл
      with torch.no_grad():
        for X, y in progress(val_dataloader, total=len(val_dataloader), desc='Validation', leave=False):
          pred = model(X)
          loss = loss_fn(pred, y)
          accuracy = metric_fn(pred, y)
              
          mlflow.log_metric(f'validation/loss', loss, val_step)
          mlflow.log_metric(f'validation/accuracy', accuracy, val_step)

          val_step += 1

    # Тестирование обученной модели 
    model.eval()
    pred = model(X_test)
    loss = loss_fn(pred, y_test)
    accuracy = metric_fn(pred, y_test)
    mlflow.log_metric('loss', loss)
    mlflow.log_metric('accuracy', accuracy)
    mlflow.pytorch.log_model(
      model,
      'MNISTClassifier',
      input_example=np.zeros(shape=(1, 1, 28, 28), dtype=np.float32),
      code_paths=['model.py']
    )
    

  0%|          | 0/6 [00:00<?, ?it/s]

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Training:   0%|          | 0/63 [00:00<?, ?it/s]

Validation:   0%|          | 0/250 [00:00<?, ?it/s]

Training:   0%|          | 0/63 [00:00<?, ?it/s]

Validation:   0%|          | 0/250 [00:00<?, ?it/s]

Training:   0%|          | 0/63 [00:00<?, ?it/s]

Validation:   0%|          | 0/250 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Training:   0%|          | 0/63 [00:00<?, ?it/s]

Validation:   0%|          | 0/250 [00:00<?, ?it/s]

Training:   0%|          | 0/63 [00:00<?, ?it/s]

Validation:   0%|          | 0/250 [00:00<?, ?it/s]

Training:   0%|          | 0/63 [00:00<?, ?it/s]

Validation:   0%|          | 0/250 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Training:   0%|          | 0/63 [00:00<?, ?it/s]

Validation:   0%|          | 0/250 [00:00<?, ?it/s]

Training:   0%|          | 0/63 [00:00<?, ?it/s]

Validation:   0%|          | 0/250 [00:00<?, ?it/s]

Training:   0%|          | 0/63 [00:00<?, ?it/s]

Validation:   0%|          | 0/250 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Training:   0%|          | 0/32 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Training:   0%|          | 0/32 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Training:   0%|          | 0/32 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Training:   0%|          | 0/32 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Training:   0%|          | 0/32 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Training:   0%|          | 0/32 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Epochs:   0%|          | 0/3 [00:00<?, ?it/s]

Training:   0%|          | 0/32 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Training:   0%|          | 0/32 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Training:   0%|          | 0/32 [00:00<?, ?it/s]

Validation:   0%|          | 0/125 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]