In [1]:
import torch

print(torch.__version__)
print(torch.cuda.is_available())

torch.set_float32_matmul_precision('medium') ## 'high'

2.0.1
True


# torchvision dataset

In [2]:
from torchvision import datasets
from torchvision import transforms
from torch.utils.data import random_split, DataLoader

data_dir = "/home/pervinco/Datasets/MNIST"
transform = transforms.ToTensor()
train_dataset = datasets.MNIST(root=data_dir, download=True, train=True, transform=transform)
test_dataset = datasets.MNIST(root=data_dir, download=True, train=False, transform=transform)

In [3]:
train_size = int(len(train_dataset) * 0.8)
valid_size = len(train_dataset) - train_size

seed = torch.Generator().manual_seed(42)
train_dataset, valid_dataset = random_split(train_dataset, [train_size, valid_size], generator=seed)

In [4]:
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=32)
valid_dataloader = DataLoader(valid_dataset, batch_size=128, shuffle=False, num_workers=32)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, num_workers=32)

# Custom Dataset

```LightningDataModule``` 클래스를 상속받아서 사용자 데이터셋 클래스를 정의한다.

- prepare_data 메서드에서는 데이터를 다운로드 하는 등의 준비를 명시.
- setup은 데이터를 로드하고 처리하는 과정을 정의.
    - count number of classes
    - build vocabulary
    - perform train/val/test splits
    - create datasets
    - apply transforms (defined explicitly in your datamodule)

In [5]:
import torch
from torchvision import datasets
from torchvision import transforms
from pytorch_lightning import LightningDataModule
from torch.utils.data import random_split, DataLoader

class DataModule(LightningDataModule):
    def __init__(self, 
                 data_dir="path/to/dir", 
                 batch_size=32, 
                 num_workers=4,
                 transform=transforms.ToTensor()):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.transform = transform

    def prepare_data(self):
        ## 데이터 다운로드(준비)
        datasets.MNIST(root=self.data_dir, download=True, train=True, transform=None)
        datasets.MNIST(root=self.data_dir, download=True, train=False, transform=None)

    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            full_train_dataset = datasets.MNIST(root=self.data_dir, download=True, train=True, transform=self.transform)
            self.train_dataset, self.valid_dataset = random_split(full_train_dataset, [55000, 5000], generator=torch.Generator().manual_seed(42))

        if stage == 'test' or stage == 'predict':
            self.test_dataset = datasets.MNIST(root=self.data_dir, download=True, train=False, transform=self.transform)
        
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.valid_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, num_workers=self.num_workers, shuffle=False)


In [6]:
data_module = DataModule(data_dir=data_dir, batch_size=32, transform=transforms.ToTensor())

# 모델 정의

모델을 정의할 때는 pytorch_lightning의 ```LightningModule```을 상속받도록 한다.  
클래스에는 다음과 같은 메서드들이 필수적으로 구현되어야 한다.
- ```__init__```
- forward
- configure_optimizers : 최적화 알고리즘을 명시하며, 필요에 따라 스케쥴러도 적용 가능.
- training_step : 모델 학습으로, validation_step, test_step도 동일한 방식으로 정의할 수 있다.

In [7]:
from torch import nn
from torchmetrics import Accuracy
from pytorch_lightning import LightningModule

class CNN(LightningModule):
    def __init__(self, num_classes, learning_rate, dropout_ratio, use_shceduler):
        super().__init__()
        self.num_classes = num_classes
        self.learning_rate = learning_rate
        self.dropout_ratio = dropout_ratio
        self.use_scheduler = use_shceduler
        self.save_hyperparameters()

        self.criterion = nn.CrossEntropyLoss()
        self.accuracy = Accuracy(task="multiclass", num_classes=num_classes)

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=5)  # [BATCH_SIZE, 1, 28, 28] -> [BATCH_SIZE, 16, 24, 24]
        self.relu1 = nn.ReLU()
        
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5) # [BATCH_SIZE, 16, 24, 24] -> [BATCH_SIZE, 32, 20, 20]
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=2) # [BATCH_SIZE, 32, 20, 20] -> [BATCH_SIZE, 32, 10, 10]
        self.dropout2 = nn.Dropout(dropout_ratio)
        
        self.conv3 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5) # [BATCH_SIZE, 32, 10, 10] -> [BATCH_SIZE, 64, 6, 6]
        self.relu3 = nn.ReLU()
        self.pool3 = nn.MaxPool2d(kernel_size=2) # 크기를 1/2로 줄입니다. [BATCH_SIZE, 64, 6, 6] -> [BATCH_SIZE, 64, 3, 3]
        self.dropout3 = nn.Dropout(dropout_ratio)

        self.output = nn.Linear(64 * 3 * 3, self.num_classes)
    

    def forward(self, x):
        x = self.conv1(x)
        x = self.relu1(x)

        x = self.conv2(x)
        x = self.relu2(x)
        x = self.pool2(x)
        x = self.dropout2(x)

        x = self.conv3(x)
        x = self.relu3(x)
        x = self.pool3(x)
        x = self.dropout3(x)

        x = x.view(x.size(0), -1)
        x = self.output(x)

        return x
    

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)

        if self.use_scheduler:
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=10, factor=0.1, verbose=True)
            return [optimizer], [scheduler]
        else:
            return optimizer
        

    def training_step(self, batch, batch_idx):
        ## model.train()을 생략.
        x, y = batch ## to(device)를 생략.
        y_pred = self(x) ## outputs = model(images)

        loss = self.criterion(y_pred, y)
        acc = self.accuracy(y_pred, y)
        ## loss.backward(), optimizer.step()은 생략한다. 라이트닝이 자동으로 수행.

        self.log("train_loss", loss, on_step=False, on_epoch=True, logger=True)
        self.log("train_acc", acc, on_step=False, on_epoch=True, logger=True)

        return loss
    

    def validation_step(self, batch, batch_idx):
        ## valid, test에서 사용하던 model.eval()과 torch.no_grad()를 생략한다. 자동으로 수행함.
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)
        
        _, preds = torch.max(y_pred, dim=1) ## [batch_size, num_classes]. num_classes 중 최고 확률 하나 선택.
        acc = self.accuracy(preds, y)

        self.log("valid_loss", loss, on_step=False, on_epoch=True, logger=True)
        self.log("valid_acc", acc, on_step=False, on_epoch=True, logger=True)


    def test_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.criterion(y_pred, y)
        
        _, preds = torch.max(y_pred, dim=1) ## [batch_size, num_classes]. num_classes 중 최고 확률 하나 선택.
        acc = self.accuracy(preds, y)

        self.log("test_loss", loss, on_step=False, on_epoch=True, logger=True)
        self.log("test_acc", acc, on_step=False, on_epoch=True, logger=True)


    def predict_step(self, batch, batch_idx):
        x, _ = batch
        predictions = self(x)
        _, preds = torch.max(predictions, dim=1)

        return preds


    def weight_initialization(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.zeros_(m.bias)


    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

In [8]:
## 모델 인스턴스 생성
model = CNN(num_classes=10, learning_rate=0.01, dropout_ratio=0.2, use_shceduler=False)

# Trainer

[https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html#trainer](https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.trainer.trainer.Trainer.html#trainer)  

- Trainer 클래스의 인스턴스를 생성한다. 여기에는 epoch, gpu 사용, 콜백 기능, 로깅 기능 등을 명시한다.
- trainer 인스턴스에서 fit 함수를 이용해 학습을 수행. model, train_loader, valid_loader 를 반영한다.

In [9]:
from pytorch_lightning.trainer import Trainer
from pytorch_lightning.loggers.csv_logs import CSVLogger
from pytorch_lightning.loggers.tensorboard import TensorBoardLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

save_dir = "./runs"
csv_logger = CSVLogger(save_dir=f"{save_dir}/logs", name="train_csv")
tb_logger = TensorBoardLogger(save_dir=f"{save_dir}/logs", name="train_tb")

early_stop_callback = EarlyStopping(monitor="valid_loss", mode='min')
save_ckpt_callback = ModelCheckpoint(
    dirpath=f"{save_dir}/weights",
    monitor="valid_loss",
    mode="min",
    filename="{epoch}-{valid_loss:.2f}",  # 모델 체크포인트 파일 이름 설정
    save_last=True,
    save_weights_only=True,
    verbose=True,
    save_top_k=3  # 가장 좋은 3개의 모델만 저장
)


trainer = Trainer(
    max_epochs=100,
    accelerator="gpu", ## 또는 auto로 설정하면 알아서 선택함.
    callbacks=[early_stop_callback, save_ckpt_callback],
    logger=[csv_logger, tb_logger],
    default_root_dir="./runs" ## 저장경로
)

# trainer.fit(
#     model, 
#     train_dataloader, 
#     valid_dataloader,
#     ckpt_path=None, ## resume할 가중치 파일 경로
# )

trainer.fit(model, datamodule=data_module)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name      | Type               | Params
--------------------------------------------------
0  | criterion | CrossEntropyLoss   | 0     
1  | accuracy  | MulticlassAccuracy | 0     
2  | conv1     | Conv2d             | 416   
3  | relu1     | ReLU               | 0     
4  | conv2     | Conv2d             | 12.8 K
5  | relu2     | ReLU               | 0     
6  | pool2     | MaxPool2d          | 0     
7  | dropout2  | Dropout            | 0     
8  | conv3     | Conv2d             | 51.3 K
9  | relu3     | ReLU               | 0     
10 | pool3     | MaxPool2d          | 0     
11 | dropout3  | Dropout            | 0     
12 | output    | Linear             | 5.8 K 
-----------------------------------------------

Epoch 0: 100%|██████████| 1719/1719 [00:05<00:00, 288.35it/s, v_num=0]     

Epoch 0, global step 1719: 'valid_loss' reached 0.06565 (best 0.06565), saving model to './runs/weights/epoch=0-valid_loss=0.07.ckpt' as top 3


Epoch 1: 100%|██████████| 1719/1719 [00:06<00:00, 284.23it/s, v_num=0]

Epoch 1, global step 3438: 'valid_loss' reached 0.04522 (best 0.04522), saving model to './runs/weights/epoch=1-valid_loss=0.05.ckpt' as top 3


Epoch 2: 100%|██████████| 1719/1719 [00:06<00:00, 278.30it/s, v_num=0]

Epoch 2, global step 5157: 'valid_loss' reached 0.03241 (best 0.03241), saving model to './runs/weights/epoch=2-valid_loss=0.03.ckpt' as top 3


Epoch 3: 100%|██████████| 1719/1719 [00:05<00:00, 288.40it/s, v_num=0]

Epoch 3, global step 6876: 'valid_loss' reached 0.04063 (best 0.03241), saving model to './runs/weights/epoch=3-valid_loss=0.04.ckpt' as top 3


Epoch 4: 100%|██████████| 1719/1719 [00:06<00:00, 277.53it/s, v_num=0]

Epoch 4, global step 8595: 'valid_loss' reached 0.03073 (best 0.03073), saving model to './runs/weights/epoch=4-valid_loss=0.03.ckpt' as top 3


Epoch 5: 100%|██████████| 1719/1719 [00:06<00:00, 277.73it/s, v_num=0]

Epoch 5, global step 10314: 'valid_loss' reached 0.03573 (best 0.03073), saving model to './runs/weights/epoch=5-valid_loss=0.04.ckpt' as top 3


Epoch 6: 100%|██████████| 1719/1719 [00:06<00:00, 285.62it/s, v_num=0]

Epoch 6, global step 12033: 'valid_loss' reached 0.03432 (best 0.03073), saving model to './runs/weights/epoch=6-valid_loss=0.03.ckpt' as top 3


Epoch 7: 100%|██████████| 1719/1719 [00:06<00:00, 279.07it/s, v_num=0]

Epoch 7, global step 13752: 'valid_loss' reached 0.03140 (best 0.03073), saving model to './runs/weights/epoch=7-valid_loss=0.03.ckpt' as top 3


Epoch 7: 100%|██████████| 1719/1719 [00:06<00:00, 278.85it/s, v_num=0]


In [10]:
trainer.test(
    model,
    test_dataloader,
    ckpt_path="last.ckpt",
    verbose=True
)

Restoring states from the checkpoint path at last.ckpt


FileNotFoundError: Checkpoint at last.ckpt not found. Aborting training.