In [1]:
%load_ext tensorboard

import shutil


import torch
import torchvision
import torch.nn.functional as F
import torchvision.transforms as transforms

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.metrics import functional as FM

Microsoft Visual C++ Redistributable is not installed, this may lead to the DLL load failure.
                 It can be downloaded at https://aka.ms/vs/16/release/vc_redist.x64.exe




# LightningDataModule

In [2]:
class CIFAR10Dataset(pl.LightningDataModule):
    """CIFAR-10 data module: train/validation split plus a test loader.

    Args:
        batch_size: number of samples per batch for every loader.
        data_dir: directory the dataset is downloaded to and read from.
        val_fraction: share of the official training set held out for
            validation.
    """

    def __init__(self, batch_size=16, data_dir='data/', val_fraction=0.2):
        super().__init__()
        self.batch_size_ = batch_size
        self.data_dir_ = data_dir
        self.val_fraction_ = val_fraction
        # Only tensor conversion is applied. If ImageNet normalisation is
        # enabled later it has to come AFTER ToTensor, because Normalize
        # operates on tensors:
        #   transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        self.transforms = transforms.Compose([
            transforms.ToTensor()
        ])
        # Populated by setup(); None means "not split yet".
        self.train_ds_ = None
        self.val_ds_ = None

    def prepare_data(self):
        """Download CIFAR-10 to ``data_dir``; deliberately assigns no state."""
        torchvision.datasets.CIFAR10(self.data_dir_, train=True, download=True)

    def setup(self, stage=None):
        """Create the train/validation subsets once.

        The split is always taken from the full training set, and repeated
        calls are no-ops. Previously every call re-split the already reduced
        training subset, shrinking it further and reshuffling which samples
        count as validation data.
        """
        if self.train_ds_ is not None and self.val_ds_ is not None:
            return

        full_train_ds = torchvision.datasets.CIFAR10(
            self.data_dir_, train=True, transform=self.transforms, download=False
        )
        val_length = int(self.val_fraction_ * len(full_train_ds))
        train_length = len(full_train_ds) - val_length
        self.train_ds_, self.val_ds_ = torch.utils.data.random_split(
            full_train_ds, [train_length, val_length]
        )

    def train_dataloader(self):
        """Return the shuffled loader over the training subset."""
        return torch.utils.data.DataLoader(self.train_ds_, batch_size=self.batch_size_, shuffle=True)

    def val_dataloader(self):
        """Return the unshuffled loader over the validation subset."""
        return torch.utils.data.DataLoader(self.val_ds_, batch_size=self.batch_size_, shuffle=False)

    def test_dataloader(self):
        """Return the unshuffled loader over the official CIFAR-10 test set."""
        # download=True is kept so this loader also works when called on its own.
        dataset = torchvision.datasets.CIFAR10(
            self.data_dir_, train=False, transform=self.transforms, download=True
        )
        loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=self.batch_size_, shuffle=False)
        return loader

# LightningModule

In [3]:
class ClassificationTask(pl.LightningModule):
    """Image classifier that fine-tunes a pretrained VGG-11.

    Args:
        num_classes: number of output classes of the replaced final layer.
        lr: learning rate handed to Adam. It is stored as ``self.lr`` so it
            can be inspected or overwritten before training.
    """

    def __init__(self, num_classes=10, lr=0.01):
        super().__init__()
        self.lr = lr
        self.model = torchvision.models.vgg11(pretrained=True, progress=False)
        # Swap the last classifier layer for one with `num_classes` outputs,
        # reusing the input width of the layer being replaced.
        old_head = self.model.classifier[-1]
        self.model.classifier[-1] = torch.nn.Linear(old_head.in_features, num_classes)

    def forward(self, x):
        """Return the class logits for a batch of images."""
        return self.model(x)

    def training_step(self, batch, batch_idx):
        """Compute the cross-entropy loss for one batch and log it as 'train_loss'."""
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        result = pl.TrainResult(loss)
        result.log('train_loss', loss)
        return result

    def validation_step(self, batch, batch_idx):
        """Compute loss and accuracy for one batch; logged as 'val_loss' / 'val_acc'."""
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y)
        acc = FM.accuracy(y_hat, y)

        # The loss tensor is passed as `checkpoint_on`, the value the
        # checkpoint callback monitors.
        result = pl.EvalResult(checkpoint_on=loss)
        result.log_dict({'val_acc': acc, 'val_loss': loss})
        return result

    def test_step(self, batch, batch_idx):
        """Reuse the validation logic, reporting under 'test_acc' / 'test_loss'."""
        result = self.validation_step(batch, batch_idx)
        result.rename_keys({'val_acc': 'test_acc', 'val_loss': 'test_loss'})
        return result

    def configure_optimizers(self):
        """Return an Adam optimizer over the model parameters using ``self.lr``."""
        return torch.optim.Adam(self.model.parameters(), lr=self.lr)

In [4]:
# Data module first, then the task that will be trained on it.
dataset = CIFAR10Dataset()
model = ClassificationTask()

# TensorBoard

In [5]:
# Remove the 'lightning_logs' directory left over from earlier runs before
# TensorBoard is pointed at it below.
try:
    shutil.rmtree('lightning_logs')
except FileNotFoundError:
    # Nothing to clean up when the directory does not exist yet.
    pass

%tensorboard --logdir lightning_logs

Reusing TensorBoard on port 6006 (pid 29776), started 1:38:25 ago. (Use '!kill 29776' to kill it.)

# Find LR

In [6]:
# Logger and trainer shared by the LR search, training and test cells.
logger = TensorBoardLogger(
    save_dir='lightning_logs',
    name='lightning_logs',
    version=1,
)
trainer = pl.Trainer(
    logger=logger,
    gpus=1,
    max_epochs=20,
)

# Run the learning-rate finder on the model with the CIFAR-10 data module.
lr_finder = trainer.lr_find(model, dataset)

# Plot the finder's results with the suggested learning rate marked.
fig = lr_finder.plot(suggest=True)
fig.show()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]


Files already downloaded and verified



  | Name  | Type | Params
-------------------------------
0 | model | VGG  | 128 M 


HBox(children=(FloatProgress(value=0.0, description='Finding best initial lr', style=ProgressStyle(description…

RuntimeError: CUDA out of memory. Tried to allocate 392.00 MiB (GPU 0; 4.00 GiB total capacity; 2.18 GiB already allocated; 339.62 MiB free; 2.40 GiB reserved in total by PyTorch)

In [None]:
# trainer

# > <pytorch_lightning.trainer.trainer.Trainer at 0x1b886c29d90>

In [None]:
# lr_finder = trainer.tuner.lr_find(model, dataset)

# > AttributeError: 'Trainer' object has no attribute 'tuner'

# Training

In [None]:
# Left disabled: would overwrite the model's learning rate with the LR
# finder's suggestion; it needs the LR finder cell to have completed.
# model.lr = lr_finder.suggestion()

# Display the learning rate the model currently holds.
model.lr

In [None]:
# Train the model using the loaders provided by the data module.
trainer.fit(model, dataset)

In [None]:
# Evaluate on the data module's test loader.
# NOTE(review): ckpt_path='best' presumably loads the best checkpoint saved
# during fit() — confirm against the installed Lightning version.
trainer.test(model, ckpt_path='best', verbose=True, datamodule=dataset)