In [1]:
from cnns.simple_cnn.cnn import CNN
from notebooks.mnist import get_data

In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime

device = torch.device('cuda:0')

In [3]:
# hyper params
channels = [128, 256]
epochs = 5
lr = 0.01


In [4]:
train_dataset, eval_dataset = get_data()

In [5]:
train_dl = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_dl = DataLoader(eval_dataset, batch_size=8, shuffle=True)

In [6]:
model = CNN(channels, 1)
model.to(device)

CNN(
  (layers): Sequential(
    (0): ConvBlock(
      (conv): Conv2d(1, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (3): ConvBlock(
      (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (bn): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (fc): Linear(in_features=4096, out_features=10, bias=True)
)

In [7]:
loss_func = nn.CrossEntropyLoss()
loss_func.to(device)

CrossEntropyLoss()

In [8]:
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

In [9]:
def train_one_epoch(num_epoch, tb_writer):
    model.train()
    running_loss = 0
    last_loss = 0
    
    for i, data in enumerate(train_dl):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        outputs = model(inputs)
        
        loss = loss_func(outputs, labels)
        
        loss.backward()
        
        optimizer.step()
        
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000 # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = num_epoch * len(train_dl) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
    return last_loss

In [10]:
def eval_one_epoch(num_epoch, tb_writer):
    model.eval()
    running_eval_loss = 0.0
    
    for i, eval_data in enumerate(eval_dl):
        eval_inputs, eval_labels = eval_data
        eval_inputs, eval_labels = eval_inputs.to(device), eval_labels.to(device)
        
        eval_outputs = model(eval_inputs)
        
        eval_loss = loss_func(eval_outputs, eval_labels)
        
        running_eval_loss += eval_loss
    avg_loss = running_eval_loss / (i + 1)
    return avg_loss

In [11]:
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))

In [12]:
num_epoch = 0

In [13]:
for epoch in range(epochs):
    train_loss = train_one_epoch(num_epoch, writer)
    eval_loss = eval_one_epoch(num_epoch, writer)
    print('LOSS train {} valid {}'.format(train_loss, eval_loss))

    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : train_loss, 'Validation' : eval_loss },
                    num_epoch + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if train_loss < eval_loss:
        best_vloss = train_loss
        model_path = 'model_{}_{}'.format(timestamp, num_epoch)
        torch.save(model.state_dict(), "./model.pth.tar")

    num_epoch += 1

  batch 1000 loss: 0.9148854283601977
  batch 2000 loss: 0.09839749882125762
  batch 3000 loss: 0.07891434651057352
LOSS train 0.07891434651057352 valid 0.05420494079589844
  batch 1000 loss: 0.05026152493093832
  batch 2000 loss: 0.04862496037019446
  batch 3000 loss: 0.04859779565071949


RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 23.70 GiB total capacity; 20.20 GiB already allocated; 3.06 MiB free; 20.35 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF