# Training Playground

In [1]:
import torch
import torch.nn as nn
import utils

from constants import *
from os.path import join
from torch.utils.data import DataLoader
from torchmetrics import Accuracy
from messenger_dataset import ClassificationDataset, PredictionDataset
from models import BaseNet

Hyperparameters

In [2]:
EPOCHS = 5
BATCH_SIZE = 4096
LEARNING_RATE = 1e-3
NUM_WORKERS = 32  # 4 per GPU seems to be a rule of thumb

WINDOW_SIZE = 10

Setup device

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
parallelized = device != "cpu" and torch.cuda.device_count() > 1
print(f"Device: {device}", f"#GPUs: {torch.cuda.device_count()}", sep="\n")

Device: cuda
#GPUs: 8


Ensure reproducability

In [4]:
utils.apply_global_seed(42)

Load training data

In [5]:
ds = ClassificationDataset(join(MERGED_DIR, TRAIN_FILE), window_size=WINDOW_SIZE, partial=True)
train_ds, test_ds = ds.split(0.2)

In [6]:
print(len(train_ds))
print(len(test_ds))

173824
825663


In [7]:
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, pin_memory=True)

Prepare model

In [8]:
model = BaseNet(num_bands=2)
if parallelized:
    model = nn.DataParallel(model)
model.to(device)
print(model)

DataParallel(
  (module): BaseNet(
    (flatten): Flatten(start_dim=-2, end_dim=-1)
    (linear1): Linear(in_features=6, out_features=16, bias=True)
    (relu): ReLU()
    (linear2): Linear(in_features=16, out_features=5, bias=True)
  )
)


Count model parameters

In [9]:
utils.count_trainable_parameters(model)

197

Verify the model resides on GPU

In [10]:
utils.is_model_on_gpu(model)

True

View a sample

In [11]:
sample, label = ds[16604]

print(sample)
print(label)

tensor([[[-0.8939,  0.5042,  0.2548],
         [ 0.0791,  0.7735,  0.9118]],

        [[-0.9656,  0.3747,  0.0987],
         [ 0.0790,  0.7734,  0.9120]],

        [[-0.8503,  0.2446,  0.1417],
         [ 0.0789,  0.7733,  0.9122]],

        [[-0.8005,  0.1914,  0.2459],
         [ 0.0788,  0.7733,  0.9123]],

        [[-0.8505,  0.1135,  0.3782],
         [ 0.0788,  0.7732,  0.9125]],

        [[-0.7557,  0.1629,  0.5743],
         [ 0.0787,  0.7731,  0.9127]],

        [[-0.7590,  0.4532,  0.5694],
         [ 0.0786,  0.7730,  0.9128]],

        [[-0.8666,  0.5920,  0.3623],
         [ 0.0785,  0.7730,  0.9130]],

        [[-0.9471,  0.5594,  0.2053],
         [ 0.0784,  0.7729,  0.9132]],

        [[-0.9124,  0.4999,  0.2535],
         [ 0.0784,  0.7728,  0.9133]]])
tensor([0, 0, 0, 1, 1, 1, 1, 1, 1, 1])


In [12]:
sample = sample.to(device) 
with torch.no_grad():
    out = model(torch.stack([sample]))
print(out)
print(out.shape)

tensor([[[-0.0026,  0.0324,  0.0293,  0.0196,  0.0189, -0.0191, -0.0354,
          -0.0241,  0.0010,  0.0006],
         [ 0.2105,  0.2236,  0.2518,  0.2659,  0.2673,  0.2684,  0.2389,
           0.2026,  0.2001,  0.2092],
         [ 0.1028,  0.1042,  0.1030,  0.1067,  0.1336,  0.1417,  0.1097,
           0.1011,  0.1006,  0.1045],
         [ 0.0757,  0.0679,  0.0654,  0.0688,  0.0673,  0.0743,  0.0897,
           0.0832,  0.0770,  0.0754],
         [ 0.1413,  0.1204,  0.1199,  0.1302,  0.1507,  0.1756,  0.1694,
           0.1552,  0.1371,  0.1417]]], device='cuda:0')
torch.Size([1, 5, 10])


Define optimization evironment

In [13]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
accuracy = Accuracy(num_classes=5).to(device)

Training loop

In [14]:
model.train()

for epoch in range(EPOCHS):
    print(f"-----\nEpoch {epoch + 1}/{EPOCHS}\n-----")
    size = len(train_dl)

    for batch, (X, y) in enumerate(train_dl):
        X = X.to(device)
        y = y.to(device)
  
        pred = model(X)
        loss = criterion(pred, y)
        accuracy(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 10 == 0:
            loss = loss.item()
            acc = accuracy.compute()
            print(f"loss: {loss}, acc: {acc} [{batch}/{size - 1}]", end="\r")

    loss = loss.item()
    acc = accuracy.compute()
    print(f"loss: {loss}, acc: {acc} [{batch}/{size - 1}]", end="\n")
    
print("DONE.")

Epoch 1/5
-------------------------------
loss: 1.5755622386932373, acc: 0.05391085147857666 [43/43]]
Epoch 2/5
-------------------------------
loss: 1.4127806425094604, acc: 0.17089210450649261 [43/43]
Epoch 3/5
-------------------------------
loss: 1.2549480199813843, acc: 0.26946088671684265 [43/43]
Epoch 4/5
-------------------------------
loss: 1.095296025276184, acc: 0.36582016944885254 [43/43]]
Epoch 5/5
-------------------------------
loss: 0.9346544742584229, acc: 0.44220855832099915 [43/43]
DONE.


Save the model

In [15]:
container = model.module if parallelized else model
torch.save(container.state_dict(), "models/model.pth")

Evaluate the model

In [16]:
model.eval()
acc = Accuracy(num_classes=5).to(device)

with torch.no_grad():
    for X, y in test_dl:
        X = X.to(device)
        y = y.to(device)
        acc(model(X), y)

    print(acc.compute())

tensor(0.7765, device='cuda:0')
