In [6]:
import sys
import os
import os.path as osp
# Put the parent directory on the module search path at index 1 (just after
# the first entry). Presumably this is what lets the local `torch_trainer`
# package imported further down resolve -- confirm against the repo layout.
sys.path.insert(1, os.path.abspath('../'))

In [2]:
from torchvision import datasets
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

# Per-image preprocessing: convert to a tensor, then normalise with a fixed
# single-channel mean/std (presumably the MNIST training-set statistics --
# TODO confirm the origin of these two constants).
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

# Training split, stored under the 'mnist' directory; download is requested.
dataset1 = datasets.MNIST(root='mnist', train=True, transform=transform, download=True)

# Non-training split, read from the same root (no download requested here).
dataset2 = datasets.MNIST(root='mnist', train=False, transform=transform)



In [3]:
class Net(nn.Module):
    """Small convolutional classifier that outputs log-probabilities for 10 classes.

    Layout: two 3x3 convolutions (each followed by ReLU), one 2x2 max-pool,
    dropout, a 9216 -> 128 fully connected layer with ReLU, dropout, and a
    final 128 -> 10 fully connected layer followed by ``log_softmax``.

    ``fc1`` takes 9216 flattened features, which matches single-channel
    28x28 inputs (64 channels * 12 * 12 after the two convolutions and the pool).
    """

    def __init__(self):
        """Create the layers; registration order is conv1, conv2, dropout1, dropout2, fc1, fc2."""
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1)
        self.dropout1 = nn.Dropout(p=0.25)
        self.dropout2 = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(in_features=9216, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Run the network on a batch and return per-class log-probabilities.

        Args:
            x: input batch; must flatten to 9216 features per sample after the
                convolution/pool stage (e.g. shape ``(N, 1, 28, 28)``).

        Returns:
            Tensor of shape ``(N, 10)`` holding ``log_softmax`` over dim 1.
        """
        # Convolutional stage.
        feature_maps = F.relu(self.conv1(x))
        feature_maps = F.relu(self.conv2(feature_maps))
        pooled = self.dropout1(F.max_pool2d(feature_maps, 2))

        # Classifier head; every dimension except the batch one is flattened.
        hidden = F.relu(self.fc1(torch.flatten(pooled, 1)))
        logits = self.fc2(self.dropout2(hidden))
        return F.log_softmax(logits, dim=1)

In [4]:
from torch.nn import NLLLoss

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation, dropout masks and DataLoader shuffling start from the same
# state on every "Restart Kernel -> Run All" (same machine / library versions).
SEED = 0
torch.manual_seed(SEED)

model = Net()
optimizer = optim.Adadelta(model.parameters(), lr=0.1)
# Negative log-likelihood loss; Net.forward already returns log_softmax output.
criterion = NLLLoss()
# NOTE(review): this scheduler is created but is not handed to the training
# engine in the visible cells (the setup_lr_scheduler_with_warmup call further
# down is commented out), so the learning rate is not decayed as written.
scheduler = StepLR(optimizer, step_size=1, gamma=0.995)
# Device string used for the batches and passed to the TrainingEngine constructor.
device = 'cpu'


In [7]:
from torch_trainer.trainer_abstract import TrainingEngineAbstract
class TrainingEngine(TrainingEngineAbstract):
    """Concrete engine that supplies the per-batch training and validation callbacks.

    The callbacks read ``self.gnn_model``; the training callback also uses
    ``self.optimizer`` and ``self.loss_fn``. None of these attributes is set in
    this class (presumably ``TrainingEngineAbstract`` provides them -- TODO
    confirm). Batches are moved to the notebook-level ``device`` variable.
    """

    def setup_train_step(self):
        """Return the ``(engine, batch)`` callback that performs one optimisation step."""

        def train_step(engine, batch):
            # Switch to training mode and clear gradients left from the last step.
            self.gnn_model.train()
            self.optimizer.zero_grad()

            # batch[0] holds the inputs and batch[1] the targets.
            inputs = batch[0].to(device)
            targets = batch[1].to(device)

            predictions = self.gnn_model(inputs)
            batch_loss = self.loss_fn(predictions, targets)
            batch_loss.backward()
            self.optimizer.step()

            # Report the loss as a plain Python number.
            return batch_loss.item()

        return train_step

    def setup_validation_step(self):
        """Return the ``(engine, batch)`` callback that yields ``(y_pred, y)`` for a batch."""

        def validation_step(engine, batch):
            self.gnn_model.eval()
            # No gradients are tracked while evaluating.
            with torch.no_grad():
                inputs = batch[0].to(device)
                targets = batch[1].to(device)
                predictions = self.gnn_model(inputs)
            return predictions, targets

        return validation_step

In [8]:
from torch.utils.data import DataLoader

# Mini-batch iterators over the two MNIST splits; both use batches of 64 and
# both are shuffled.
train_dataloader = DataLoader(dataset=dataset1, batch_size=64, shuffle=True)
test_dataloader = DataLoader(dataset=dataset2, batch_size=64, shuffle=True)

In [9]:
def checkpoint_score(metrics):
    """Score function handed to ``setup_checkpointer``: F1 of the mean precision and recall.

    Args:
        metrics: mapping with ``'Precision'`` and ``'Recall'`` entries, each a
            tensor (per-class values in the run logged below).

    Returns:
        float: ``2 * P * R / (P + R)``, where ``P`` and ``R`` are the means of
        the two tensors. Returns ``0.0`` when ``P + R`` is zero, because the
        unguarded division would produce NaN there.
    """
    precision = metrics['Precision'].mean()
    recall = metrics['Recall'].mean()
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return (2 * precision * recall / denominator).item()


# Build the training engine from the custom TrainingEngine class.
training_engine = TrainingEngine(train_dataloader, test_dataloader, model, device)
print("8. TrainingEngine instantiated.")

training_engine.setup_trainer(optimizer=optimizer,
                              criterion=criterion,
                              progress_bar=False)
print("9. Added training function to TrainingEngine.")

training_engine.setup_validation(metrics=["Accuracy", "Precision", "Recall"])
print("10. Added metrics to TrainingEngine.")

training_engine.setup_loggers(log_training=True, log_interval=100)
print("11. Added Loggers to TrainingEngine.")

training_engine.setup_checkpointer(checkpoint_score)
print("12. Added Checkpointer to TrainingEngine.")

# NOTE(review): step 13 (learning-rate scheduler with warmup) is disabled. The
# names it uses (lr_scheduler, optimizer_config, lr_scheduler_config) are not
# defined in the visible cells, so it cannot simply be uncommented.
#training_engine.setup_lr_scheduler_with_warmup(lr_scheduler, warmup_start_value=optimizer_config['lr']/10, warmup_end_value=optimizer_config['lr'], warmup_duration=lr_scheduler_config['warmup_duration'])
#print(f"13. Added learning rate scheduler to TrainingEngine with warmup {'enabled' if lr_scheduler_config['warmup_duration'] else 'disabled'}.")

training_engine.setup_tensorboard()
print("14. Tensorboard enabled.")

training_engine.run_training(max_epochs=10)

8. TrainingEngine instantiated.
9. Added training function to TrainingEngine.
10. Added metrics to TrainingEngine.
11. Added Loggers to TrainingEngine.
12. Added Checkpointer to TrainingEngine.
14. Tensorboard enabled.
Epoch[1], Iter[100] Loss: 0.52
Epoch[1], Iter[200] Loss: 0.33
Epoch[1], Iter[300] Loss: 0.30
Epoch[1], Iter[400] Loss: 0.28
Epoch[1], Iter[500] Loss: 0.30
Epoch[1], Iter[600] Loss: 0.19
Epoch[1], Iter[700] Loss: 0.21
Epoch[1], Iter[800] Loss: 0.07
Epoch[1], Iter[900] Loss: 0.19
Train Results - Epoch[1] - Accuracy: 0.97545 - Precision: tensor([0.9915, 0.9821, 0.9773, 0.9733, 0.9789, 0.9680, 0.9767, 0.9729, 0.9673,
        0.9653], dtype=torch.float64) - Recall: tensor([0.9853, 0.9852, 0.9671, 0.9760, 0.9789, 0.9806, 0.9895, 0.9738, 0.9547,
        0.9623], dtype=torch.float64) - loss: 0.08428369140625 - 
Validation Results - Epoch[1] - Accuracy: 0.977 - Precision: tensor([0.9827, 0.9877, 0.9785, 0.9697, 0.9827, 0.9691, 0.9792, 0.9716, 0.9729,
        0.9739], dtype=torch.

In [10]:
# Load the TensorBoard notebook extension; the next cell uses its %tensorboard magic.
%load_ext tensorboard

In [11]:
# Launch TensorBoard inline, pointing it at the `tb-logger` log directory.
# NOTE(review): the recorded output below shows this launch failing because
# port 6006 was already in use; per that message, stop the other process or
# start the server on a different port (TensorBoard's --port option).
%tensorboard --logdir tb-logger

ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Address already in use
Port 6006 is in use by another program. Either identify and stop that program, or start the server with a different port.
Contents of stdout:

In [12]:
# Display the engine object held by the TrainingEngine wrapper
# (an ignite.engine.engine.Engine in the recorded output).
training_engine.get_training_engine()

<ignite.engine.engine.Engine at 0x7f9be14a0490>