In [1]:
from torch.nn import CrossEntropyLoss
from torch.autograd import set_detect_anomaly
from torch.cuda.amp import autocast, GradScaler
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR
from torch import device, save, load, no_grad, float
from torch.cuda import is_available
from numpy import NINF
from pandas import DataFrame
import time
import os
from IPython.display import clear_output

from models import MultiPerpectiveMixer
from configs import get_MultiPerceptiveMixer_config_224, get_imagenet_config
from input_piplines import get_imagenet_loader

# --- Run configuration ---------------------------------------------------
TOTAL_EPOCHS = 200
# NOTE: despite the "_DIR" suffix this is the path of the per-epoch CSV log file.
CSV_DIR = './training_record_224.csv'
CHECKPOINT_DIR = './experiment_checkpoint'
CHECKPOINT_FILE = CHECKPOINT_DIR + '/highest_accuracy_ckpt'

# Select CUDA when available, otherwise fall back to CPU.
# This rebinds the imported `device` constructor to the chosen torch.device instance.
device = device("cuda" if is_available() else "cpu")
print("Using {} device".format(device))

train_dataloader, test_dataloader = get_imagenet_loader(get_imagenet_config())

# Place the model on the selected device; an unconditional .cuda() would
# defeat the CPU fallback chosen above.
model = MultiPerpectiveMixer(get_MultiPerceptiveMixer_config_224()).to(device)

# Debugging aid: makes autograd raise as soon as a backward function yields NaN.
# NOTE(review): anomaly detection slows training and, combined with GradScaler
# (which is designed to tolerate and skip inf/NaN gradient steps), may abort runs
# that would otherwise recover - consider disabling once the NaN source is found.
set_detect_anomaly(True)

loss_fn = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-9)
# LambdaLR multiplies the base LR (1e-9) by 1.03**k, where k is the number of
# scheduler.step() calls made so far.
# NOTE(review): train() calls scheduler.step() once per batch, so `epoch` here
# effectively counts batches - confirm this exponential ramp is intended.
lmbda = lambda epoch: 1.03 ** epoch
scheduler = LambdaLR(optimizer, lr_lambda=lmbda)
scaler = GradScaler()

# Create the validation log with its header row on first use only, so that
# later runs keep appending to the same file.
if not os.path.isfile(CSV_DIR):
    data = DataFrame(columns=['Epoch', 'Validation Loss', 'Validation Accuracy'])
    data.to_csv(CSV_DIR, index=False)
    
def train(dataloader, model, loss_fn, optimizer, scheduler, scaler):
    """Run one mixed-precision training pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; must expose ``.dataset``
            so the total sample count can be reported.
        model: Module to optimise; it is switched to training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer stepped (through ``scaler``) once per batch.
        scheduler: LR scheduler; stepped once per *batch*, not once per pass.
        scaler: ``GradScaler`` used to scale the loss and step the optimizer.

    Returns:
        None. Progress is printed every 10 batches.
    """
    size = len(dataloader.dataset)
    # Follow the model's own device instead of hard-coding CUDA, so the loop
    # also works when the model lives on the CPU.
    target_device = next(model.parameters()).device
    model.train()

    seen = 0  # samples consumed before the current batch
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(target_device), y.to(target_device)

        optimizer.zero_grad()

        # Forward pass and loss under autocast (mixed precision on CUDA).
        with autocast():
            pred = model(X)
            loss = loss_fn(pred, y)

        # Backpropagate the scaled loss; scaler.step() unscales the gradients
        # and skips the optimizer step if they contain inf/NaN.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scheduler.step()
        scaler.update()

        if batch % 10 == 0:
            # Use the running count rather than batch * len(X), which is wrong
            # when the final batch is shorter than the others.
            print(f"Loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")
        seen += len(X)

def test(dataloader, model, loss_fn, current_epoch, writing_dir=None, sleep_and_clear=True):
    
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    
    with no_grad():
        for X, y in dataloader:
            X, y = X.cuda(), y.cuda()
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(float).sum().item()
            
    test_loss /= num_batches
    correct /= size
    correct = correct*100
    print(f"Test Error: \n Pixel-wise accuracy: {correct}%, Average loss: {test_loss} \n")
    
    if writing_dir is not None:    
        current_data = DataFrame([[current_epoch, correct, test_loss]])
        current_data.to_csv(writing_dir, index=False, mode='a', header=False)
    
    if sleep_and_clear:   
        time.sleep(5)
        clear_output()
    
    return correct
            


# Training driver: resume from the best-accuracy checkpoint when one exists,
# otherwise start from scratch, then train up to TOTAL_EPOCHS.
FULL_MODEL_DIR = './full_model'

# Make sure both output folders exist before anything tries to read or write them.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(FULL_MODEL_DIR, exist_ok=True)

if os.path.isfile(CHECKPOINT_FILE):
    # Resume from the checkpoint.
    checkpoint = load(CHECKPOINT_FILE, map_location=device)
    print(f"Checkpoint loaded, last epoch is: {checkpoint['epoch']}, testing accuracy: {checkpoint['testing_accuracy']}%")
    model.load_state_dict(checkpoint['model_state_dict'])

    # Checkpoints written by earlier versions of this cell only hold the model
    # weights, so the remaining training state is restored only when present.
    if 'optimizer_state_dict' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    if 'scaler_state_dict' in checkpoint:
        scaler.load_state_dict(checkpoint['scaler_state_dict'])

    start_epoch = checkpoint['epoch']
    # Keep the stored best accuracy as the bar to beat; resetting it to -inf
    # would let the first resumed epoch overwrite a better checkpoint.
    init_acc = checkpoint['testing_accuracy']
else:
    # Train a new model.
    print("No checkpoint is detected in experiment_checkpoint folder, training from begining")
    start_epoch = 0
    init_acc = NINF

for t in range(start_epoch, TOTAL_EPOCHS):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer, scheduler, scaler)
    current_acc = test(test_dataloader, model, loss_fn, t+1, writing_dir=CSV_DIR)
    if current_acc > init_acc:
        # New best validation accuracy: overwrite the single best-so-far checkpoint.
        init_acc = current_acc
        save({
            'epoch': t+1,
            'model_state_dict': model.state_dict(),
            'testing_accuracy': current_acc,
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
        }, CHECKPOINT_FILE)

# Persist the final model object; the original referenced an undefined
# `total_epochs` here, which raised NameError after the last epoch.
save(model, f"{FULL_MODEL_DIR}/epoch_{TOTAL_EPOCHS}")

Using cuda device
No checkpoint is detected in experiment_checkpoint folder, training from begining
Epoch 1
-------------------------------
Loss: 6.913926  [    0/1281167]
Loss: 6.982344  [ 2000/1281167]
Loss: 6.950527  [ 4000/1281167]
Loss: 6.943144  [ 6000/1281167]
Loss: 6.927519  [ 8000/1281167]
Loss: 6.978594  [10000/1281167]
Loss: 6.925078  [12000/1281167]
Loss: 6.986582  [14000/1281167]
Loss: 6.964375  [16000/1281167]
Loss: 6.936543  [18000/1281167]
Loss: 6.933496  [20000/1281167]
Loss: 6.968066  [22000/1281167]
Loss: 6.946777  [24000/1281167]
Loss: 6.950899  [26000/1281167]
Loss: 6.957930  [28000/1281167]
Loss: 6.970723  [30000/1281167]
Loss: 6.974141  [32000/1281167]
Loss: 6.940137  [34000/1281167]
Loss: 6.910781  [36000/1281167]
Loss: 6.933066  [38000/1281167]
Loss: 6.945762  [40000/1281167]
Loss: 6.937207  [42000/1281167]
Loss: 6.916739  [44000/1281167]
Loss: 6.948301  [46000/1281167]
Loss: 6.957402  [48000/1281167]
Loss: 6.950137  [50000/1281167]
Loss: 6.944473  [52000/12811

  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/ccl/testing/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ccl/testing/lib/python3.8/site-packages/traitlets/config/application.py", line 846, in launch_instance
    app.start()
  File "/home/ccl/testing/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 677, in start
    self.io_loop.start()
  File "/home/ccl/testing/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
    self._run_once()
  File "/usr/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
    handle._run()
  File "/usr/lib/python3.8/asyncio/events.py", line 81, in _run
    self._cont

RuntimeError: Function 'LogSoftmaxBackward' returned nan values in its 0th output.

In [9]:
import torch

# Sanity probe: push one random 1x3x224x224 input through the model and print
# the distinct output values (e.g. to spot NaN/inf or a collapsed output).
# The input follows the model's own device instead of assuming CUDA.
# NOTE(review): the model is left in whatever train/eval mode it was last put
# in - call model.eval() first if inference-mode behaviour is what you want.
probe_device = next(model.parameters()).device
with torch.no_grad():  # no autograd graph is needed for a read-only probe
    x = torch.rand(1, 3, 224, 224).to(probe_device)
    print(torch.unique(model(x)))

tensor([-1.6141e+00, -1.4625e+00, -1.4219e+00, -1.3742e+00, -1.3620e+00,
        -1.3421e+00, -1.2446e+00, -1.2345e+00, -1.2104e+00, -1.1962e+00,
        -1.1849e+00, -1.1770e+00, -1.1744e+00, -1.1742e+00, -1.1655e+00,
        -1.1442e+00, -1.1314e+00, -1.1074e+00, -1.1010e+00, -1.0823e+00,
        -1.0718e+00, -1.0702e+00, -1.0687e+00, -1.0638e+00, -1.0608e+00,
        -1.0314e+00, -1.0301e+00, -1.0032e+00, -9.9681e-01, -9.9649e-01,
        -9.9272e-01, -9.8591e-01, -9.8571e-01, -9.7906e-01, -9.6867e-01,
        -9.5742e-01, -9.5215e-01, -9.4935e-01, -9.4401e-01, -9.4349e-01,
        -9.3328e-01, -9.2968e-01, -9.2898e-01, -9.2796e-01, -9.2351e-01,
        -9.2131e-01, -9.2039e-01, -9.1767e-01, -9.1317e-01, -9.0841e-01,
        -9.0334e-01, -8.9906e-01, -8.9843e-01, -8.8710e-01, -8.7757e-01,
        -8.7524e-01, -8.7282e-01, -8.7235e-01, -8.7090e-01, -8.6513e-01,
        -8.6499e-01, -8.5844e-01, -8.5405e-01, -8.4897e-01, -8.4709e-01,
        -8.4155e-01, -8.3604e-01, -8.2664e-01, -8.2