# ConvNN Attention Test *** TABLE 2 IN LATEX PAPER
## I. 2D Training for testing with CIFAR100 Dataset

In [1]:
# Torch
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch import optim 


# Train + Data 
import sys 
sys.path.append('../Layers')
from Conv1d_NN_spatial import * 
from Conv2d_NN_spatial import * 

sys.path.append('../Data')
from CIFAR100 import * 


sys.path.append('../Models')
from CIFAR_experiment_models.Attention import Attention
from CIFAR_experiment_models.Branching import B_Conv2d_ConvNN_K_All, B_Conv2d_ConvNN_K_N, B_Conv2d_ConvNN_Spatial_K_N, B_Conv2d_ConvNN_Attn_K_N, B_Conv2d_ConvNN_Attn_Spatial_K_N, B_Attention_ConvNN_K_All, B_Attention_ConvNN_K_N, B_Attention_ConvNN_Spatial_K_N, B_Attention_ConvNN_Attn_K_N, B_Attention_ConvNN_Attn_Spatial_K_N, B_Attention_Conv2d

from CIFAR_experiment_models.ConvNN import ConvNN_K_All,ConvNN_K_N, ConvNN_Spatial_K_N, ConvNN_Attn_K_N, ConvNN_Attn_Spatial_K_N
from CIFAR_experiment_models.CNN_Control import CNN


sys.path.append('../Train')
from train2d import train_eval, evaluate_accuracy




In [2]:
cifar100 = CIFAR100()

Files already downloaded and verified
Files already downloaded and verified


In [3]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


### 2 Layer Models

In [4]:

# CNN
CNN_2 = CNN(num_layers=2, num_classes=100, device='cuda')

print("Model: " + CNN_2.name)
print("Num params: " + str(count_parameters(CNN_2)))
print()
# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(CNN_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(CNN_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(CNN_2, cifar100.test_loader, device='cuda')


Model: CNN
Num params: 1641268

Epoch 1, Time: 9.708568811416626, Loss: 3.6475916164915274
Epoch 1, Accuracy: 22.48%
Epoch 2, Time: 9.180623769760132, Loss: 3.0422360564741635
Epoch 2, Accuracy: 26.45%
Epoch 3, Time: 9.116371393203735, Loss: 2.744666891634617
Epoch 3, Accuracy: 28.44%
Epoch 4, Time: 9.090648651123047, Loss: 2.5295733100617936
Epoch 4, Accuracy: 29.86%
Epoch 5, Time: 9.073814392089844, Loss: 2.357496543003775
Epoch 5, Accuracy: 30.63%
Epoch 6, Time: 9.134612560272217, Loss: 2.209010276190765
Epoch 6, Accuracy: 30.82%
Epoch 7, Time: 9.083247423171997, Loss: 2.066005050373809
Epoch 7, Accuracy: 30.31%
Epoch 8, Time: 9.064879179000854, Loss: 1.9350346979277824
Epoch 8, Accuracy: 30.07%
Epoch 9, Time: 9.043972253799438, Loss: 1.8031474490604742
Epoch 9, Accuracy: 30.33%
Epoch 10, Time: 9.003053665161133, Loss: 1.6717659559701106
Epoch 10, Accuracy: 30.24%
Epoch 11, Time: 9.249274969100952, Loss: 1.5377069146127043
Epoch 11, Accuracy: 29.99%
Epoch 12, Time: 9.143856525421143

25.04

In [5]:
# Attention
Attention_2 = Attention(num_layers=2, num_classes=100, device='cuda')

print("Model: " + Attention_2.name)
print("Num params: " + str(count_parameters(Attention_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(Attention_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(Attention_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(Attention_2, cifar100.test_loader, device='cuda')


Model: Attention
Num params: 1677380

Epoch 1, Time: 13.754826068878174, Loss: 4.181052747894736
Epoch 1, Accuracy: 7.88%
Epoch 2, Time: 13.625694513320923, Loss: 3.953985197159945
Epoch 2, Accuracy: 10.84%
Epoch 3, Time: 13.96134877204895, Loss: 3.7899537729790143
Epoch 3, Accuracy: 12.05%
Epoch 4, Time: 13.749596118927002, Loss: 3.6892141027523735
Epoch 4, Accuracy: 13.62%
Epoch 5, Time: 14.14915657043457, Loss: 3.6222369759284017
Epoch 5, Accuracy: 14.82%
Epoch 6, Time: 13.597201108932495, Loss: 3.5646909115564487
Epoch 6, Accuracy: 15.79%
Epoch 7, Time: 13.833448886871338, Loss: 3.5005956662585365
Epoch 7, Accuracy: 17.34%
Epoch 8, Time: 14.09935474395752, Loss: 3.400501570738185
Epoch 8, Accuracy: 19.3%
Epoch 9, Time: 13.784789085388184, Loss: 3.243276505519057
Epoch 9, Accuracy: 20.79%
Epoch 10, Time: 13.325823545455933, Loss: 3.08009828478479
Epoch 10, Accuracy: 22.04%
Epoch 11, Time: 14.030665159225464, Loss: 2.9323913773612293
Epoch 11, Accuracy: 23.13%
Epoch 12, Time: 13.6407

19.9

In [6]:
# ConvNN All 
ConvNN_All_2 = ConvNN_K_All(num_layers=2, num_classes=100, device='cuda')

print("Model: " + ConvNN_All_2.name)
print("Num params: " + str(count_parameters(ConvNN_All_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_All_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(ConvNN_All_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_All_2, cifar100.test_loader, device='cuda')


Model: ConvNN_K_All
Num params: 1683012

Epoch 1, Time: 10.905929803848267, Loss: 3.628987261401418
Epoch 1, Accuracy: 22.43%
Epoch 2, Time: 10.793660640716553, Loss: 3.0402328693653313
Epoch 2, Accuracy: 26.2%
Epoch 3, Time: 10.796435356140137, Loss: 2.657320065266641
Epoch 3, Accuracy: 26.98%
Epoch 4, Time: 11.947954177856445, Loss: 2.34800364157123
Epoch 4, Accuracy: 28.09%
Epoch 5, Time: 12.028072118759155, Loss: 2.0692420277144294
Epoch 5, Accuracy: 28.06%
Epoch 6, Time: 12.071395874023438, Loss: 1.8061432368919978
Epoch 6, Accuracy: 27.33%
Epoch 7, Time: 10.790364742279053, Loss: 1.54864472387087
Epoch 7, Accuracy: 26.7%
Epoch 8, Time: 11.874085903167725, Loss: 1.2949508564246586
Epoch 8, Accuracy: 26.13%
Epoch 9, Time: 11.626371383666992, Loss: 1.0593298024228772
Epoch 9, Accuracy: 25.54%
Epoch 10, Time: 11.82249116897583, Loss: 0.841373010974406
Epoch 10, Accuracy: 24.8%
Epoch 11, Time: 12.455499410629272, Loss: 0.6483585925968102
Epoch 11, Accuracy: 24.37%
Epoch 12, Time: 11.8

23.15

In [7]:
# ConvNN N 
ConvNN_N_2 = ConvNN_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + ConvNN_N_2.name)
print("Num params: " + str(count_parameters(ConvNN_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(ConvNN_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_N_2, cifar100.test_loader, device='cuda')


Model: ConvNN_K_N
Num params: 1683012

Epoch 1, Time: 12.350993394851685, Loss: 3.7176890812261636
Epoch 1, Accuracy: 19.76%
Epoch 2, Time: 10.769548416137695, Loss: 3.269682553113269
Epoch 2, Accuracy: 22.71%
Epoch 3, Time: 10.816397905349731, Loss: 3.0442518715358453
Epoch 3, Accuracy: 24.11%
Epoch 4, Time: 12.074772357940674, Loss: 2.8693374313052047
Epoch 4, Accuracy: 25.48%
Epoch 5, Time: 11.137181997299194, Loss: 2.699995085863811
Epoch 5, Accuracy: 26.24%
Epoch 6, Time: 11.01253628730774, Loss: 2.5386212387353257
Epoch 6, Accuracy: 25.86%
Epoch 7, Time: 12.302055597305298, Loss: 2.38072312930051
Epoch 7, Accuracy: 26.27%
Epoch 8, Time: 10.94947075843811, Loss: 2.225473293562984
Epoch 8, Accuracy: 26.06%
Epoch 9, Time: 11.035426139831543, Loss: 2.072364946155597
Epoch 9, Accuracy: 25.43%
Epoch 10, Time: 11.48443603515625, Loss: 1.9135854571981503
Epoch 10, Accuracy: 25.83%
Epoch 11, Time: 11.57016921043396, Loss: 1.761876196050278
Epoch 11, Accuracy: 25.42%
Epoch 12, Time: 11.840

23.57

In [8]:
# ConvNN Spatial N
ConvNN_Spatial_N_2 = ConvNN_Spatial_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + ConvNN_Spatial_N_2.name)
print("Num params: " + str(count_parameters(ConvNN_Spatial_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_Spatial_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(ConvNN_Spatial_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_Spatial_N_2, cifar100.test_loader, device='cuda')


Model: ConvNN_Spatial_K_N
Num params: 1683012

Epoch 1, Time: 12.63396668434143, Loss: 3.7179259455112543
Epoch 1, Accuracy: 20.54%
Epoch 2, Time: 12.655739307403564, Loss: 3.2107434699602444
Epoch 2, Accuracy: 23.89%
Epoch 3, Time: 12.993800401687622, Loss: 2.919352888146325
Epoch 3, Accuracy: 25.45%
Epoch 4, Time: 13.33980417251587, Loss: 2.640486632466621
Epoch 4, Accuracy: 26.04%
Epoch 5, Time: 12.995806455612183, Loss: 2.364900326484914
Epoch 5, Accuracy: 26.13%
Epoch 6, Time: 13.021834373474121, Loss: 2.085971302236133
Epoch 6, Accuracy: 26.59%
Epoch 7, Time: 12.901726961135864, Loss: 1.8095704141785116
Epoch 7, Accuracy: 25.12%
Epoch 8, Time: 13.07844614982605, Loss: 1.5439072689589333
Epoch 8, Accuracy: 24.22%
Epoch 9, Time: 12.923777341842651, Loss: 1.291684947355324
Epoch 9, Accuracy: 23.94%
Epoch 10, Time: 12.861155986785889, Loss: 1.0556377798242642
Epoch 10, Accuracy: 23.44%
Epoch 11, Time: 12.727894067764282, Loss: 0.8466161314941123
Epoch 11, Accuracy: 23.16%
Epoch 12, T

21.82

In [9]:
# ConvNN Attention N
ConvNN_Attn_N_2 = ConvNN_Attn_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + ConvNN_Attn_N_2.name)
print("Num params: " + str(count_parameters(ConvNN_Attn_N_2)))
print()


# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_Attn_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(ConvNN_Attn_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_Attn_N_2, cifar100.test_loader, device='cuda')


Model: ConvNN_Attn_K_N
Num params: 2076228

Epoch 1, Time: 12.488734483718872, Loss: 4.096205664107867
Epoch 1, Accuracy: 16.49%
Epoch 2, Time: 12.286926031112671, Loss: 3.408846853639159
Epoch 2, Accuracy: 21.27%
Epoch 3, Time: 10.813698768615723, Loss: 3.144921601275959
Epoch 3, Accuracy: 24.38%
Epoch 4, Time: 10.856118440628052, Loss: 2.958545635118509
Epoch 4, Accuracy: 26.41%
Epoch 5, Time: 11.974373817443848, Loss: 2.790111037166527
Epoch 5, Accuracy: 28.23%
Epoch 6, Time: 11.565882205963135, Loss: 2.6343629497396366
Epoch 6, Accuracy: 29.43%
Epoch 7, Time: 12.772815465927124, Loss: 2.4837708686623734
Epoch 7, Accuracy: 30.7%
Epoch 8, Time: 12.233142852783203, Loss: 2.3287655796541276
Epoch 8, Accuracy: 31.75%
Epoch 9, Time: 11.548106670379639, Loss: 2.181342011522454
Epoch 9, Accuracy: 31.91%
Epoch 10, Time: 11.71213412284851, Loss: 2.033745192658261
Epoch 10, Accuracy: 32.43%
Epoch 11, Time: 12.321818113327026, Loss: 1.8835544432215678
Epoch 11, Accuracy: 31.71%
Epoch 12, Time:

29.36

In [10]:
# ConvNN Attention Spatial N
ConvNN_Attn_Spatial_N_2 = ConvNN_Attn_Spatial_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + ConvNN_Attn_Spatial_N_2.name)
print("Num params: " + str(count_parameters(ConvNN_Attn_Spatial_N_2)))
print()


# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_Attn_Spatial_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(ConvNN_Attn_Spatial_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_Attn_Spatial_N_2, cifar100.test_loader, device='cuda')


Model: ConvNN_Attn_Spatial_K_N
Num params: 1953348

Epoch 1, Time: 12.869489908218384, Loss: 3.959251918756139
Epoch 1, Accuracy: 16.36%
Epoch 2, Time: 12.113330125808716, Loss: 3.3882478234713034
Epoch 2, Accuracy: 21.26%
Epoch 3, Time: 11.674383878707886, Loss: 3.145302877096874
Epoch 3, Accuracy: 24.29%
Epoch 4, Time: 13.438963413238525, Loss: 2.9602818409805103
Epoch 4, Accuracy: 24.88%
Epoch 5, Time: 11.617034196853638, Loss: 2.799596360577342
Epoch 5, Accuracy: 27.4%
Epoch 6, Time: 11.75990080833435, Loss: 2.637347312077232
Epoch 6, Accuracy: 27.58%
Epoch 7, Time: 12.168471097946167, Loss: 2.464432661947997
Epoch 7, Accuracy: 28.84%
Epoch 8, Time: 13.860824823379517, Loss: 2.277130076799856
Epoch 8, Accuracy: 28.44%
Epoch 9, Time: 13.718857526779175, Loss: 2.0664628434669026
Epoch 9, Accuracy: 27.84%
Epoch 10, Time: 12.297941446304321, Loss: 1.8428821624697322
Epoch 10, Accuracy: 27.53%
Epoch 11, Time: 13.709681749343872, Loss: 1.5987165010798619
Epoch 11, Accuracy: 26.61%
Epoch 

23.62

#### ii. Branching

In [11]:

# Branching Conv2d + ConvNN All
B_Conv2d_ConvNN_All_2 = B_Conv2d_ConvNN_K_All(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_All_2.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_All_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_All_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_All_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_All_2, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_K_All
Num params: 1686836

Epoch 1, Time: 12.95442247390747, Loss: 3.7387442570513167
Epoch 1, Accuracy: 21.11%
Epoch 2, Time: 12.891446113586426, Loss: 3.2449614635818755
Epoch 2, Accuracy: 23.43%
Epoch 3, Time: 12.59515929222107, Loss: 3.0323611693004207
Epoch 3, Accuracy: 24.81%
Epoch 4, Time: 11.441560506820679, Loss: 2.850203772030218
Epoch 4, Accuracy: 26.51%
Epoch 5, Time: 12.301801919937134, Loss: 2.660934266531864
Epoch 5, Accuracy: 27.48%
Epoch 6, Time: 11.744993686676025, Loss: 2.4614247868737906
Epoch 6, Accuracy: 27.48%
Epoch 7, Time: 11.570137739181519, Loss: 2.2496705509512624
Epoch 7, Accuracy: 27.98%
Epoch 8, Time: 11.861950397491455, Loss: 2.02285436977206
Epoch 8, Accuracy: 27.92%
Epoch 9, Time: 12.918560981750488, Loss: 1.797090027490845
Epoch 9, Accuracy: 28.01%
Epoch 10, Time: 11.706399917602539, Loss: 1.5690027007361507
Epoch 10, Accuracy: 27.39%
Epoch 11, Time: 12.478705406188965, Loss: 1.3474759382512562
Epoch 11, Accuracy: 27.45%
Epoch 1

24.64

In [12]:

# Branching Conv2d + ConvNN N
B_Conv2d_ConvNN_N_2 = B_Conv2d_ConvNN_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_N_2.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_N_2, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_K_N
Num params: 1686836

Epoch 1, Time: 13.396028995513916, Loss: 3.812450338202669
Epoch 1, Accuracy: 18.68%
Epoch 2, Time: 13.971451044082642, Loss: 3.3181920643047906
Epoch 2, Accuracy: 22.26%
Epoch 3, Time: 13.374296426773071, Loss: 3.0909371138228785
Epoch 3, Accuracy: 24.4%
Epoch 4, Time: 14.053441524505615, Loss: 2.908864240207331
Epoch 4, Accuracy: 26.12%
Epoch 5, Time: 13.188318967819214, Loss: 2.7416755067722876
Epoch 5, Accuracy: 27.23%
Epoch 6, Time: 13.7916738986969, Loss: 2.58305152435132
Epoch 6, Accuracy: 27.81%
Epoch 7, Time: 13.265224695205688, Loss: 2.4240783745675443
Epoch 7, Accuracy: 28.49%
Epoch 8, Time: 13.524222373962402, Loss: 2.266210119895008
Epoch 8, Accuracy: 27.87%
Epoch 9, Time: 11.969646215438843, Loss: 2.107209461424357
Epoch 9, Accuracy: 28.43%
Epoch 10, Time: 12.317861557006836, Loss: 1.9457919597625732
Epoch 10, Accuracy: 28.23%
Epoch 11, Time: 13.029403924942017, Loss: 1.78430025778768
Epoch 11, Accuracy: 28.3%
Epoch 12, Time

25.0

In [13]:

# Branching Conv2d + ConvNN Spatial N
B_Conv2d_ConvNN_Spatial_N_2 = B_Conv2d_ConvNN_Spatial_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_Spatial_N_2.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_Spatial_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_Spatial_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_Spatial_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_Spatial_N_2, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_Spatial_K_N
Num params: 1686836

Epoch 1, Time: 13.938884973526001, Loss: 3.765363560003393
Epoch 1, Accuracy: 19.43%
Epoch 2, Time: 13.811457872390747, Loss: 3.2674742162684955
Epoch 2, Accuracy: 23.17%
Epoch 3, Time: 14.070659160614014, Loss: 3.0445996593026554
Epoch 3, Accuracy: 24.85%
Epoch 4, Time: 14.507011651992798, Loss: 2.8542358027699657
Epoch 4, Accuracy: 26.44%
Epoch 5, Time: 14.395732164382935, Loss: 2.6655878581659262
Epoch 5, Accuracy: 27.08%
Epoch 6, Time: 14.611167192459106, Loss: 2.4760966647006666
Epoch 6, Accuracy: 27.44%
Epoch 7, Time: 14.679739952087402, Loss: 2.280246428364073
Epoch 7, Accuracy: 28.16%
Epoch 8, Time: 15.771844387054443, Loss: 2.0747346751525275
Epoch 8, Accuracy: 28.17%
Epoch 9, Time: 14.083972215652466, Loss: 1.8644032934132744
Epoch 9, Accuracy: 27.51%
Epoch 10, Time: 15.38857126235962, Loss: 1.6462788510200617
Epoch 10, Accuracy: 27.02%
Epoch 11, Time: 14.688838720321655, Loss: 1.4247810569260737
Epoch 11, Accuracy: 26.5

24.94

In [14]:

# Branching Conv2d + ConvNN Attention N 
B_Conv2d_ConvNN_Attn_N_2 = B_Conv2d_ConvNN_Attn_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_Attn_N_2.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_Attn_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_Attn_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_Attn_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_Attn_N_2, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_Attn_K_N
Num params: 2080052

Epoch 1, Time: 11.992396593093872, Loss: 3.9267557792346497
Epoch 1, Accuracy: 17.76%
Epoch 2, Time: 12.03931212425232, Loss: 3.37162739297618
Epoch 2, Accuracy: 22.03%
Epoch 3, Time: 11.919811725616455, Loss: 3.1585483627246163
Epoch 3, Accuracy: 24.35%
Epoch 4, Time: 12.130002498626709, Loss: 3.0072380883614427
Epoch 4, Accuracy: 25.34%
Epoch 5, Time: 12.063891172409058, Loss: 2.873993030899321
Epoch 5, Accuracy: 26.93%
Epoch 6, Time: 12.220947504043579, Loss: 2.7442590071417183
Epoch 6, Accuracy: 28.25%
Epoch 7, Time: 12.049690246582031, Loss: 2.612420347188135
Epoch 7, Accuracy: 28.8%
Epoch 8, Time: 12.177177429199219, Loss: 2.480070977564663
Epoch 8, Accuracy: 29.05%
Epoch 9, Time: 12.079930782318115, Loss: 2.339730625567229
Epoch 9, Accuracy: 29.46%
Epoch 10, Time: 12.117878675460815, Loss: 2.1956874339476875
Epoch 10, Accuracy: 30.86%
Epoch 11, Time: 12.184325456619263, Loss: 2.0449753118597944
Epoch 11, Accuracy: 30.58%
Epoch

28.66

In [12]:

# Branching Conv2d + ConvNN Attention Spatial N 
B_Conv2d_ConvNN_Attn_Spatial_N_2 = B_Conv2d_ConvNN_Attn_Spatial_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_Attn_Spatial_N_2.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_Attn_Spatial_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_Attn_Spatial_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_Attn_Spatial_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_Attn_Spatial_N_2, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_Attn_Spatial_K_N
Num params: 1957172

Epoch 1, Time: 12.981253862380981, Loss: 3.907330235556873
Epoch 1, Accuracy: 17.91%
Epoch 2, Time: 12.692564725875854, Loss: 3.3613896217492534
Epoch 2, Accuracy: 21.74%
Epoch 3, Time: 12.723864078521729, Loss: 3.114246733048383
Epoch 3, Accuracy: 24.28%
Epoch 4, Time: 13.201558589935303, Loss: 2.9032464509120075
Epoch 4, Accuracy: 25.66%
Epoch 5, Time: 13.308832883834839, Loss: 2.691174561105421
Epoch 5, Accuracy: 26.92%
Epoch 6, Time: 12.648452758789062, Loss: 2.4615909934348768
Epoch 6, Accuracy: 27.14%
Epoch 7, Time: 12.350528955459595, Loss: 2.2031691499683252
Epoch 7, Accuracy: 26.33%
Epoch 8, Time: 13.319947481155396, Loss: 1.9163128334238095
Epoch 8, Accuracy: 26.09%
Epoch 9, Time: 12.760469198226929, Loss: 1.6030135712660183
Epoch 9, Accuracy: 24.63%
Epoch 10, Time: 13.19772219657898, Loss: 1.2901798136094038
Epoch 10, Accuracy: 24.48%
Epoch 11, Time: 13.020639657974243, Loss: 0.9967280340469097
Epoch 11, Accuracy: 

22.26

In [13]:

# Branching Attention + ConvNN All Samples
B_Attention_ConvNN_All_2 = B_Attention_ConvNN_K_All(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_All_2.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_All_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_All_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_All_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_All_2, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_K_All
Num params: 1722948

Epoch 1, Time: 16.77768898010254, Loss: 3.7750225838492897
Epoch 1, Accuracy: 20.25%
Epoch 2, Time: 16.3587965965271, Loss: 3.1921863430906137
Epoch 2, Accuracy: 24.52%
Epoch 3, Time: 16.932409286499023, Loss: 2.916668280311253
Epoch 3, Accuracy: 26.82%
Epoch 4, Time: 16.802940607070923, Loss: 2.6877626392542555
Epoch 4, Accuracy: 27.35%
Epoch 5, Time: 16.604969263076782, Loss: 2.473105866890734
Epoch 5, Accuracy: 27.73%
Epoch 6, Time: 15.988385200500488, Loss: 2.2623744837158477
Epoch 6, Accuracy: 27.52%
Epoch 7, Time: 16.209821939468384, Loss: 2.0457285227982895
Epoch 7, Accuracy: 26.81%
Epoch 8, Time: 17.00025486946106, Loss: 1.8227868199805775
Epoch 8, Accuracy: 26.0%
Epoch 9, Time: 16.61495065689087, Loss: 1.6009430396739783
Epoch 9, Accuracy: 25.33%
Epoch 10, Time: 16.169039011001587, Loss: 1.3747095916887073
Epoch 10, Accuracy: 24.52%
Epoch 11, Time: 16.115954160690308, Loss: 1.1561500581024249
Epoch 11, Accuracy: 24.79%
Epoch

22.07

In [14]:

# Branching Attention + ConvNN N Samples
B_Attention_ConvNN_N_2 = B_Attention_ConvNN_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_N_2.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_N_2, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_K_N
Num params: 1722948

Epoch 1, Time: 15.931336641311646, Loss: 3.8576344772982782
Epoch 1, Accuracy: 18.82%
Epoch 2, Time: 16.662362098693848, Loss: 3.2937769798366614
Epoch 2, Accuracy: 22.55%
Epoch 3, Time: 16.65518617630005, Loss: 3.051129199964616
Epoch 3, Accuracy: 24.86%
Epoch 4, Time: 16.6078679561615, Loss: 2.872461512570491
Epoch 4, Accuracy: 25.92%
Epoch 5, Time: 16.701481103897095, Loss: 2.7168007802475445
Epoch 5, Accuracy: 26.29%
Epoch 6, Time: 16.617517948150635, Loss: 2.5671544802158386
Epoch 6, Accuracy: 27.15%
Epoch 7, Time: 16.50741410255432, Loss: 2.4196398192659365
Epoch 7, Accuracy: 27.38%
Epoch 8, Time: 16.243282556533813, Loss: 2.2715988572296277
Epoch 8, Accuracy: 26.42%
Epoch 9, Time: 16.249037504196167, Loss: 2.1102368264551967
Epoch 9, Accuracy: 26.46%
Epoch 10, Time: 15.634890794754028, Loss: 1.9569367499607604
Epoch 10, Accuracy: 26.14%
Epoch 11, Time: 17.269031524658203, Loss: 1.7934805962740612
Epoch 11, Accuracy: 25.53%
Epoch

21.57

In [15]:

# Branching Attention + ConvNN Spatial Samples
B_Attention_ConvNN_Spatial_N_2 = B_Attention_ConvNN_Spatial_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_Spatial_N_2.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_Spatial_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_Spatial_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_Spatial_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_Spatial_N_2, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_Spatial_K_N
Num params: 1722948

Epoch 1, Time: 17.993830919265747, Loss: 3.829125412589754
Epoch 1, Accuracy: 19.36%
Epoch 2, Time: 18.216216325759888, Loss: 3.3227505866828784
Epoch 2, Accuracy: 22.52%
Epoch 3, Time: 18.013640642166138, Loss: 3.0885702197813925
Epoch 3, Accuracy: 24.41%
Epoch 4, Time: 18.44959282875061, Loss: 2.905865775044922
Epoch 4, Accuracy: 26.01%
Epoch 5, Time: 18.318161725997925, Loss: 2.742250960501259
Epoch 5, Accuracy: 25.65%
Epoch 6, Time: 18.20679807662964, Loss: 2.5795629559575444
Epoch 6, Accuracy: 26.99%
Epoch 7, Time: 18.105055332183838, Loss: 2.4168368600823387
Epoch 7, Accuracy: 26.81%
Epoch 8, Time: 18.137584686279297, Loss: 2.25031214571365
Epoch 8, Accuracy: 26.79%
Epoch 9, Time: 18.055541038513184, Loss: 2.0755251661286023
Epoch 9, Accuracy: 27.41%
Epoch 10, Time: 18.133962631225586, Loss: 1.8959600384278066
Epoch 10, Accuracy: 26.2%
Epoch 11, Time: 17.723148584365845, Loss: 1.7100718352190978
Epoch 11, Accuracy: 25.93%

21.79

In [19]:

# Branching Attention ConvNN Attn N Samples
B_Attention_ConvNN_Attn_N_2 = B_Attention_ConvNN_Attn_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_Attn_N_2.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_Attn_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_Attn_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_Attn_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_Attn_N_2, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_Attn_K_N
Num params: 2116164

Epoch 1, Time: 17.219822883605957, Loss: 4.21782623532483
Epoch 1, Accuracy: 9.53%
Epoch 2, Time: 17.397102117538452, Loss: 3.748590219356215
Epoch 2, Accuracy: 13.75%
Epoch 3, Time: 16.64273452758789, Loss: 3.548988599606487
Epoch 3, Accuracy: 15.87%
Epoch 4, Time: 17.41124391555786, Loss: 3.4439987285667675
Epoch 4, Accuracy: 17.48%
Epoch 5, Time: 16.58507490158081, Loss: 3.3661542780259075
Epoch 5, Accuracy: 18.41%
Epoch 6, Time: 16.39076519012451, Loss: 3.2972896693612608
Epoch 6, Accuracy: 19.44%
Epoch 7, Time: 17.286887884140015, Loss: 3.237639359501012
Epoch 7, Accuracy: 20.16%
Epoch 8, Time: 17.195059537887573, Loss: 3.1753318209172514
Epoch 8, Accuracy: 21.64%
Epoch 9, Time: 17.49712038040161, Loss: 3.1121402747185942
Epoch 9, Accuracy: 22.6%
Epoch 10, Time: 17.894779443740845, Loss: 3.060037062296172
Epoch 10, Accuracy: 23.76%
Epoch 11, Time: 17.452917337417603, Loss: 3.008955626536513
Epoch 11, Accuracy: 24.22%
Epoch 12

29.81

In [10]:

# Branching Attention ConvNN Attn N Samples
B_Attention_ConvNN_Attn_Spatial_N_2 = B_Attention_ConvNN_Attn_Spatial_K_N(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_Attn_Spatial_N_2.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_Attn_Spatial_N_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_Attn_Spatial_N_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_Attn_Spatial_N_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_Attn_Spatial_N_2, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_Attn_Spatial_K_N
Num params: 1993284

Epoch 1, Time: 16.76996898651123, Loss: 4.097000747385537
Epoch 1, Accuracy: 12.56%
Epoch 2, Time: 17.550565004348755, Loss: 3.5882638732490637
Epoch 2, Accuracy: 16.4%
Epoch 3, Time: 17.06922698020935, Loss: 3.3821926629146954
Epoch 3, Accuracy: 19.01%
Epoch 4, Time: 17.008726835250854, Loss: 3.22571722381865
Epoch 4, Accuracy: 21.42%
Epoch 5, Time: 17.300211191177368, Loss: 3.0905144912812412
Epoch 5, Accuracy: 23.35%
Epoch 6, Time: 17.022265672683716, Loss: 2.965782685657901
Epoch 6, Accuracy: 24.71%
Epoch 7, Time: 16.6721773147583, Loss: 2.8440568626994063
Epoch 7, Accuracy: 25.21%
Epoch 8, Time: 16.210525512695312, Loss: 2.721301057149687
Epoch 8, Accuracy: 25.86%
Epoch 9, Time: 17.764682054519653, Loss: 2.5938746229462
Epoch 9, Accuracy: 25.59%
Epoch 10, Time: 16.35917377471924, Loss: 2.4544026219021635
Epoch 10, Accuracy: 25.7%
Epoch 11, Time: 17.002118349075317, Loss: 2.3078092153724805
Epoch 11, Accuracy: 24.85%
E

22.43

In [11]:

# Branching Attention Conv2d
B_Attention_Conv2d_2 = B_Attention_Conv2d(num_layers=2, num_classes=100, device='cuda')

print("Model: " + B_Attention_Conv2d_2.name)
print("Num params: " + str(count_parameters(B_Attention_Conv2d_2)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_Conv2d_2.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_Conv2d_2, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_Conv2d_2, cifar100.test_loader, device='cuda')



Model: B_Attention_Conv2d
Num params: 1681204

Epoch 1, Time: 14.593532800674438, Loss: 3.9339403670157314
Epoch 1, Accuracy: 17.47%
Epoch 2, Time: 14.31766939163208, Loss: 3.3603088227684235
Epoch 2, Accuracy: 22.96%
Epoch 3, Time: 14.325068950653076, Loss: 3.108233864045204
Epoch 3, Accuracy: 25.3%
Epoch 4, Time: 14.816590785980225, Loss: 2.9314765850906177
Epoch 4, Accuracy: 26.61%
Epoch 5, Time: 13.974309921264648, Loss: 2.778320697262464
Epoch 5, Accuracy: 28.15%
Epoch 6, Time: 14.479260206222534, Loss: 2.6280054918030644
Epoch 6, Accuracy: 29.03%
Epoch 7, Time: 15.612007141113281, Loss: 2.47438798124528
Epoch 7, Accuracy: 29.52%
Epoch 8, Time: 14.327069520950317, Loss: 2.3193133552665905
Epoch 8, Accuracy: 30.33%
Epoch 9, Time: 15.047990083694458, Loss: 2.1704670029223117
Epoch 9, Accuracy: 30.66%
Epoch 10, Time: 14.85629916191101, Loss: 2.01676126468517
Epoch 10, Accuracy: 30.88%
Epoch 11, Time: 14.985280513763428, Loss: 1.86267696164758
Epoch 11, Accuracy: 30.59%
Epoch 12, Time

26.88

### 4 Layer Models

In [22]:

# CNN
CNN_4 = CNN(num_layers=4, num_classes=100, device='cuda')

print("Model: " + CNN_4.name)
print("Num params: " + str(count_parameters(CNN_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(CNN_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(CNN_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(CNN_4, cifar100.test_loader, device='cuda')


Model: CNN
Num params: 1645908

Epoch 1, Time: 9.913415908813477, Loss: 3.8128647264617177
Epoch 1, Accuracy: 19.21%
Epoch 2, Time: 10.443635702133179, Loss: 3.3230346971765505
Epoch 2, Accuracy: 22.69%
Epoch 3, Time: 9.98478364944458, Loss: 3.0559823031315716
Epoch 3, Accuracy: 25.95%
Epoch 4, Time: 10.788961172103882, Loss: 2.8489392739732553
Epoch 4, Accuracy: 27.91%
Epoch 5, Time: 10.883997917175293, Loss: 2.68642294223961
Epoch 5, Accuracy: 29.22%
Epoch 6, Time: 10.861596584320068, Loss: 2.535686633471028
Epoch 6, Accuracy: 29.76%
Epoch 7, Time: 9.90657114982605, Loss: 2.3950979779748356
Epoch 7, Accuracy: 30.13%
Epoch 8, Time: 10.749913930892944, Loss: 2.2648898796047394
Epoch 8, Accuracy: 30.7%
Epoch 9, Time: 10.683440208435059, Loss: 2.136365672816401
Epoch 9, Accuracy: 31.09%
Epoch 10, Time: 10.875150918960571, Loss: 2.001832045709995
Epoch 10, Accuracy: 30.39%
Epoch 11, Time: 10.860762119293213, Loss: 1.8718378243543912
Epoch 11, Accuracy: 30.34%
Epoch 12, Time: 10.1808555126

24.48

In [23]:

# Attention
Attention_4 = Attention(num_layers=4, num_classes=100, device='cuda')

print("Model: " + Attention_4.name)
print("Num params: " + str(count_parameters(Attention_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(Attention_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(Attention_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(Attention_4, cifar100.test_loader, device='cuda')


Model: Attention
Num params: 1719588

Epoch 1, Time: 19.112629890441895, Loss: 4.292583580821981
Epoch 1, Accuracy: 5.06%
Epoch 2, Time: 17.882892370224, Loss: 4.118559512335931
Epoch 2, Accuracy: 6.32%
Epoch 3, Time: 17.866214752197266, Loss: 4.04041749742025
Epoch 3, Accuracy: 7.57%
Epoch 4, Time: 18.198973417282104, Loss: 3.9449819524574767
Epoch 4, Accuracy: 9.02%
Epoch 5, Time: 19.660182237625122, Loss: 3.8496504043374222
Epoch 5, Accuracy: 10.36%
Epoch 6, Time: 19.655937433242798, Loss: 3.7930900291408722
Epoch 6, Accuracy: 10.67%
Epoch 7, Time: 19.090874433517456, Loss: 3.7549506166706914
Epoch 7, Accuracy: 11.26%
Epoch 8, Time: 18.70034122467041, Loss: 3.7259830038260926
Epoch 8, Accuracy: 11.49%
Epoch 9, Time: 18.383363008499146, Loss: 3.6996414502868262
Epoch 9, Accuracy: 11.95%
Epoch 10, Time: 18.65878391265869, Loss: 3.6745777651477045
Epoch 10, Accuracy: 12.33%
Epoch 11, Time: 18.989601135253906, Loss: 3.6478456700854287
Epoch 11, Accuracy: 12.35%
Epoch 12, Time: 18.496263

22.48

In [24]:
# ConvNN All 
ConvNN_All_4 = ConvNN_K_All(num_layers=4, num_classes=100, device='cuda')

print("Model: " + ConvNN_All_4.name)
print("Num params: " + str(count_parameters(ConvNN_All_4)))
print()


# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_All_4.parameters(), lr=0.0001)
num_epochs = 100  
train_eval(ConvNN_All_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_All_4, cifar100.test_loader, device='cuda')


Model: ConvNN_K_All
Num params: 1757476

Epoch 1, Time: 14.114874124526978, Loss: 3.7124628773735613
Epoch 1, Accuracy: 19.71%
Epoch 2, Time: 13.837431192398071, Loss: 3.239098759563378
Epoch 2, Accuracy: 22.4%
Epoch 3, Time: 13.93636965751648, Loss: 2.9691606784415674
Epoch 3, Accuracy: 25.31%
Epoch 4, Time: 14.074621677398682, Loss: 2.6637856030403197
Epoch 4, Accuracy: 25.72%
Epoch 5, Time: 15.305764198303223, Loss: 2.3237914212829316
Epoch 5, Accuracy: 26.19%
Epoch 6, Time: 14.979799747467041, Loss: 1.941120924699642
Epoch 6, Accuracy: 25.49%
Epoch 7, Time: 14.692272186279297, Loss: 1.550279601226987
Epoch 7, Accuracy: 24.79%
Epoch 8, Time: 14.501447677612305, Loss: 1.166943623510468
Epoch 8, Accuracy: 23.6%
Epoch 9, Time: 14.796847820281982, Loss: 0.819896582904679
Epoch 9, Accuracy: 22.08%
Epoch 10, Time: 14.33766770362854, Loss: 0.5459321364188743
Epoch 10, Accuracy: 21.76%
Epoch 11, Time: 15.016695737838745, Loss: 0.3565301183049026
Epoch 11, Accuracy: 22.11%
Epoch 12, Time: 14

20.52

In [25]:
# ConvNN N 
ConvNN_N_4 = ConvNN_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + ConvNN_N_4.name)
print("Num params: " + str(count_parameters(ConvNN_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(ConvNN_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_N_4, cifar100.test_loader, device='cuda')


Model: ConvNN_K_N
Num params: 1757476

Epoch 1, Time: 14.061266899108887, Loss: 3.809881323133893
Epoch 1, Accuracy: 17.94%
Epoch 2, Time: 15.026880741119385, Loss: 3.365958166854156
Epoch 2, Accuracy: 21.44%
Epoch 3, Time: 14.870079517364502, Loss: 3.138091565702882
Epoch 3, Accuracy: 23.31%
Epoch 4, Time: 13.355326414108276, Loss: 2.9523568702170917
Epoch 4, Accuracy: 24.4%
Epoch 5, Time: 13.406812906265259, Loss: 2.7762565868894766
Epoch 5, Accuracy: 24.44%
Epoch 6, Time: 13.744839906692505, Loss: 2.6047225007620614
Epoch 6, Accuracy: 25.28%
Epoch 7, Time: 15.087980270385742, Loss: 2.435201037875222
Epoch 7, Accuracy: 24.67%
Epoch 8, Time: 15.178875923156738, Loss: 2.256959176276956
Epoch 8, Accuracy: 24.62%
Epoch 9, Time: 14.547077894210815, Loss: 2.0818196376571265
Epoch 9, Accuracy: 24.65%
Epoch 10, Time: 13.105954647064209, Loss: 1.8925374503940573
Epoch 10, Accuracy: 23.88%
Epoch 11, Time: 14.009217977523804, Loss: 1.7024129351691517
Epoch 11, Accuracy: 23.45%
Epoch 12, Time: 1

21.09

In [26]:
# ConvNN Spatial N
ConvNN_Spatial_N_4 = ConvNN_Spatial_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + ConvNN_Spatial_N_4.name)
print("Num params: " + str(count_parameters(ConvNN_Spatial_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_Spatial_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(ConvNN_Spatial_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_Spatial_N_4, cifar100.test_loader, device='cuda')


Model: ConvNN_Spatial_K_N
Num params: 1757476

Epoch 1, Time: 17.69166135787964, Loss: 3.7896407976784667
Epoch 1, Accuracy: 18.53%
Epoch 2, Time: 17.442375659942627, Loss: 3.3386828737795504
Epoch 2, Accuracy: 21.5%
Epoch 3, Time: 17.503691911697388, Loss: 3.0797384815752658
Epoch 3, Accuracy: 23.53%
Epoch 4, Time: 17.962941646575928, Loss: 2.8180963645505783
Epoch 4, Accuracy: 24.41%
Epoch 5, Time: 17.357147455215454, Loss: 2.545520980949597
Epoch 5, Accuracy: 24.52%
Epoch 6, Time: 17.5232937335968, Loss: 2.24940131753302
Epoch 6, Accuracy: 24.15%
Epoch 7, Time: 18.01319980621338, Loss: 1.941524571317541
Epoch 7, Accuracy: 23.36%
Epoch 8, Time: 18.01060652732849, Loss: 1.6125551411867751
Epoch 8, Accuracy: 22.28%
Epoch 9, Time: 18.002829551696777, Loss: 1.282716183634975
Epoch 9, Accuracy: 21.59%
Epoch 10, Time: 17.687224626541138, Loss: 0.9775092622355732
Epoch 10, Accuracy: 20.52%
Epoch 11, Time: 18.03979468345642, Loss: 0.7179302450488595
Epoch 11, Accuracy: 19.31%
Epoch 12, Time:

18.46

In [27]:
# ConvNN Attention N
ConvNN_Attn_N_4 = ConvNN_Attn_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + ConvNN_Attn_N_4.name)
print("Num params: " + str(count_parameters(ConvNN_Attn_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_Attn_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(ConvNN_Attn_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_Attn_N_4, cifar100.test_loader, device='cuda')


Model: ConvNN_Attn_K_N
Num params: 2543908

Epoch 1, Time: 15.767952680587769, Loss: 4.484178268696036
Epoch 1, Accuracy: 3.71%
Epoch 2, Time: 14.168016910552979, Loss: 4.113567347721675
Epoch 2, Accuracy: 9.16%
Epoch 3, Time: 16.02595353126526, Loss: 3.8401703297939447
Epoch 3, Accuracy: 11.78%
Epoch 4, Time: 14.3158540725708, Loss: 3.6696767407609983
Epoch 4, Accuracy: 13.44%
Epoch 5, Time: 15.311805486679077, Loss: 3.5660830809332222
Epoch 5, Accuracy: 14.83%
Epoch 6, Time: 15.767626285552979, Loss: 3.4808244452147226
Epoch 6, Accuracy: 15.98%
Epoch 7, Time: 14.67524528503418, Loss: 3.4181718368969305
Epoch 7, Accuracy: 17.37%
Epoch 8, Time: 15.019150972366333, Loss: 3.356366251740614
Epoch 8, Accuracy: 18.33%
Epoch 9, Time: 15.017093896865845, Loss: 3.29973852146617
Epoch 9, Accuracy: 18.45%
Epoch 10, Time: 14.818502426147461, Loss: 3.2457958054359612
Epoch 10, Accuracy: 19.57%
Epoch 11, Time: 15.15554404258728, Loss: 3.1899594315482527
Epoch 11, Accuracy: 20.59%
Epoch 12, Time: 15

29.84

In [28]:
# ConvNN Attention Spatial N
ConvNN_Attn_Spatial_N_4 = ConvNN_Attn_Spatial_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + ConvNN_Attn_Spatial_N_4.name)
print("Num params: " + str(count_parameters(ConvNN_Attn_Spatial_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_Attn_Spatial_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(ConvNN_Attn_Spatial_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_Attn_Spatial_N_4, cifar100.test_loader, device='cuda')


Model: ConvNN_Attn_Spatial_K_N
Num params: 2298148

Epoch 1, Time: 14.9043288230896, Loss: 4.27508704192803
Epoch 1, Accuracy: 9.91%
Epoch 2, Time: 16.311745405197144, Loss: 3.6650761137228183
Epoch 2, Accuracy: 15.36%
Epoch 3, Time: 14.823638200759888, Loss: 3.444148190186152
Epoch 3, Accuracy: 17.76%
Epoch 4, Time: 15.612951517105103, Loss: 3.2984802546098715
Epoch 4, Accuracy: 19.28%
Epoch 5, Time: 15.313313245773315, Loss: 3.1941948321164415
Epoch 5, Accuracy: 21.52%
Epoch 6, Time: 14.69457745552063, Loss: 3.1017087783350052
Epoch 6, Accuracy: 22.02%
Epoch 7, Time: 15.61469030380249, Loss: 3.0175193563446667
Epoch 7, Accuracy: 22.92%
Epoch 8, Time: 15.853182315826416, Loss: 2.9355634158224704
Epoch 8, Accuracy: 23.78%
Epoch 9, Time: 16.40793013572693, Loss: 2.861023695572563
Epoch 9, Accuracy: 24.24%
Epoch 10, Time: 15.234721422195435, Loss: 2.7933759664940405
Epoch 10, Accuracy: 24.73%
Epoch 11, Time: 15.66287112236023, Loss: 2.724231529723653
Epoch 11, Accuracy: 25.5%
Epoch 12, T

24.31

#### ii. Branching

In [6]:
# Branching Conv2d + ConvNN All
B_Conv2d_ConvNN_All_4 = B_Conv2d_ConvNN_K_All(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_All_4.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_All_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_All_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_All_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_All_4, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_K_All
Num params: 1766996

Epoch 1, Time: 15.8745756149292, Loss: 4.104982127923795
Epoch 1, Accuracy: 16.54%
Epoch 2, Time: 15.551393985748291, Loss: 3.451966056738363
Epoch 2, Accuracy: 20.88%
Epoch 3, Time: 15.608194589614868, Loss: 3.1985494300837405
Epoch 3, Accuracy: 23.32%
Epoch 4, Time: 15.577384948730469, Loss: 2.986068714305263
Epoch 4, Accuracy: 24.69%
Epoch 5, Time: 15.603925704956055, Loss: 2.739744498296772
Epoch 5, Accuracy: 25.76%
Epoch 6, Time: 15.787651062011719, Loss: 2.4510766648880358
Epoch 6, Accuracy: 25.83%
Epoch 7, Time: 15.56513261795044, Loss: 2.142264283983909
Epoch 7, Accuracy: 26.47%
Epoch 8, Time: 16.33477473258972, Loss: 1.8250736153644065
Epoch 8, Accuracy: 25.99%
Epoch 9, Time: 17.485567808151245, Loss: 1.5125316053705142
Epoch 9, Accuracy: 25.36%
Epoch 10, Time: 16.426734924316406, Loss: 1.1944213247360171
Epoch 10, Accuracy: 24.59%
Epoch 11, Time: 15.56213903427124, Loss: 0.8953305058695776
Epoch 11, Accuracy: 23.73%
Epoch 12, 

22.49

In [7]:
# Branching Conv2d + ConvNN N
B_Conv2d_ConvNN_N_4 = B_Conv2d_ConvNN_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_N_4.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_N_4, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_K_N
Num params: 1766996

Epoch 1, Time: 15.275499820709229, Loss: 4.118708048635127
Epoch 1, Accuracy: 14.64%
Epoch 2, Time: 14.76788067817688, Loss: 3.4713258996339103
Epoch 2, Accuracy: 20.08%
Epoch 3, Time: 15.205894470214844, Loss: 3.138074562677642
Epoch 3, Accuracy: 22.12%
Epoch 4, Time: 15.498281717300415, Loss: 2.869585864074395
Epoch 4, Accuracy: 23.85%
Epoch 5, Time: 16.118177890777588, Loss: 2.629183508551029
Epoch 5, Accuracy: 23.82%
Epoch 6, Time: 15.688221454620361, Loss: 2.416560955669569
Epoch 6, Accuracy: 24.92%
Epoch 7, Time: 14.90187931060791, Loss: 2.2105080656078466
Epoch 7, Accuracy: 24.96%
Epoch 8, Time: 14.514098644256592, Loss: 2.0041227192829942
Epoch 8, Accuracy: 24.67%
Epoch 9, Time: 14.566611766815186, Loss: 1.798794556304317
Epoch 9, Accuracy: 25.25%
Epoch 10, Time: 14.595385551452637, Loss: 1.5861278747963479
Epoch 10, Accuracy: 24.17%
Epoch 11, Time: 14.665361881256104, Loss: 1.3761666461329936
Epoch 11, Accuracy: 24.53%
Epoch 12, 

22.47

In [8]:
# Branching Conv2d + ConvNN Spatial N
B_Conv2d_ConvNN_Spatial_N_4 = B_Conv2d_ConvNN_Spatial_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_Spatial_N_4.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_Spatial_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_Spatial_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_Spatial_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_Spatial_N_4, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_Spatial_K_N
Num params: 1766996

Epoch 1, Time: 19.289708614349365, Loss: 4.043546800113395
Epoch 1, Accuracy: 15.51%
Epoch 2, Time: 18.368623971939087, Loss: 3.501835293782032
Epoch 2, Accuracy: 19.24%
Epoch 3, Time: 18.568432331085205, Loss: 3.282213062276621
Epoch 3, Accuracy: 21.31%
Epoch 4, Time: 18.362096786499023, Loss: 3.081596874519992
Epoch 4, Accuracy: 22.9%
Epoch 5, Time: 18.267926692962646, Loss: 2.8819417868123947
Epoch 5, Accuracy: 24.22%
Epoch 6, Time: 18.577179670333862, Loss: 2.6746135733621506
Epoch 6, Accuracy: 25.79%
Epoch 7, Time: 18.199476957321167, Loss: 2.4631441522132405
Epoch 7, Accuracy: 25.48%
Epoch 8, Time: 18.605370044708252, Loss: 2.2365915488708965
Epoch 8, Accuracy: 25.72%
Epoch 9, Time: 18.721041202545166, Loss: 1.996449827233239
Epoch 9, Accuracy: 25.82%
Epoch 10, Time: 18.900925636291504, Loss: 1.744988139175698
Epoch 10, Accuracy: 25.78%
Epoch 11, Time: 18.648169994354248, Loss: 1.4863066838677887
Epoch 11, Accuracy: 25.15%
E

22.07

In [9]:
# Branching Conv2d + ConvNN Attn N 
B_Conv2d_ConvNN_Attn_N_4 = B_Conv2d_ConvNN_Attn_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_Attn_N_4.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_Attn_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_Attn_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_Attn_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_Attn_N_4, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_Attn_K_N
Num params: 2553428

Epoch 1, Time: 15.870471954345703, Loss: 4.520329210154541
Epoch 1, Accuracy: 3.47%
Epoch 2, Time: 15.949507713317871, Loss: 4.091013454415304
Epoch 2, Accuracy: 9.44%
Epoch 3, Time: 15.787334680557251, Loss: 3.694820599482797
Epoch 3, Accuracy: 16.85%
Epoch 4, Time: 16.363008499145508, Loss: 3.4149608093759287
Epoch 4, Accuracy: 19.42%
Epoch 5, Time: 16.84733247756958, Loss: 3.2253642316974336
Epoch 5, Accuracy: 21.22%
Epoch 6, Time: 15.821050643920898, Loss: 3.0868270150230974
Epoch 6, Accuracy: 23.25%
Epoch 7, Time: 16.066306591033936, Loss: 2.966749104697381
Epoch 7, Accuracy: 24.01%
Epoch 8, Time: 16.56924605369568, Loss: 2.861818212377446
Epoch 8, Accuracy: 25.0%
Epoch 9, Time: 15.497479438781738, Loss: 2.7549868721486357
Epoch 9, Accuracy: 25.61%
Epoch 10, Time: 16.791879892349243, Loss: 2.6531390235247208
Epoch 10, Accuracy: 26.62%
Epoch 11, Time: 15.93924617767334, Loss: 2.541119036467179
Epoch 11, Accuracy: 26.43%
Epoch 12,

24.81

In [10]:
# Branching Conv2d + ConvNN Attn Spatial N 
B_Conv2d_ConvNN_Attn_Spatial_N_4 = B_Conv2d_ConvNN_Attn_Spatial_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_Attn_Spatial_N_4.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_Attn_Spatial_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_Attn_Spatial_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_Attn_Spatial_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_Attn_Spatial_N_4, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_Attn_Spatial_K_N
Num params: 2307668

Epoch 1, Time: 15.931082487106323, Loss: 4.383434141078569
Epoch 1, Accuracy: 5.18%
Epoch 2, Time: 16.54080367088318, Loss: 4.028227330778566
Epoch 2, Accuracy: 6.98%
Epoch 3, Time: 16.44356346130371, Loss: 3.8787203484484
Epoch 3, Accuracy: 9.41%
Epoch 4, Time: 16.014354944229126, Loss: 3.7253057664007785
Epoch 4, Accuracy: 13.26%
Epoch 5, Time: 16.02636170387268, Loss: 3.5671458597988117
Epoch 5, Accuracy: 15.84%
Epoch 6, Time: 16.139357328414917, Loss: 3.4318580984154625
Epoch 6, Accuracy: 17.26%
Epoch 7, Time: 16.49445605278015, Loss: 3.32534868241576
Epoch 7, Accuracy: 19.31%
Epoch 8, Time: 19.73996090888977, Loss: 3.2335010736494723
Epoch 8, Accuracy: 20.29%
Epoch 9, Time: 17.277642965316772, Loss: 3.1585361070340245
Epoch 9, Accuracy: 21.12%
Epoch 10, Time: 16.251784563064575, Loss: 3.0921504329842375
Epoch 10, Accuracy: 21.85%
Epoch 11, Time: 16.88746213912964, Loss: 3.0337923298711362
Epoch 11, Accuracy: 22.71%
Epoch

26.34

In [11]:

# Branching Attention ConvNN All Samples
B_Attention_ConvNN_All_4 = B_Attention_ConvNN_K_All(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_All_4.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_All_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_All_4.parameters(), lr=0.0001)
num_epochs = 100
train_eval(B_Attention_ConvNN_All_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_All_4, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_K_All
Num params: 1840676

Epoch 1, Time: 25.0821795463562, Loss: 4.093193818236251
Epoch 1, Accuracy: 15.98%
Epoch 2, Time: 24.93820834159851, Loss: 3.463303217802511
Epoch 2, Accuracy: 19.2%
Epoch 3, Time: 24.837396144866943, Loss: 3.181046844443397
Epoch 3, Accuracy: 22.13%
Epoch 4, Time: 25.10608434677124, Loss: 2.9526538687288912
Epoch 4, Accuracy: 22.92%
Epoch 5, Time: 24.939591884613037, Loss: 2.744379958228382
Epoch 5, Accuracy: 23.62%
Epoch 6, Time: 26.025081634521484, Loss: 2.5340161692455907
Epoch 6, Accuracy: 23.82%
Epoch 7, Time: 24.660876512527466, Loss: 2.3104475530821955
Epoch 7, Accuracy: 23.71%
Epoch 8, Time: 24.440001249313354, Loss: 2.0839203030556974
Epoch 8, Accuracy: 23.33%
Epoch 9, Time: 24.74945330619812, Loss: 1.8425439937645212
Epoch 9, Accuracy: 22.79%
Epoch 10, Time: 26.222752809524536, Loss: 1.5987980295630062
Epoch 10, Accuracy: 21.91%
Epoch 11, Time: 25.425323009490967, Loss: 1.3578333537596876
Epoch 11, Accuracy: 20.91%
Epoch 1

18.66

In [12]:

# Branching Attention ConvNN N Samples
B_Attention_ConvNN_N_4 = B_Attention_ConvNN_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_N_4.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_N_4, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_K_N
Num params: 1840676

Epoch 1, Time: 24.217710971832275, Loss: 4.112449932891085
Epoch 1, Accuracy: 13.85%
Epoch 2, Time: 23.268267393112183, Loss: 3.628841153800945
Epoch 2, Accuracy: 16.54%
Epoch 3, Time: 23.24868369102478, Loss: 3.4198499253338865
Epoch 3, Accuracy: 19.26%
Epoch 4, Time: 23.47615098953247, Loss: 3.221982465985486
Epoch 4, Accuracy: 21.64%
Epoch 5, Time: 23.619384765625, Loss: 3.050976817260313
Epoch 5, Accuracy: 22.21%
Epoch 6, Time: 23.379719734191895, Loss: 2.9051934617864505
Epoch 6, Accuracy: 22.69%
Epoch 7, Time: 23.69638729095459, Loss: 2.7676810437761
Epoch 7, Accuracy: 22.98%
Epoch 8, Time: 23.96615505218506, Loss: 2.633448594366498
Epoch 8, Accuracy: 23.49%
Epoch 9, Time: 23.522014617919922, Loss: 2.5000234080092683
Epoch 9, Accuracy: 23.55%
Epoch 10, Time: 24.16821599006653, Loss: 2.365782712121754
Epoch 10, Accuracy: 23.27%
Epoch 11, Time: 23.617286920547485, Loss: 2.2223445651171456
Epoch 11, Accuracy: 23.67%
Epoch 12, Time: 

20.15

In [13]:

# Branching Attention + ConvNN Spatial Samples
B_Attention_ConvNN_Spatial_N_4 = B_Attention_ConvNN_Spatial_K_N(num_layers=2,  num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_Spatial_N_4.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_Spatial_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_Spatial_N_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_Spatial_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_Spatial_N_4, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_Spatial_K_N
Num params: 1722948

Epoch 1, Time: 17.71440601348877, Loss: 3.8297073405112148
Epoch 1, Accuracy: 18.81%
Epoch 2, Time: 17.323573112487793, Loss: 3.28518399862987
Epoch 2, Accuracy: 22.79%
Epoch 3, Time: 17.581775188446045, Loss: 3.0476389162985567
Epoch 3, Accuracy: 23.49%
Epoch 4, Time: 17.582605361938477, Loss: 2.865800703577983
Epoch 4, Accuracy: 26.18%
Epoch 5, Time: 17.568729162216187, Loss: 2.696068990565932
Epoch 5, Accuracy: 26.15%
Epoch 6, Time: 17.366405725479126, Loss: 2.533804130066386
Epoch 6, Accuracy: 27.16%
Epoch 7, Time: 17.27831244468689, Loss: 2.3673425903710563
Epoch 7, Accuracy: 27.5%
Epoch 8, Time: 17.295659065246582, Loss: 2.1947432911914326
Epoch 8, Accuracy: 26.65%
Epoch 9, Time: 17.513869047164917, Loss: 2.012582001631217
Epoch 9, Accuracy: 26.07%
Epoch 10, Time: 17.521575450897217, Loss: 1.821878317524405
Epoch 10, Accuracy: 26.03%
Epoch 11, Time: 17.790783405303955, Loss: 1.6262650037055735
Epoch 11, Accuracy: 25.05%
E

21.94

In [14]:

# Branching Attention ConvNN Attn N Samples
B_Attention_ConvNN_Attn_N_4 = B_Attention_ConvNN_Attn_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_Attn_N_4.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_Attn_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_Attn_N_4.parameters(), lr=0.0001)
num_epochs = 100
train_eval(B_Attention_ConvNN_Attn_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_Attn_N_4, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_Attn_K_N
Num params: 2627108

Epoch 1, Time: 24.765066623687744, Loss: 4.400743330531108
Epoch 1, Accuracy: 3.29%
Epoch 2, Time: 24.39499068260193, Loss: 4.1046304327752585
Epoch 2, Accuracy: 8.52%
Epoch 3, Time: 24.253648042678833, Loss: 3.863129519440634
Epoch 3, Accuracy: 10.13%
Epoch 4, Time: 24.15098524093628, Loss: 3.7295321451733483
Epoch 4, Accuracy: 12.55%
Epoch 5, Time: 24.29486584663391, Loss: 3.6319321659214965
Epoch 5, Accuracy: 13.61%
Epoch 6, Time: 26.107141256332397, Loss: 3.5584099472636153
Epoch 6, Accuracy: 14.69%
Epoch 7, Time: 25.717728853225708, Loss: 3.4955205740526205
Epoch 7, Accuracy: 14.99%
Epoch 8, Time: 24.389367818832397, Loss: 3.436358396964305
Epoch 8, Accuracy: 16.5%
Epoch 9, Time: 26.20621371269226, Loss: 3.3859017084321708
Epoch 9, Accuracy: 17.76%
Epoch 10, Time: 24.856075525283813, Loss: 3.3426245795491405
Epoch 10, Accuracy: 18.29%
Epoch 11, Time: 24.21170663833618, Loss: 3.311160497348327
Epoch 11, Accuracy: 18.0%
Epoch 1

23.77

In [4]:

# Branching Attention ConvNN Attn Spatial N Samples
B_Attention_ConvNN_Attn_Spatial_N_4 = B_Attention_ConvNN_Attn_Spatial_K_N(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_Attn_Spatial_N_4.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_Attn_Spatial_N_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_Attn_Spatial_N_4.parameters(), lr=0.0001)
num_epochs = 100
train_eval(B_Attention_ConvNN_Attn_Spatial_N_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_Attn_Spatial_N_4, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_Attn_Spatial_K_N
Num params: 2381348

Epoch 1, Time: 27.611769437789917, Loss: 4.309479793319312
Epoch 1, Accuracy: 6.11%
Epoch 2, Time: 25.461337566375732, Loss: 3.94419750777047
Epoch 2, Accuracy: 10.01%
Epoch 3, Time: 24.993144035339355, Loss: 3.7114667075369363
Epoch 3, Accuracy: 12.69%
Epoch 4, Time: 26.133280754089355, Loss: 3.586119617647527
Epoch 4, Accuracy: 14.73%
Epoch 5, Time: 27.175016164779663, Loss: 3.488939049908572
Epoch 5, Accuracy: 15.81%
Epoch 6, Time: 25.355969667434692, Loss: 3.413282773683748
Epoch 6, Accuracy: 16.36%
Epoch 7, Time: 26.62433886528015, Loss: 3.3492067180326224
Epoch 7, Accuracy: 17.37%
Epoch 8, Time: 24.982548236846924, Loss: 3.288070393340362
Epoch 8, Accuracy: 19.07%
Epoch 9, Time: 26.56762170791626, Loss: 3.2348152447844405
Epoch 9, Accuracy: 19.51%
Epoch 10, Time: 25.54464054107666, Loss: 3.1832090582689054
Epoch 10, Accuracy: 20.35%
Epoch 11, Time: 26.526950359344482, Loss: 3.1378785234583004
Epoch 11, Accuracy: 21.2

20.64

In [5]:

# Branching Attention Conv2d
B_Attention_Conv2d_4 = B_Attention_Conv2d(num_layers=4, num_classes=100, device='cuda')

print("Model: " + B_Attention_Conv2d_4.name)
print("Num params: " + str(count_parameters(B_Attention_Conv2d_4)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_Conv2d_4.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_Conv2d_4, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_Conv2d_4, cifar100.test_loader, device='cuda')



Model: B_Attention_Conv2d
Num params: 1729108

Epoch 1, Time: 20.697606563568115, Loss: 4.3977947918045555
Epoch 1, Accuracy: 7.79%
Epoch 2, Time: 20.789127588272095, Loss: 3.75514860012952
Epoch 2, Accuracy: 17.8%
Epoch 3, Time: 20.543806076049805, Loss: 3.372998555907813
Epoch 3, Accuracy: 21.21%
Epoch 4, Time: 20.280441761016846, Loss: 3.1756498557527353
Epoch 4, Accuracy: 23.97%
Epoch 5, Time: 20.860151529312134, Loss: 3.0309141935289974
Epoch 5, Accuracy: 24.56%
Epoch 6, Time: 20.28415060043335, Loss: 2.911746822964505
Epoch 6, Accuracy: 26.37%
Epoch 7, Time: 20.466606378555298, Loss: 2.8005983689252067
Epoch 7, Accuracy: 26.81%
Epoch 8, Time: 20.56850790977478, Loss: 2.6979963802315696
Epoch 8, Accuracy: 26.97%
Epoch 9, Time: 20.914498329162598, Loss: 2.5978133346113705
Epoch 9, Accuracy: 27.23%
Epoch 10, Time: 20.628949880599976, Loss: 2.493485684742403
Epoch 10, Accuracy: 28.02%
Epoch 11, Time: 19.17445206642151, Loss: 2.3867784140969786
Epoch 11, Accuracy: 27.69%
Epoch 12, Tim

25.09

### 8 Layer Models

In [29]:

# CNN
CNN_8 = CNN(num_layers=8, num_classes=100, device='cuda')

print("Model: " + CNN_8.name)
print("Num params: " + str(count_parameters(CNN_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(CNN_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(CNN_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(CNN_8, cifar100.test_loader, device='cuda')


Model: CNN
Num params: 1655188

Epoch 1, Time: 10.830232858657837, Loss: 4.081829472575956
Epoch 1, Accuracy: 13.98%
Epoch 2, Time: 10.992178678512573, Loss: 3.5709984025077137
Epoch 2, Accuracy: 18.86%
Epoch 3, Time: 10.518216848373413, Loss: 3.3097956125693555
Epoch 3, Accuracy: 21.95%
Epoch 4, Time: 10.192702054977417, Loss: 3.03988262546032
Epoch 4, Accuracy: 25.05%
Epoch 5, Time: 11.665210247039795, Loss: 2.7800308042170143
Epoch 5, Accuracy: 26.52%
Epoch 6, Time: 11.00795316696167, Loss: 2.5449850678139025
Epoch 6, Accuracy: 27.83%
Epoch 7, Time: 10.688273191452026, Loss: 2.323829791887337
Epoch 7, Accuracy: 28.1%
Epoch 8, Time: 9.955312252044678, Loss: 2.1044494181947635
Epoch 8, Accuracy: 27.83%
Epoch 9, Time: 10.49341630935669, Loss: 1.8737846698297564
Epoch 9, Accuracy: 27.84%
Epoch 10, Time: 10.493611812591553, Loss: 1.6314271253240689
Epoch 10, Accuracy: 27.38%
Epoch 11, Time: 11.686179161071777, Loss: 1.3770411388038675
Epoch 11, Accuracy: 26.56%
Epoch 12, Time: 11.6913390

21.5

In [30]:

# Attention
Attention_8 = Attention(num_layers=8, num_classes=100, device='cuda')

print("Model: " + Attention_8.name)
print("Num params: " + str(count_parameters(Attention_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(Attention_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(Attention_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(Attention_8, cifar100.test_loader, device='cuda')


Model: Attention
Num params: 1804004

Epoch 1, Time: 27.2875497341156, Loss: 4.605559505160202
Epoch 1, Accuracy: 1.0%
Epoch 2, Time: 27.596441745758057, Loss: 4.605240379757894
Epoch 2, Accuracy: 1.0%
Epoch 3, Time: 27.602852821350098, Loss: 4.605238963271041
Epoch 3, Accuracy: 1.0%
Epoch 4, Time: 27.856520414352417, Loss: 4.60523800227953
Epoch 4, Accuracy: 1.0%
Epoch 5, Time: 28.946688890457153, Loss: 4.605237316902336
Epoch 5, Accuracy: 1.0%
Epoch 6, Time: 28.762919425964355, Loss: 4.605235106499909
Epoch 6, Accuracy: 1.0%
Epoch 7, Time: 28.633395195007324, Loss: 4.605235870536941
Epoch 7, Accuracy: 1.0%
Epoch 8, Time: 27.9434711933136, Loss: 4.605234180874837
Epoch 8, Accuracy: 1.0%
Epoch 9, Time: 28.25147247314453, Loss: 4.605235114426869
Epoch 9, Accuracy: 1.0%
Epoch 10, Time: 27.653608083724976, Loss: 4.605234977229477
Epoch 10, Accuracy: 1.0%
Epoch 11, Time: 28.731383085250854, Loss: 4.605234816860969
Epoch 11, Accuracy: 1.0%
Epoch 12, Time: 28.353584051132202, Loss: 4.6052340

1.0

In [31]:
# ConvNN All 
ConvNN_All_8 = ConvNN_K_All(num_layers=8, num_classes=100, device='cuda')

print("Model: " + ConvNN_All_8.name)
print("Num params: " + str(count_parameters(ConvNN_All_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_All_8.parameters(), lr=0.0001)
num_epochs = 100
train_eval(ConvNN_All_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_All_8, cifar100.test_loader, device='cuda')


Model: ConvNN_K_All
Num params: 1906404

Epoch 1, Time: 21.20536971092224, Loss: 3.9921079776476107
Epoch 1, Accuracy: 16.07%
Epoch 2, Time: 20.811636209487915, Loss: 3.4673356607442014
Epoch 2, Accuracy: 19.71%
Epoch 3, Time: 21.124303579330444, Loss: 3.2555711302915804
Epoch 3, Accuracy: 21.08%
Epoch 4, Time: 20.52472710609436, Loss: 3.084301024141824
Epoch 4, Accuracy: 22.16%
Epoch 5, Time: 21.047359943389893, Loss: 2.893196702613245
Epoch 5, Accuracy: 22.26%
Epoch 6, Time: 21.03429889678955, Loss: 2.6880133054445468
Epoch 6, Accuracy: 22.05%
Epoch 7, Time: 21.657471895217896, Loss: 2.4557214513459167
Epoch 7, Accuracy: 21.42%
Epoch 8, Time: 21.1993191242218, Loss: 2.2036250944027813
Epoch 8, Accuracy: 21.15%
Epoch 9, Time: 20.33110475540161, Loss: 1.926341896776653
Epoch 9, Accuracy: 21.24%
Epoch 10, Time: 20.99798583984375, Loss: 1.632658453777318
Epoch 10, Accuracy: 20.02%
Epoch 11, Time: 21.375247716903687, Loss: 1.3556911418657474
Epoch 11, Accuracy: 19.49%
Epoch 12, Time: 20.3

17.61

In [32]:
# ConvNN N 
ConvNN_N_8 = ConvNN_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + ConvNN_N_8.name)
print("Num params: " + str(count_parameters(ConvNN_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_N_8.parameters(), lr=0.0001)
num_epochs = 100
train_eval(ConvNN_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_N_8, cifar100.test_loader, device='cuda')


Model: ConvNN_K_N
Num params: 1906404

Epoch 1, Time: 18.138893604278564, Loss: 4.040332667053203
Epoch 1, Accuracy: 13.98%
Epoch 2, Time: 18.330541849136353, Loss: 3.5934237866755336
Epoch 2, Accuracy: 17.73%
Epoch 3, Time: 20.33588218688965, Loss: 3.3856293496573366
Epoch 3, Accuracy: 20.4%
Epoch 4, Time: 20.323530912399292, Loss: 3.1985850224409567
Epoch 4, Accuracy: 21.94%
Epoch 5, Time: 18.971343994140625, Loss: 3.0384153124621456
Epoch 5, Accuracy: 22.36%
Epoch 6, Time: 19.430583715438843, Loss: 2.880942982785842
Epoch 6, Accuracy: 22.8%
Epoch 7, Time: 20.397706270217896, Loss: 2.738094828317842
Epoch 7, Accuracy: 24.27%
Epoch 8, Time: 19.11987018585205, Loss: 2.5970991676115927
Epoch 8, Accuracy: 22.88%
Epoch 9, Time: 18.239957332611084, Loss: 2.4563985738303047
Epoch 9, Accuracy: 23.63%
Epoch 10, Time: 19.139272212982178, Loss: 2.3279902918259507
Epoch 10, Accuracy: 23.55%
Epoch 11, Time: 18.62455439567566, Loss: 2.182561291453174
Epoch 11, Accuracy: 23.53%
Epoch 12, Time: 19.8

20.48

In [33]:
# ConvNN Spatial N
ConvNN_Spatial_N_8 = ConvNN_Spatial_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + ConvNN_Spatial_N_8.name)
print("Num params: " + str(count_parameters(ConvNN_Spatial_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_Spatial_N_8.parameters(), lr=0.0001)
num_epochs = 100
train_eval(ConvNN_Spatial_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_Spatial_N_8, cifar100.test_loader, device='cuda')


Model: ConvNN_Spatial_K_N
Num params: 1906404

Epoch 1, Time: 27.34671401977539, Loss: 3.9711546117387466
Epoch 1, Accuracy: 15.67%
Epoch 2, Time: 28.170766592025757, Loss: 3.518438233743848
Epoch 2, Accuracy: 18.5%
Epoch 3, Time: 28.3080735206604, Loss: 3.275639012036726
Epoch 3, Accuracy: 20.94%
Epoch 4, Time: 26.608276844024658, Loss: 3.0528606895900445
Epoch 4, Accuracy: 21.3%
Epoch 5, Time: 28.381343603134155, Loss: 2.8385723972564465
Epoch 5, Accuracy: 22.86%
Epoch 6, Time: 26.61758542060852, Loss: 2.59913465708418
Epoch 6, Accuracy: 22.87%
Epoch 7, Time: 26.998996019363403, Loss: 2.336770238931222
Epoch 7, Accuracy: 22.43%
Epoch 8, Time: 27.827314615249634, Loss: 2.0632880888021816
Epoch 8, Accuracy: 21.49%
Epoch 9, Time: 29.770118236541748, Loss: 1.7778121368659427
Epoch 9, Accuracy: 20.36%
Epoch 10, Time: 29.10455632209778, Loss: 1.4878774628309948
Epoch 10, Accuracy: 20.21%
Epoch 11, Time: 27.67052984237671, Loss: 1.2196038263990445
Epoch 11, Accuracy: 18.78%
Epoch 12, Time: 

17.27

In [34]:
# ConvNN Attention N
ConvNN_Attn_N_8 = ConvNN_Attn_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + ConvNN_Attn_N_8.name)
print("Num params: " + str(count_parameters(ConvNN_Attn_N_8)))  
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_Attn_N_8.parameters(), lr=0.0001)
num_epochs = 100
train_eval(ConvNN_Attn_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_Attn_N_8, cifar100.test_loader, device='cuda')


Model: ConvNN_Attn_K_N
Num params: 3479268

Epoch 1, Time: 21.316924810409546, Loss: 4.605918723908837
Epoch 1, Accuracy: 1.0%
Epoch 2, Time: 20.957191944122314, Loss: 4.605255970259762
Epoch 2, Accuracy: 0.99%
Epoch 3, Time: 22.12774658203125, Loss: 4.605285384770855
Epoch 3, Accuracy: 0.97%
Epoch 4, Time: 22.68594765663147, Loss: 4.605237061410304
Epoch 4, Accuracy: 1.03%
Epoch 5, Time: 21.122487545013428, Loss: 4.6052464329068314
Epoch 5, Accuracy: 1.01%
Epoch 6, Time: 20.373276472091675, Loss: 4.605242001126184
Epoch 6, Accuracy: 1.01%
Epoch 7, Time: 20.554171562194824, Loss: 4.462812988349544
Epoch 7, Accuracy: 2.79%
Epoch 8, Time: 20.792261600494385, Loss: 4.277144133892206
Epoch 8, Accuracy: 3.41%
Epoch 9, Time: 20.446327924728394, Loss: 4.224821716623233
Epoch 9, Accuracy: 3.92%
Epoch 10, Time: 21.084929943084717, Loss: 4.1912495626513
Epoch 10, Accuracy: 3.84%
Epoch 11, Time: 20.80361247062683, Loss: 4.159541689526394
Epoch 11, Accuracy: 4.95%
Epoch 12, Time: 21.04009079933166

26.15

In [35]:
# ConvNN Attention N
ConvNN_Attn_Spatial_N_8 = ConvNN_Attn_Spatial_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + ConvNN_Attn_Spatial_N_8.name)
print("Num params: " + str(count_parameters(ConvNN_Attn_Spatial_N_8)))  
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ConvNN_Attn_Spatial_N_8.parameters(), lr=0.0001)
num_epochs = 100
train_eval(ConvNN_Attn_Spatial_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(ConvNN_Attn_Spatial_N_8, cifar100.test_loader, device='cuda')


Model: ConvNN_Attn_Spatial_K_N
Num params: 2987748

Epoch 1, Time: 22.249722480773926, Loss: 4.605719430367355
Epoch 1, Accuracy: 0.9%
Epoch 2, Time: 20.234468460083008, Loss: 4.60524055780962
Epoch 2, Accuracy: 1.06%
Epoch 3, Time: 21.051600694656372, Loss: 4.605246282294583
Epoch 3, Accuracy: 0.95%
Epoch 4, Time: 21.99337100982666, Loss: 4.476197656768058
Epoch 4, Accuracy: 3.14%
Epoch 5, Time: 21.887527465820312, Loss: 4.221426249465065
Epoch 5, Accuracy: 3.67%
Epoch 6, Time: 21.576930284500122, Loss: 4.157305475086202
Epoch 6, Accuracy: 3.91%
Epoch 7, Time: 21.741594076156616, Loss: 4.1173345670675685
Epoch 7, Accuracy: 4.38%
Epoch 8, Time: 21.698410034179688, Loss: 4.08527955740614
Epoch 8, Accuracy: 4.41%
Epoch 9, Time: 21.225661754608154, Loss: 4.0573235792882
Epoch 9, Accuracy: 4.96%
Epoch 10, Time: 21.71231198310852, Loss: 4.035748643338528
Epoch 10, Accuracy: 5.1%
Epoch 11, Time: 21.83458113670349, Loss: 3.9824106772537426
Epoch 11, Accuracy: 6.06%
Epoch 12, Time: 21.50594162

19.11

#### ii. Branching

In [6]:

# Branching Conv2d + ConvNN All
B_Conv2d_ConvNN_All_8 = B_Conv2d_ConvNN_K_All(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_All_8.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_All_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_All_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_All_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_All_8, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_K_All
Num params: 1927316

Epoch 1, Time: 23.428720235824585, Loss: 4.281858599399362
Epoch 1, Accuracy: 12.31%
Epoch 2, Time: 23.1480929851532, Loss: 3.5958352320639375
Epoch 2, Accuracy: 15.65%
Epoch 3, Time: 23.340792179107666, Loss: 3.1453341340165
Epoch 3, Accuracy: 18.05%
Epoch 4, Time: 23.62478995323181, Loss: 2.742061623831844
Epoch 4, Accuracy: 18.72%
Epoch 5, Time: 23.2292537689209, Loss: 2.3212524612846277
Epoch 5, Accuracy: 18.6%
Epoch 6, Time: 22.953891277313232, Loss: 1.8881318398448816
Epoch 6, Accuracy: 18.13%
Epoch 7, Time: 23.676573276519775, Loss: 1.479681179270415
Epoch 7, Accuracy: 17.46%
Epoch 8, Time: 23.802406787872314, Loss: 1.1415501779607495
Epoch 8, Accuracy: 17.46%
Epoch 9, Time: 22.932091236114502, Loss: 0.8678486589199442
Epoch 9, Accuracy: 17.14%
Epoch 10, Time: 23.08558964729309, Loss: 0.6534644492599361
Epoch 10, Accuracy: 16.56%
Epoch 11, Time: 23.812196731567383, Loss: 0.4999702311956974
Epoch 11, Accuracy: 16.58%
Epoch 12, Tim

16.42

In [7]:

# Branching Conv2d + ConvNN N
B_Conv2d_ConvNN_N_8=B_Conv2d_ConvNN_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_N_8.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_N_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_N_8, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_K_N
Num params: 1927316

Epoch 1, Time: 22.123785495758057, Loss: 4.3074348198483365
Epoch 1, Accuracy: 11.13%
Epoch 2, Time: 23.37285351753235, Loss: 3.720180713002334
Epoch 2, Accuracy: 16.16%
Epoch 3, Time: 23.25489354133606, Loss: 3.374941667937257
Epoch 3, Accuracy: 18.68%
Epoch 4, Time: 21.666727542877197, Loss: 3.1053674949709413
Epoch 4, Accuracy: 18.72%
Epoch 5, Time: 22.278958082199097, Loss: 2.8745582289707934
Epoch 5, Accuracy: 20.84%
Epoch 6, Time: 22.830638885498047, Loss: 2.6394331048211783
Epoch 6, Accuracy: 21.17%
Epoch 7, Time: 21.77652931213379, Loss: 2.3962257443486576
Epoch 7, Accuracy: 21.83%
Epoch 8, Time: 23.15491795539856, Loss: 2.1290258141734717
Epoch 8, Accuracy: 21.85%
Epoch 9, Time: 23.27183771133423, Loss: 1.8635209708872353
Epoch 9, Accuracy: 22.33%
Epoch 10, Time: 23.131677627563477, Loss: 1.5931991753370867
Epoch 10, Accuracy: 22.21%
Epoch 11, Time: 21.986565351486206, Loss: 1.3394132796150948
Epoch 11, Accuracy: 21.49%
Epoch 12,

21.83

In [8]:

# Branching Conv2d + ConvNN Spatial N
B_Conv2d_ConvNN_Spatial_N_8 = B_Conv2d_ConvNN_Spatial_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_Spatial_N_8.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_Spatial_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_Spatial_N_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_Spatial_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_Spatial_N_8, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_Spatial_K_N
Num params: 1927316

Epoch 1, Time: 32.71655321121216, Loss: 4.2908499920764545
Epoch 1, Accuracy: 11.58%
Epoch 2, Time: 32.82429909706116, Loss: 3.729817672458756
Epoch 2, Accuracy: 15.94%
Epoch 3, Time: 32.390552282333374, Loss: 3.4240256752199527
Epoch 3, Accuracy: 17.53%
Epoch 4, Time: 32.72279238700867, Loss: 3.128382369685356
Epoch 4, Accuracy: 20.29%
Epoch 5, Time: 32.4604606628418, Loss: 2.82505750168315
Epoch 5, Accuracy: 21.32%
Epoch 6, Time: 32.57600259780884, Loss: 2.538837254962043
Epoch 6, Accuracy: 20.91%
Epoch 7, Time: 32.37504816055298, Loss: 2.2392712762898497
Epoch 7, Accuracy: 21.02%
Epoch 8, Time: 31.890569925308228, Loss: 1.9207137248400228
Epoch 8, Accuracy: 20.56%
Epoch 9, Time: 31.559861660003662, Loss: 1.5918329087517145
Epoch 9, Accuracy: 20.28%
Epoch 10, Time: 32.26540946960449, Loss: 1.2675989374632726
Epoch 10, Accuracy: 19.97%
Epoch 11, Time: 31.936299562454224, Loss: 0.9684044320107726
Epoch 11, Accuracy: 19.63%
Epoch 1

18.85

In [9]:

# Branching Conv2d + ConvNN Attention N 
B_Conv2d_ConvNN_Attn_N_8 = B_Conv2d_ConvNN_Attn_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_Attn_N_8.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_Attn_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_Attn_N_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_Attn_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_Attn_N_8, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_Attn_K_N
Num params: 3500180

Epoch 1, Time: 24.65899419784546, Loss: 4.607312045743703
Epoch 1, Accuracy: 1.02%
Epoch 2, Time: 25.43879532814026, Loss: 4.5575150528832165
Epoch 2, Accuracy: 2.72%
Epoch 3, Time: 23.735661506652832, Loss: 4.23736530374688
Epoch 3, Accuracy: 4.2%
Epoch 4, Time: 23.40263557434082, Loss: 4.167944112702099
Epoch 4, Accuracy: 4.33%
Epoch 5, Time: 23.96353530883789, Loss: 4.1137596100492555
Epoch 5, Accuracy: 5.34%
Epoch 6, Time: 23.38595461845398, Loss: 4.000041002202827
Epoch 6, Accuracy: 6.71%
Epoch 7, Time: 22.5738046169281, Loss: 3.919440186542013
Epoch 7, Accuracy: 7.32%
Epoch 8, Time: 24.205780029296875, Loss: 3.861964437967676
Epoch 8, Accuracy: 8.9%
Epoch 9, Time: 23.73667550086975, Loss: 3.8256180807757563
Epoch 9, Accuracy: 9.24%
Epoch 10, Time: 24.697384357452393, Loss: 3.8024627897135743
Epoch 10, Accuracy: 9.25%
Epoch 11, Time: 23.511805295944214, Loss: 3.778026778069908
Epoch 11, Accuracy: 10.35%
Epoch 12, Time: 25.220663

25.99

In [10]:

# Branching Conv2d + ConvNN Attention Spatial N 
B_Conv2d_ConvNN_Attn_Spatial_N_8 = B_Conv2d_ConvNN_Attn_Spatial_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Conv2d_ConvNN_Attn_Spatial_N_8.name)
print("Num params: " + str(count_parameters(B_Conv2d_ConvNN_Attn_Spatial_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Conv2d_ConvNN_Attn_Spatial_N_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Conv2d_ConvNN_Attn_Spatial_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Conv2d_ConvNN_Attn_Spatial_N_8, cifar100.test_loader, device='cuda')


Model: B_Conv2d_ConvNN_Attn_Spatial_K_N
Num params: 3008660

Epoch 1, Time: 25.078577041625977, Loss: 4.548465578147518
Epoch 1, Accuracy: 2.89%
Epoch 2, Time: 25.79613471031189, Loss: 4.218447228526825
Epoch 2, Accuracy: 4.45%
Epoch 3, Time: 23.5074462890625, Loss: 4.106695887987571
Epoch 3, Accuracy: 6.03%
Epoch 4, Time: 23.380475759506226, Loss: 3.9891736342778903
Epoch 4, Accuracy: 7.05%
Epoch 5, Time: 24.901390552520752, Loss: 3.916127626548338
Epoch 5, Accuracy: 8.06%
Epoch 6, Time: 22.787665605545044, Loss: 3.8646726614374027
Epoch 6, Accuracy: 8.04%
Epoch 7, Time: 24.33418583869934, Loss: 3.830144675186528
Epoch 7, Accuracy: 9.01%
Epoch 8, Time: 24.974732875823975, Loss: 3.7943236785166707
Epoch 8, Accuracy: 9.46%
Epoch 9, Time: 25.752909421920776, Loss: 3.7611304948397
Epoch 9, Accuracy: 10.18%
Epoch 10, Time: 25.517377853393555, Loss: 3.7318538198690585
Epoch 10, Accuracy: 10.71%
Epoch 11, Time: 23.425118923187256, Loss: 3.696394022468411
Epoch 11, Accuracy: 11.07%
Epoch 12, 

18.16

In [4]:

# Branching Attention + ConvNN All Samples
B_Attention_ConvNN_All_8 = B_Attention_ConvNN_K_All(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_All_8.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_All_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_All_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_All_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_All_8, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_K_All
Num params: 2076132

Epoch 1, Time: 42.78837847709656, Loss: 4.37866805032696
Epoch 1, Accuracy: 10.98%
Epoch 2, Time: 41.459611892700195, Loss: 3.7201251873884664
Epoch 2, Accuracy: 15.41%
Epoch 3, Time: 41.99383068084717, Loss: 3.4824850684236686
Epoch 3, Accuracy: 17.42%
Epoch 4, Time: 42.13182306289673, Loss: 3.322387262073624
Epoch 4, Accuracy: 18.3%
Epoch 5, Time: 42.166407108306885, Loss: 3.189154283774783
Epoch 5, Accuracy: 19.24%
Epoch 6, Time: 42.1345636844635, Loss: 3.063927794051597
Epoch 6, Accuracy: 19.08%
Epoch 7, Time: 42.24527192115784, Loss: 2.9284812522971113
Epoch 7, Accuracy: 19.79%
Epoch 8, Time: 42.15994358062744, Loss: 2.7929148497179037
Epoch 8, Accuracy: 20.03%
Epoch 9, Time: 42.294593811035156, Loss: 2.646330843343759
Epoch 9, Accuracy: 19.39%
Epoch 10, Time: 42.59058141708374, Loss: 2.4833685222184263
Epoch 10, Accuracy: 20.39%
Epoch 11, Time: 42.08353042602539, Loss: 2.3182443798045673
Epoch 11, Accuracy: 19.76%
Epoch 12, Tim

16.95

In [5]:

# Branching Attention + ConvNN N Samples
B_Attention_ConvNN_N_8 = B_Attention_ConvNN_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_N_8.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_N_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_N_8, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_K_N
Num params: 2076132

Epoch 1, Time: 40.35176420211792, Loss: 4.357340590728214
Epoch 1, Accuracy: 8.39%
Epoch 2, Time: 39.151432037353516, Loss: 3.8646117128679514
Epoch 2, Accuracy: 13.54%
Epoch 3, Time: 39.52498483657837, Loss: 3.675698105331577
Epoch 3, Accuracy: 15.1%
Epoch 4, Time: 40.10363173484802, Loss: 3.543713039449414
Epoch 4, Accuracy: 15.83%
Epoch 5, Time: 40.760074615478516, Loss: 3.4333597927752053
Epoch 5, Accuracy: 16.86%
Epoch 6, Time: 40.19130992889404, Loss: 3.3256584918102643
Epoch 6, Accuracy: 17.47%
Epoch 7, Time: 38.64905261993408, Loss: 3.2245865761471526
Epoch 7, Accuracy: 18.18%
Epoch 8, Time: 39.3338942527771, Loss: 3.1217342907815335
Epoch 8, Accuracy: 18.47%
Epoch 9, Time: 40.53194332122803, Loss: 3.0138638266517073
Epoch 9, Accuracy: 18.53%
Epoch 10, Time: 38.539165019989014, Loss: 2.902512136627646
Epoch 10, Accuracy: 18.91%
Epoch 11, Time: 39.71992349624634, Loss: 2.792842775659488
Epoch 11, Accuracy: 19.63%
Epoch 12, Time:

16.73

In [6]:

# Branching Attention + ConvNN Spatial Samples
B_Attention_ConvNN_Spatial_N_8 = B_Attention_ConvNN_Spatial_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_Spatial_N_8.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_Spatial_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_Spatial_N_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_Spatial_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_Spatial_N_8, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_Spatial_K_N
Num params: 2076132

Epoch 1, Time: 47.01346230506897, Loss: 4.402839050268578
Epoch 1, Accuracy: 10.72%
Epoch 2, Time: 47.16226243972778, Loss: 3.7884137853027307
Epoch 2, Accuracy: 14.94%
Epoch 3, Time: 47.22380328178406, Loss: 3.5345672033631894
Epoch 3, Accuracy: 17.19%
Epoch 4, Time: 46.56236791610718, Loss: 3.367647219192037
Epoch 4, Accuracy: 18.28%
Epoch 5, Time: 46.99730563163757, Loss: 3.234150301160105
Epoch 5, Accuracy: 19.1%
Epoch 6, Time: 46.34465575218201, Loss: 3.109001764251143
Epoch 6, Accuracy: 18.98%
Epoch 7, Time: 46.30579423904419, Loss: 2.9763618137525474
Epoch 7, Accuracy: 18.76%
Epoch 8, Time: 47.24799728393555, Loss: 2.8488756375544515
Epoch 8, Accuracy: 18.31%
Epoch 9, Time: 46.81059741973877, Loss: 2.7134539757848093
Epoch 9, Accuracy: 18.71%
Epoch 10, Time: 46.35852289199829, Loss: 2.570150625522789
Epoch 10, Accuracy: 18.22%
Epoch 11, Time: 46.509584188461304, Loss: 2.418926392369868
Epoch 11, Accuracy: 17.93%
Epoch 12

14.86

In [7]:

# Branching Attention ConvNN Attn N Samples
B_Attention_ConvNN_Attn_N_8 = B_Attention_ConvNN_Attn_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_Attn_N_8.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_Attn_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_Attn_N_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_Attn_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_Attn_N_8, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_Attn_K_N
Num params: 3648996

Epoch 1, Time: 41.34752321243286, Loss: 4.608925615735067
Epoch 1, Accuracy: 0.88%
Epoch 2, Time: 40.87327766418457, Loss: 4.582518955020953
Epoch 2, Accuracy: 2.26%
Epoch 3, Time: 41.207908630371094, Loss: 4.295736515308585
Epoch 3, Accuracy: 3.73%
Epoch 4, Time: 40.40105104446411, Loss: 4.179699282512031
Epoch 4, Accuracy: 4.51%
Epoch 5, Time: 41.01796817779541, Loss: 4.1286083594002685
Epoch 5, Accuracy: 5.01%
Epoch 6, Time: 40.72020101547241, Loss: 4.090508497889389
Epoch 6, Accuracy: 5.08%
Epoch 7, Time: 40.992631673812866, Loss: 4.054497623992393
Epoch 7, Accuracy: 5.79%
Epoch 8, Time: 39.94703936576843, Loss: 4.0273273454602725
Epoch 8, Accuracy: 5.85%
Epoch 9, Time: 40.88292145729065, Loss: 3.9827961875959432
Epoch 9, Accuracy: 6.34%
Epoch 10, Time: 40.340638160705566, Loss: 3.934138523648157
Epoch 10, Accuracy: 7.8%
Epoch 11, Time: 42.50271797180176, Loss: 3.88467544271513
Epoch 11, Accuracy: 8.8%
Epoch 12, Time: 40.45371

19.16

In [8]:

# Branching Attention ConvNN Attn Spatial N Samples
B_Attention_ConvNN_Attn_Spatial_N_8 = B_Attention_ConvNN_Attn_Spatial_K_N(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Attention_ConvNN_Attn_Spatial_N_8.name)
print("Num params: " + str(count_parameters(B_Attention_ConvNN_Attn_Spatial_N_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_ConvNN_Attn_Spatial_N_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_ConvNN_Attn_Spatial_N_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_ConvNN_Attn_Spatial_N_8, cifar100.test_loader, device='cuda')


Model: B_Attention_ConvNN_Attn_Spatial_K_N
Num params: 3157476

Epoch 1, Time: 42.588977336883545, Loss: 4.608345166496608
Epoch 1, Accuracy: 1.11%
Epoch 2, Time: 42.35442543029785, Loss: 4.605258856892891
Epoch 2, Accuracy: 1.02%
Epoch 3, Time: 42.31302094459534, Loss: 4.605271471735766
Epoch 3, Accuracy: 1.04%
Epoch 4, Time: 40.598278522491455, Loss: 4.605241086476904
Epoch 4, Accuracy: 1.0%
Epoch 5, Time: 41.971935987472534, Loss: 4.605237823008271
Epoch 5, Accuracy: 1.02%
Epoch 6, Time: 42.2992901802063, Loss: 4.605235879683433
Epoch 6, Accuracy: 0.99%
Epoch 7, Time: 41.27410435676575, Loss: 4.605238096183523
Epoch 7, Accuracy: 1.0%
Epoch 8, Time: 43.01552963256836, Loss: 4.605232753412193
Epoch 8, Accuracy: 1.0%
Epoch 9, Time: 40.908488512039185, Loss: 4.605233618670412
Epoch 9, Accuracy: 1.0%
Epoch 10, Time: 41.964683055877686, Loss: 4.605233775990088
Epoch 10, Accuracy: 1.0%
Epoch 11, Time: 42.68009686470032, Loss: 4.605232071693596
Epoch 11, Accuracy: 1.0%
Epoch 12, Time: 42.56

1.0

In [9]:

# Branching Attention Conv2d
B_Attention_Conv2d_8 = B_Attention_Conv2d(num_layers=8, num_classes=100, device='cuda')

print("Model: " + B_Attention_Conv2d_8.name)
print("Num params: " + str(count_parameters(B_Attention_Conv2d_8)))
print()

# Test + Eval
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(B_Attention_Conv2d_8.parameters(), lr=0.0001)
num_epochs = 100 
train_eval(B_Attention_Conv2d_8, cifar100.train_loader, cifar100.test_loader, criterion, optimizer, num_epochs, device='cuda')
evaluate_accuracy(B_Attention_Conv2d_8, cifar100.test_loader, device='cuda')



Model: B_Attention_Conv2d
Num params: 1824916

Epoch 1, Time: 30.260902166366577, Loss: 4.563486330344549
Epoch 1, Accuracy: 2.22%
Epoch 2, Time: 30.82672429084778, Loss: 4.319654408318307
Epoch 2, Accuracy: 3.95%
Epoch 3, Time: 30.171480655670166, Loss: 4.241277725495341
Epoch 3, Accuracy: 3.97%
Epoch 4, Time: 31.566109895706177, Loss: 4.206245794625539
Epoch 4, Accuracy: 4.59%
Epoch 5, Time: 30.62748098373413, Loss: 4.182598388713339
Epoch 5, Accuracy: 4.57%
Epoch 6, Time: 31.513408660888672, Loss: 4.16277901229956
Epoch 6, Accuracy: 4.69%
Epoch 7, Time: 31.444703817367554, Loss: 4.1460971725566305
Epoch 7, Accuracy: 5.35%
Epoch 8, Time: 30.857640027999878, Loss: 4.129822950533894
Epoch 8, Accuracy: 4.96%
Epoch 9, Time: 31.881611824035645, Loss: 4.112530844900614
Epoch 9, Accuracy: 5.57%
Epoch 10, Time: 31.165534257888794, Loss: 4.099359167811206
Epoch 10, Accuracy: 5.76%
Epoch 11, Time: 30.925509214401245, Loss: 4.058831531983202
Epoch 11, Accuracy: 7.07%
Epoch 12, Time: 30.39747595

19.8