# Import libraries

# In[0]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models

# Instantiate and make changes to model

# In[0]:
# Build a randomly initialised ResNet-18. Calling resnet18() with no arguments
# avoids the `pretrained=` keyword (deprecated in torchvision >= 0.13) while
# remaining valid, and equivalent, on older releases.
net_IBE = models.resnet18()

# The stock ResNet stem is a 7x7 convolution with stride 2 followed by a
# stride-2 max-pool, i.e. 4x downsampling before the first residual stage.
# Replace the conv with a 3x3, stride-1 convolution and the max-pool with a 2x
# bilinear upsample so spatial features are not downsampled too quickly.
# NOTE: unlike the stock stem (bias=False) this conv keeps nn.Conv2d's default
# bias term.
net_IBE.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)

# Drop the last two children (global average pool and fully connected head).
# The remaining children are conv1, bn1, relu, maxpool, layer1..layer4, so
# index 3 is the max-pool being swapped out.
net_IBE_ = list(net_IBE.children())[:-2]
net_IBE_[3] = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)

# The linear layer is replaced with a "pixelwise linear layer": a 1x1
# convolution mapping the 512 backbone channels to 10 outputs per location.
classifier = nn.Conv2d(512, 10, kernel_size=1)
torch.nn.init.kaiming_normal_(classifier.weight)
net_IBE_.append(classifier)

# No spatial aggregation: the 10-channel map is resized to 32x32, leaving a
# 10x32x32 tensor per image that can be used the same way as a semantic
# segmentation output.
net_IBE_.append(nn.Upsample(size=32, mode='bilinear', align_corners=False))
net_IBE = nn.Sequential(*net_IBE_)

# Define attention

# In[0]:
# Logsumexp is a smooth maximum: it approximates the largest component of the
# logit vector. Passing that value through a sigmoid yields an attention score
# in the open interval (0, 1).
def attention(x):
    """Return a (0, 1) attention map from a tensor of logits.

    The logits are reduced along dimension 1 with logsumexp (a smooth
    maximum); the reduced dimension is kept with size 1. The result is then
    squashed with a sigmoid.
    """
    smooth_max = torch.logsumexp(x, dim=1, keepdim=True)
    score = torch.sigmoid(smooth_max)
    return score

# Set up criterion, optimizer, lr_scheduler

# In[0]:
# Training setup for the IBE network.
# The model learns a binary classifier for each class: if any class is
# detected the attention should be closer to 1, otherwise closer to 0.
# NOTE(review): BCEWithLogitsLoss applies a sigmoid internally and therefore
# expects raw logits -- confirm the training loop does not feed it values that
# have already been passed through a sigmoid.
criterion_IBE = nn.BCEWithLogitsLoss()

# SGD with momentum and weight decay over every parameter of the network.
optimizer_IBE = optim.SGD(
    net_IBE.parameters(),
    lr=0.05,
    momentum=0.9,
    weight_decay=5e-4,
)

# Scale the learning rate by 0.1 when the scheduler's step count reaches 20,
# and again when it reaches 25.
lr_scheduler_IBE = optim.lr_scheduler.MultiStepLR(
    optimizer_IBE,
    milestones=[20, 25],
    gamma=0.1,
)

# In[0]:
# Snapshot the freshly built model, optimizer and LR scheduler so that the
# same starting point can be reloaded later.
state = {
    key: component.state_dict()
    for key, component in (
        ('net', net_IBE),
        ('optimizer', optimizer_IBE),
        ('lr_scheduler', lr_scheduler_IBE),
    )
}
torch.save(state, 'initial_state_IBE.pth')