In [None]:
# Compute mean image. This takes a while to run on CPU.
# http://stackoverflow.com/questions/7762948/how-to-convert-an-rgb-image-to-numpy-array

import os
from os import listdir
from os.path import isfile, join
from PIL import Image
import numpy as np

TRAIN_DIR = "data/images/train/"

# Running statistics over the per-image channel means, using Welford's online
# update (the method Knuth presents for running mean / sdev):
#   m - running mean, S - running sum of squared deviations, n - image count.
m, S, n = 0, 0, 0

# Every entry of TRAIN_DIR that is not a regular file is treated as a
# sub-directory of images.
subdirs = [d for d in listdir(TRAIN_DIR) if not isfile(join(TRAIN_DIR, d))]
for i, d in enumerate(subdirs):
    files = [f for f in listdir(join(TRAIN_DIR, d)) if isfile(join(TRAIN_DIR, d, f))]
    print("Processing: " + d + " " + str(i) + "/" + str(len(subdirs)))
    for f in files:
        file_path = join(TRAIN_DIR, d, f)
        # Force three channels: a grayscale or RGBA file would otherwise yield
        # a scalar / 4-vector that either broadcasts silently into the RGB
        # statistics or raises a shape error. The context manager closes the
        # file handle once the pixels have been copied out.
        with Image.open(file_path) as img:
            data = np.asarray(img.convert("RGB"), dtype="int32")

        # Per-channel average of this image, on the 0..255 pixel scale.
        x = np.average(data, axis=(0,1))
        n = n + 1
        m_prev = m
        m = m + (x - m) / n
        S = S + (x - m) * (x - m_prev)

mean = m
# The sample standard deviation needs at least two images; report NaN otherwise
# instead of dividing by zero.
# NOTE(review): this is the spread of the per-image channel means, not of the
# individual pixel values - confirm that is the statistic wanted for Normalize.
sdev = np.sqrt(S / (n - 1)) if n > 1 else np.full(3, np.nan)

# The two "Tensor" values were printed but never defined (NameError).
# Presumably they are the same statistics on the [0, 1] scale that
# transforms.ToTensor() produces - TODO confirm.
tensor_mean = mean / 255.0
tensor_sdev = sdev / 255.0

print("Mean: " + str(mean))
print("Sdev: " + str(sdev))
print("Tensor mean: " + str(tensor_mean))
print("Tensor sdev: " + str(tensor_sdev))


In [None]:
# Sanity check: show whether PyTorch can see a CUDA device. The training cells
# further down convert tensors to torch.cuda.* types unconditionally.
# NOTE(review): `torch` is imported in the following cell - run that one first.
torch.cuda.is_available()

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data as data
from torch.utils.data import DataLoader
from torch.utils.data import sampler
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import numpy as np
import json

%load_ext autoreload
%autoreload 2

In [2]:
from dataset import AttnDataset

# --- Locations of the image splits and of the bounding-box annotations ---
TRAIN_DIR = "data/images/train/"
VAL_DIR = "data/images/val/"
TEST_DIR = "data/images/test/"
BBOX_DIR = "data/bbox/json/"

HOLDOUT_TRAIN_DIR = "data/images/holdout/train/"
HOLDOUT_VAL_DIR = "data/images/holdout/val/"
HOLDOUT_TEST_DIR = "data/images/holdout/test/"

BATCH_SIZE = 64

# Per-channel statistics computed on our own training sample.
mean = [0.48678957, 0.46590506, 0.41864723]
sdev = [0.15854293, 0.15514862, 0.18052906]

# Alternative: the ImageNet statistics.
# mean = [0.485, 0.456, 0.406]
# sdev = [0.229, 0.224, 0.225]

# Pipelines follow https://github.com/pytorch/examples/blob/master/imagenet/main.py
_normalize = transforms.Normalize(mean=mean, std=sdev)

# Training: random crop + horizontal flip as augmentation.
TRAIN_TRANSFORMS = transforms.Compose([
    transforms.RandomSizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    _normalize,
])
# Validation / test: deterministic rescale followed by a centre crop.
VAL_TEST_TRANSFORMS = transforms.Compose([
    transforms.Scale(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    _normalize,
])


def _make_split(image_dir, transform):
    """Build the AttnDataset for `image_dir` and a shuffled DataLoader over it."""
    dataset = AttnDataset(image_dir, BBOX_DIR, transform=transform)
    return dataset, DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


train_data, train_loader = _make_split(TRAIN_DIR, TRAIN_TRANSFORMS)
val_data, val_loader = _make_split(VAL_DIR, VAL_TEST_TRANSFORMS)
test_data, test_loader = _make_split(TEST_DIR, VAL_TEST_TRANSFORMS)

holdout_train_data, holdout_train_loader = _make_split(HOLDOUT_TRAIN_DIR, TRAIN_TRANSFORMS)
holdout_val_data, holdout_val_loader = _make_split(HOLDOUT_VAL_DIR, VAL_TEST_TRANSFORMS)
holdout_test_data, holdout_test_loader = _make_split(HOLDOUT_TEST_DIR, VAL_TEST_TRANSFORMS)

# Minibatches per epoch for the three main splits, one count per line
# (roughly len(dataset) / BATCH_SIZE).
print(len(train_loader), len(val_loader), len(test_loader), sep="\n")
print("Datasets loaded.")

1276
125
123
Datasets loaded.


In [None]:
# Smoke test: pull a single minibatch from train_loader, convert it to CUDA
# tensors wrapped in Variables, print its index, and stop.
for minibatch, (x, y) in enumerate(train_loader):
    x_var = Variable(x.type(torch.cuda.FloatTensor))
    y_var = Variable(y.type(torch.cuda.LongTensor))
    print(minibatch)
    # Optional inspection of the target layout. Judging by how later cells
    # slice y, column 0 is the class label and the remaining columns carry
    # the image size and the bounding box - TODO confirm against AttnDataset.
    # print(x_var.size())
    # print(y_var)
    # labels = y_var[:,0]
    # bounds = y_var[:,1:3]
    # bboxes = y_var[:,3:]
    # print(labels)
    # print(bounds)
    # print(bboxes)
    break

In [3]:
# Training loop for host model

from datetime import datetime
from time import strftime

def train_host_model(model, train_loader, val_loader, 
               checkpoints_path, log_path, lr=1e-2, start_epoch=0, session_id=0,
               num_epochs=50):
    """Train a classification ("host") model and evaluate it after every epoch.

    For each epoch in ``range(start_epoch, num_epochs)`` the function:
      1. runs one optimisation pass over ``train_loader`` with Adam and a
         cross-entropy loss on the class label stored in column 0 of ``y``;
      2. measures top-1 accuracy over ``val_loader``;
      3. appends ``session_id<TAB>epoch<TAB>val_acc<TAB>time`` to
         ``log_path + str(session_id) + ".log"``;
      4. saves ``model.state_dict()`` to
         ``checkpoints_path + "session_<sid>_epoch_<epoch>.checkpoint"``.

    Args:
        model: network to train; only parameters with ``requires_grad`` set
            are handed to the optimizer.
        train_loader: iterable of ``(x, y)`` training minibatches.
        val_loader: iterable of ``(x, y)`` validation minibatches.
        checkpoints_path: filename *prefix* for checkpoints (not a directory).
        log_path: prefix (typically a directory ending in "/") for the log file.
        lr: Adam learning rate.
        start_epoch: first epoch index; use it to resume a previous session.
        session_id: identifier embedded in log and checkpoint names.
        num_epochs: exclusive upper bound on the epoch index (previously the
            hard-coded constant 50).

    Returns:
        None.

    Notes:
        Batches are converted to ``torch.cuda`` tensor types, so running real
        data requires a CUDA device. If ``val_loader`` yields nothing the
        logged accuracy is ``nan`` (it used to raise NameError).
    """
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
    # The loss module is stateless: build it once, not once per minibatch.
    loss_fn = torch.nn.CrossEntropyLoss()
    sid = str(session_id)

    for epoch in range(start_epoch, num_epochs):

        print("Epoch " + str(epoch))

        # ---- Train ----
        print("  Train")
        model.train()
        for minibatch, (x, y) in enumerate(train_loader):

            x_var = Variable(x.type(torch.cuda.FloatTensor))
            labels = Variable(y[:,0].type(torch.cuda.LongTensor))

            # Call the module rather than .forward() so registered hooks run.
            scores = model(x_var)
            loss = loss_fn(scores, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if minibatch % 10 == 0:
                print("      " + str(minibatch) + " " + str(float(loss.data.cpu().numpy())))

        # ---- Eval ----
        print("  Eval")
        # Plain Python counters: no device round-trip is needed per minibatch
        # just to form the running accuracy.
        num_correct = 0
        num_samples = 0
        val_acc = float("nan")  # stays NaN when val_loader is empty
        model.eval()
        for minibatch, (x, y) in enumerate(val_loader):

            x_var = Variable(x.type(torch.cuda.FloatTensor), volatile=True)
            labels = y[:,0].type(torch.cuda.LongTensor)

            scores = model(x_var)
            _, preds = scores.data.max(1) # argmax

            num_correct += int((preds == labels).sum())
            num_samples += preds.size(0)
            val_acc = float(num_correct) / num_samples

            if minibatch % 10 == 0:
                print("      " + str(minibatch) + " " + str(val_acc))

        timenow = str(datetime.now().time())

        # One tab-separated line per epoch, appended so resumed sessions
        # extend the same log.
        with open(log_path + sid + ".log", 'a') as f:
            f.write("".join([sid, "\t",
                        str(epoch), "\t",
                        str(val_acc), "\t",
                        timenow, "\n"]))

        torch.save(model.state_dict(), checkpoints_path + "session_" + sid + "_epoch_" + str(epoch) + ".checkpoint")
    

In [9]:
### RESNET

from models import ResnetHost

# Output sizes of the two classifiers: the main split and the holdout split.
N_TRAIN_CLASSES = 160
N_TEST_CLASSES = 38


def _new_resnet_host(n_classes):
    """Create a ResnetHost with `n_classes` outputs on the GPU.

    set_retrain is called with the last ResNet stages and True; presumably
    this marks those layers as trainable - see models.ResnetHost.
    """
    host = ResnetHost(n_classes=n_classes)
    host.cuda()
    host.set_retrain(["conv5_x", "avg_pool", "fc"], True)
    return host


train_host_resnet = _new_resnet_host(N_TRAIN_CLASSES)
test_host_resnet = _new_resnet_host(N_TEST_CLASSES)


In [None]:
# Resume the 160-way ResNet host from its epoch-10 checkpoint
# (about 80% top-1 validation accuracy) and continue from epoch 11.
train_host_resnet.load_state_dict(
    torch.load("data/models/train_resnet_session_0_epoch_10.checkpoint"))
train_host_model(
    model=train_host_resnet,
    train_loader=train_loader,
    val_loader=val_loader,
    checkpoints_path="data/models/train_resnet_",
    log_path="data/logs/",
    lr=1e-3,
    start_epoch=11,
    session_id=0,
)

In [None]:
# Resume the 38-way (holdout) ResNet host from its epoch-16 checkpoint
# (about 83.6% top-1 validation accuracy) and continue from epoch 17.
test_host_resnet.load_state_dict(
    torch.load("data/models/test_resnet_session_0_epoch_16.checkpoint"))
train_host_model(
    model=test_host_resnet,
    train_loader=holdout_train_loader,
    val_loader=holdout_val_loader,
    checkpoints_path="data/models/test_resnet_",
    log_path="data/logs/",
    lr=1e-3,
    start_epoch=17,
    session_id=0,
)

In [4]:
### VGG

from models import ModVggHost

# Output sizes of the two classifiers: the main split and the holdout split.
N_TRAIN_CLASSES = 160
N_TEST_CLASSES = 38

# Host for the 160-class split, moved to the GPU.
train_host_vggnet = ModVggHost(n_classes=N_TRAIN_CLASSES)
train_host_vggnet.cuda()
train_host_vggnet.set_retrain(["bn2", "feats3", "bn3", "feats4", "bn4", "feats5", "bn5", "last"], True)

# Host for the 38-class holdout split. The retrain call must target *this*
# model: it previously re-configured train_host_vggnet a second time
# (copy-paste slip), leaving test_host_vggnet unconfigured.
test_host_vggnet = ModVggHost(n_classes=N_TEST_CLASSES)
test_host_vggnet.cuda()
test_host_vggnet.set_retrain(["bn2", "feats3", "bn3", "feats4", "bn4", "feats5", "bn5", "last"], True)

In [6]:
# Train the 160-way VGG host from scratch on the main split.
# To resume instead, load a checkpoint first:
# train_host_vggnet.load_state_dict(torch.load("data/models/train_vggnet_session_0_epoch_10.checkpoint"))
train_host_model(
    model=train_host_vggnet,
    train_loader=train_loader,
    val_loader=val_loader,
    checkpoints_path="data/models/train_vggnet_",
    log_path="data/logs/",
    lr=1e-3,
    start_epoch=0,
    session_id=0,
)

Epoch 0
  Train
      0 5.069231033325195
      10 4.877032279968262
      20 4.938404560089111
      30 5.020724296569824
      40 4.909976482391357
      50 5.062371253967285
      60 4.909876823425293
      70 4.970588684082031
      80 4.9219136238098145
      90 4.841250419616699
      100 4.859414100646973
      110 4.740114212036133
      120 4.833495616912842
      130 4.836550235748291
      140 4.946642875671387
      150 4.869873046875
      160 4.816774845123291
      170 4.744311332702637
      180 4.759790897369385
      190 4.6575140953063965
      200 4.638246059417725
      210 4.815683841705322
      220 4.856706619262695
      230 4.796492099761963
      240 5.200757026672363
      250 4.7848968505859375
      260 4.512701511383057
      270 4.5988993644714355
      280 4.4423017501831055
      290 4.7351393699646
      300 4.641568660736084
      310 4.798869609832764
      320 4.681436061859131
      330 4.615627288818359
      340 4.6568427085876465
      350 4.69

KeyboardInterrupt: 

In [None]:
# Train a VGG host on the holdout split, starting from epoch 0.
# NOTE(review): this passes the 160-class `train_host_vggnet` together with the
# holdout loaders and writes checkpoints under the "train_vggnet_" prefix with
# session_id=0 - the same names the main-split VGG run uses, so those files
# would be overwritten. The ResNet holdout cell uses the test host and a
# "test_resnet_" prefix instead; confirm whether `test_host_vggnet` and a
# "test_vggnet_" prefix were intended here.
# train_host_vggnet.load_state_dict(torch.load("data/models/train_vggnet_session_0_epoch_10.checkpoint"))
train_host_model(train_host_vggnet, holdout_train_loader, holdout_val_loader,
                "data/models/train_vggnet_", "data/logs/",
                lr=1e-2, start_epoch=0, session_id=0)

In [2]:
from dataset import AttnDataset

# --- Locations of the image splits and of the bounding-box annotations ---
TRAIN_DIR = "data/images/train/"
VAL_DIR = "data/images/val/"
TEST_DIR = "data/images/test/"
BBOX_DIR = "data/bbox/json/"

HOLDOUT_TRAIN_DIR = "data/images/holdout/train/"
HOLDOUT_VAL_DIR = "data/images/holdout/val/"
HOLDOUT_TEST_DIR = "data/images/holdout/test/"

BATCH_SIZE = 64

# Per-channel normalisation statistics, as lists (for torchvision) and as
# arrays (for the element-wise arithmetic below).
mean = [0.48678957, 0.46590506, 0.41864723]
sdev = [0.15854293, 0.15514862, 0.18052906]
arrmean = np.asarray(mean)
arrsdev = np.asarray(sdev)

# Deterministic pipeline (no augmentation) so that the bounding boxes can be
# mapped through the same geometry.
IMG_TRANSFORMS = transforms.Compose([
    transforms.Scale(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=sdev),
])

# Inverse of the normalisation only: first multiply by sdev (dividing by
# 1/sdev), then add the mean back (subtracting -mean), then convert to PIL.
# The rescale and the crop discard information and cannot be reversed.
REVERSE_IMG_TRANSFORMS = transforms.Compose([
    transforms.Normalize(mean=[0, 0, 0], std=np.divide(1.0, arrsdev).tolist()),
    transforms.Normalize(mean=np.negative(arrmean).tolist(), std=[1, 1, 1]),
    transforms.ToPILImage(),
])

def BBOX_TRANSFORMS(y, scale_size=256, crop_size=224):
    """Map a bounding-box target through the Scale + CenterCrop image pipeline.

    Args:
        y: indexable of 7 numbers
            ``[synsetid, width, height, x1, y1, x2, y2]`` in the coordinates
            of the original image.
        scale_size: length the shorter image side is rescaled to
            (must match ``transforms.Scale``; default 256).
        crop_size: side of the square centre crop
            (must match ``transforms.CenterCrop``; default 224).

    Returns:
        Integer numpy array ``[synsetid, crop_size, crop_size, x1, y1, x2, y2]``
        with the box expressed in the cropped image and clipped to its bounds.

    Raises:
        ValueError: if the stored image width or height is not positive.
    """
    synsetid, xmax, ymax = y[0], y[1], y[2] # width, height
    x1, y1, x2, y2 = y[3], y[4], y[5], y[6]

    if xmax <= 0 or ymax <= 0:
        raise ValueError("image width and height must be positive, got "
                         + str(xmax) + "x" + str(ymax))

    # --- Scale: the shorter side becomes scale_size, aspect ratio is kept ---
    # Mirrors https://github.com/pytorch/vision/blob/master/torchvision/transforms.py
    already_scaled = ((xmax <= ymax and xmax == scale_size)
                      or (ymax <= xmax and ymax == scale_size))
    if not already_scaled:
        if xmax < ymax:
            xmax_new = float(scale_size)
            ymax_new = float(scale_size) * ymax / xmax
        else:
            xmax_new = float(scale_size) * xmax / ymax
            ymax_new = float(scale_size)
        x_ratio = xmax_new / xmax
        y_ratio = ymax_new / ymax
        x1, x2 = int(x1 * x_ratio), int(x2 * x_ratio)
        y1, y2 = int(y1 * y_ratio), int(y2 * y_ratio)
        xmax, ymax = int(xmax_new), int(ymax_new)

    # --- CenterCrop ---
    # Round the *halved* margin, as CenterCrop does. The previous expression
    # rounded the integer margin first and then truncated the half, which
    # shifted the crop window by one pixel for some odd margins.
    cx1 = int(round((xmax - crop_size) / 2.0))
    cy1 = int(round((ymax - crop_size) / 2.0))
    cx2 = cx1 + crop_size
    cy2 = cy1 + crop_size

    # Clip the box to the crop window and shift the origin to its corner.
    x1 = min(max(x1, cx1), cx2) - cx1
    y1 = min(max(y1, cy1), cy2) - cy1
    x2 = min(max(x2, cx1), cx2) - cx1
    y2 = min(max(y2, cy1), cy2) - cy1
    xmax, ymax = crop_size, crop_size

    return np.array([synsetid, xmax, ymax, x1, y1, x2, y2]).astype(int)


def _make_bbox_split(image_dir):
    """Build the bbox-aware AttnDataset for `image_dir` plus a shuffled loader.

    Images go through IMG_TRANSFORMS and targets through BBOX_TRANSFORMS, so
    the boxes are expressed in the coordinates of the cropped image.
    """
    dataset = AttnDataset(image_dir, BBOX_DIR,
                          transform=IMG_TRANSFORMS,
                          target_transform=BBOX_TRANSFORMS)
    return dataset, DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


# Main split
train_bbox_data, train_bbox_loader = _make_bbox_split(TRAIN_DIR)
val_bbox_data, val_bbox_loader = _make_bbox_split(VAL_DIR)
test_bbox_data, test_bbox_loader = _make_bbox_split(TEST_DIR)

# Holdout split
holdout_train_bbox_data, holdout_train_bbox_loader = _make_bbox_split(HOLDOUT_TRAIN_DIR)
holdout_val_bbox_data, holdout_val_bbox_loader = _make_bbox_split(HOLDOUT_VAL_DIR)
holdout_test_bbox_data, holdout_test_bbox_loader = _make_bbox_split(HOLDOUT_TEST_DIR)

print("Datasets loaded.")

Datasets loaded.


In [19]:
# Training loop for aux model

from datetime import datetime
from time import strftime

def compute_iou(label_bboxes, output_bboxes):
    """Per-sample intersection-over-union between label and predicted boxes.

    Args:
        label_bboxes: N x 6 tensor, one row per sample, in the format
            ``[width, height, xmin, ymin, xmax, ymax]``; only the last four
            columns are used.
        output_bboxes: object exposing ``.data`` as an N x 4 tensor in the
            format ``[xmin, ymin, xmax, ymax]`` (e.g. the model's bbox output).

    Returns:
        Tensor of N IoU values on the same device as the inputs. Pairs whose
        union is not positive (e.g. two zero-area boxes) get 0 rather than
        NaN, so one degenerate sample cannot poison a running sum.
    """
    a = label_bboxes[:,2:]   # drop width/height, keep the 4 box coordinates
    b = output_bboxes.data   # 4 values per sample

    a_xmin, a_ymin, a_xmax, a_ymax = a[:,0], a[:,1], a[:,2], a[:,3]
    b_xmin, b_ymin, b_xmax, b_ymax = b[:,0], b[:,1], b[:,2], b[:,3]

    # Intersection: overlap extent along each axis, see
    # https://stackoverflow.com/questions/27152904/calculate-overlapped-area-between-two-rectangles
    # Element-wise min/max keeps the (N,) shape even for a batch of one.
    dx = torch.min(a_xmax, b_xmax) - torch.max(a_xmin, b_xmin)
    dy = torch.min(a_ymax, b_ymax) - torch.max(a_ymin, b_ymin)

    # A negative extent on either axis means the boxes do not overlap;
    # clamping at zero makes the product vanish in that case without needing
    # a device-specific mask tensor.
    intersection = dx.clamp(min=0) * dy.clamp(min=0)

    # Union
    area_a = (a_xmax - a_xmin) * (a_ymax - a_ymin)
    area_b = (b_xmax - b_xmin) * (b_ymax - b_ymin)
    union = area_a + area_b - intersection

    # Guard the division: where union <= 0 divide by 1 and force the result
    # to 0 instead of producing NaN / inf.
    valid = (union > 0).type_as(union)
    safe_union = union * valid + (1 - valid)
    iou = intersection / safe_union * valid
    return iou
    

def train_full_model(model, train_loader, val_loader, 
               checkpoints_path, log_path, lr=1e-2, start_epoch=0, session_id=0,
               num_epochs=50, do_train=False):
    """Evaluate (and optionally train) the host + aux bounding-box model.

    For each epoch in ``range(start_epoch, num_epochs)`` the function:
      1. if ``do_train`` is true, runs one Adam pass over ``train_loader``
         minimising the MSE between predicted and label box coordinates;
      2. computes the mean IoU over ``val_loader`` with ``compute_iou``;
      3. appends ``session_id<TAB>epoch<TAB>mean_iou<TAB>time`` to
         ``log_path + str(session_id) + ".log"``;
      4. saves ``model.state_dict()`` to
         ``checkpoints_path + "session_<sid>_epoch_<epoch>.checkpoint"``.

    Args:
        model: callable returning ``(scores, bboxes)`` for a batch of images;
            ``bboxes`` holds 4 values per image.
        train_loader: iterable of ``(x, y)`` training minibatches; column 0
            of ``y`` is skipped and the remaining 6 columns are the
            ``[width, height, xmin, ymin, xmax, ymax]`` label box.
        val_loader: iterable of ``(x, y)`` validation minibatches.
        checkpoints_path: filename *prefix* for checkpoints (not a directory).
        log_path: prefix (typically a directory ending in "/") for the log file.
        lr: Adam learning rate; only used when ``do_train`` is true.
        start_epoch: first epoch index.
        session_id: identifier embedded in log and checkpoint names.
        num_epochs: exclusive upper bound on the epoch index (previously the
            hard-coded constant 50).
        do_train: run the training pass. Defaults to False because the
            training code had been disabled in place (wrapped in a string
            literal with the optimizer commented out); the default keeps that
            evaluation-only behaviour.

    Returns:
        None.

    Notes:
        Batches are converted to ``torch.cuda`` tensor types, so running real
        data requires a CUDA device. If ``val_loader`` yields nothing the
        logged mean IoU is ``nan`` (it used to raise NameError).
    """
    optimizer = None
    loss_fn = None
    if do_train:
        optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
        # This loss makes the aux net indifferent to misclassifications by
        # the base model: only the box coordinates are compared.
        loss_fn = torch.nn.MSELoss()
    sid = str(session_id)

    for epoch in range(start_epoch, num_epochs):

        print("Epoch " + str(epoch))

        # ---- Train (opt-in) ----
        if do_train:
            print("  Train")
            model.train()
            for minibatch, (x, y) in enumerate(train_loader):

                x_var = Variable(x.type(torch.cuda.FloatTensor))
                label_bboxes = Variable(y[:,1:].type(torch.cuda.FloatTensor))

                # The model's bbox output only has 4 values per image, so the
                # width/height columns of the label are dropped.
                (scores, bboxes) = model(x_var)
                loss = loss_fn(bboxes, label_bboxes[:,2:])

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                if minibatch % 10 == 0:
                    print("      " + str(minibatch) + " " + str(float(loss.data.cpu().numpy())))

        # ---- Eval ----
        print("  Eval")
        iou_sum = 0.0
        imagenet_loc_sum = 0.0
        num_samples = 0
        mean_iou = float("nan")           # stay NaN when val_loader is empty
        mean_imagenet_loc = float("nan")
        model.eval()
        for minibatch, (x, y) in enumerate(val_loader):

            x_var = Variable(x.type(torch.cuda.FloatTensor), volatile=True)
            label_bboxes = y[:,1:].type(torch.cuda.FloatTensor)

            # Call the module rather than .forward() so registered hooks run.
            (scores, bboxes) = model(x_var)
            iou = compute_iou(label_bboxes, bboxes)
            iou_sum += float(torch.sum(iou))

            # NOTE(review): this adds up the IoU of the samples whose IoU is
            # at least 0.5. The usual ImageNet localisation criterion *counts*
            # such samples (i.e. sums the mask) - confirm which is intended.
            mask = (iou >= 0.5).type_as(iou)
            imagenet_loc_sum += float(torch.sum(iou * mask))

            num_samples += bboxes.size(0)

            mean_iou = iou_sum / num_samples
            mean_imagenet_loc = imagenet_loc_sum / num_samples

            if minibatch % 10 == 0:
                print("      " + str(minibatch) + " " + str(mean_iou) + " " + str(mean_imagenet_loc))

        timenow = str(datetime.now().time())

        # One tab-separated line per epoch, appended to the session log.
        with open(log_path + sid + ".log", 'a') as f:
            f.write("".join([sid, "\t",
                        str(epoch), "\t",
                        str(mean_iou), "\t",
                        timenow, "\n"]))

        torch.save(model.state_dict(), checkpoints_path + "session_" + sid + "_epoch_" + str(epoch) + ".checkpoint")

In [21]:
from models import ResnetHost, AuxNet, AuxResNet

# Number of classes of the main-split host classifier.
N_TRAIN_CLASSES = 160

# Load training host (resnet) from its epoch-10 checkpoint and freeze all parameters
train_host_resnet = ResnetHost(n_classes=N_TRAIN_CLASSES)
train_host_resnet.load_state_dict(torch.load("data/models/train_resnet_session_0_epoch_10.checkpoint"))
train_host_resnet.freeze_weights()

# Initialize new auxnet; set_retrain is given its "conv" and "fc" parts with
# True (presumably marking them as trainable - see models.AuxNet).
aux_net = AuxNet(spatial_size=28, channels=3)
aux_net.set_retrain(["conv", "fc"], True)

# Define the full network (host + aux) and move it to the GPU. The host was
# not moved to the GPU on its own above.
train_net = AuxResNet(train_host_resnet, aux_net)
train_net.cuda()
# NOTE(review): set_retrain is called with an empty layer list here - confirm
# what that does in models.AuxResNet.
train_net.set_retrain([], True)


In [22]:
# Load previously saved weights for the full (host + aux) network: session 10,
# epoch 1 of the "train_full_" checkpoints.
train_net.load_state_dict(torch.load("data/models/train_full_session_10_epoch_1.checkpoint"))

In [None]:
# Run the full-model loop on the main split as session 10, starting at epoch 0.
train_full_model(
    model=train_net,
    train_loader=train_bbox_loader,
    val_loader=val_bbox_loader,
    checkpoints_path="data/models/train_full_",
    log_path="data/logs/",
    lr=1e-4,
    start_epoch=0,
    session_id=10,
)

In [27]:
from models import ResnetHost, AuxNet, AuxResNet

# Number of classes of the holdout-split host classifier.
N_TEST_CLASSES = 38

# Load test host (resnet) from its epoch-16 checkpoint and freeze all parameters
test_host_resnet = ResnetHost(n_classes=N_TEST_CLASSES)
test_host_resnet.load_state_dict(torch.load("data/models/test_resnet_session_0_epoch_16.checkpoint"))
test_host_resnet.freeze_weights()

# Reuse the aux net from train_net (built in an earlier cell) and freeze it.
aux_net = train_net.auxnet
aux_net.freeze_weights()

# Or, initialize new AuxNet
# aux_net = AuxNet(spatial_size=28, channels=3)
# aux_net.set_retrain(["conv", "fc"], True)

# Define the full holdout network (host + aux) and move it to the GPU.
test_net = AuxResNet(test_host_resnet, aux_net)
test_net.cuda()
# Configure the network that was just built. This line previously called
# train_net.set_retrain(...) - a copy-paste slip from the training cell - so
# test_net itself was never configured.
test_net.set_retrain([], True)


In [28]:
# Load previously saved weights for the holdout full network: session 0,
# epoch 4 of the "test_newaux_full_" checkpoints.
test_net.load_state_dict(torch.load("data/models/test_newaux_full_session_0_epoch_4.checkpoint"))

In [None]:
# Run the full-model loop on the holdout split as session 1, starting at epoch 2.
train_full_model(
    model=test_net,
    train_loader=holdout_train_bbox_loader,
    val_loader=holdout_val_bbox_loader,
    checkpoints_path="data/models/test_newaux_full_",
    log_path="data/logs/",
    lr=1e-3,
    start_epoch=2,
    session_id=1,
)