In [1]:
%matplotlib inline
import sys
sys.path.append('../PythonAPI')
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
from cbas import CBAS
from pycocotools.coco import COCO
from pycocotools.cbas import CBAS
import cbas_construction_utils as ccu
import sklearn as sk

### Load COCO

In [2]:
# initialize COCO api for instance annotations and category info
cbas80=CBAS('../annotations/{}.json'.format('cbas80'))

# Get category index so we can go from image ids to category names
idToName={}
for c in cbas80.dataset['categories']:
    idToName[str(c['id'])]=c['name']

print("idToName: ", len(idToName.items()), idToName)
# base set is cbas34, and holdout set is cbas80-cbas34
# holdout is cbas80 - cbas34

loading annotations into memory...
Done (t=0.38s)
creating index...
index created!
idToName:  80 {'1': 'person', '2': 'bicycle', '3': 'car', '4': 'motorcycle', '5': 'airplane', '6': 'bus', '7': 'train', '8': 'truck', '9': 'boat', '10': 'traffic light', '11': 'fire hydrant', '13': 'stop sign', '14': 'parking meter', '15': 'bench', '16': 'bird', '17': 'cat', '18': 'dog', '19': 'horse', '20': 'sheep', '21': 'cow', '22': 'elephant', '23': 'bear', '24': 'zebra', '25': 'giraffe', '27': 'backpack', '28': 'umbrella', '31': 'handbag', '32': 'tie', '33': 'suitcase', '34': 'frisbee', '35': 'skis', '36': 'snowboard', '37': 'sports ball', '38': 'kite', '39': 'baseball bat', '40': 'baseball glove', '41': 'skateboard', '42': 'surfboard', '43': 'tennis racket', '44': 'bottle', '46': 'wine glass', '47': 'cup', '48': 'fork', '49': 'knife', '50': 'spoon', '51': 'bowl', '52': 'banana', '53': 'apple', '54': 'sandwich', '55': 'orange', '56': 'broccoli', '57': 'carrot', '58': 'hot dog', '59': 'pizza', '60': 

### Load cbas_34 base and holdout (AKA "target" classes) sets: 

In [3]:
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

transform = transforms.Compose([
#     transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])

base_train = torchvision.datasets.ImageFolder(root='../images/cbas34_train', transform=transform)
base_train_loader = torch.utils.data.DataLoader(base_train, batch_size=4, shuffle=True, num_workers=4)

base_valid = torchvision.datasets.ImageFolder(root='../images/cbas34_val', transform=transform)
base_valid_loader = torch.utils.data.DataLoader(base_valid, batch_size=4, shuffle=False, num_workers=2)

# get index for curriculum sampling
id2idx = {}
for i,img in enumerate(base_train.imgs):
    img_id_str = img[0].split('/')[4].split('.')[0]
    id2idx[img_id_str] = i


In [4]:
print(base_train.__dict__.keys())
print(base_train.imgs[3])
print("Base classes ({} total): ".format(len(base_train.classes)))
print(base_train.classes)

print("base_train size: ", base_train.__len__())
print("steps per epoch: ", int(base_train.__len__() / 4))

dict_keys(['root', 'imgs', 'classes', 'class_to_idx', 'transform', 'target_transform', 'loader'])
('../images/cbas34_train/airplane/1363089.jpg', 0)
Base classes (34 total): 
['airplane', 'backpack', 'banana', 'bench', 'bicycle', 'bird', 'boat', 'book', 'bottle', 'bowl', 'car', 'carrot', 'chair', 'clock', 'cow', 'cup', 'donut', 'fork', 'handbag', 'horse', 'kite', 'knife', 'person', 'pottedplant', 'sheep', 'sink', 'skateboard', 'spoon', 'surfboard', 'tennisracket', 'trafficlight', 'truck', 'umbrella', 'vase']
base_train size:  51000
steps per epoch:  12750


### Find the holdout categories: 

In [5]:
# all_categories = COCO.loadCats(COCO.getCatIds())
# nms=[cat['name'] for cat in all_categories]
# print('COCO categories: \n{}\n'.format(' '.join(nms)))

# Get all categories: 


### Train network that learns to predict images from base_set: 

In [6]:
import torch.nn as nn

__all__ = ['alexnet3232']


class AlexNet3232(nn.Module):

    def __init__(self, num_classes=34):
        super(AlexNet3232, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


def alexnet3232(**kwargs):
    r"""AlexNet model architecture from the
    `"One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.
    """
    model = AlexNet(**kwargs)
    return model

In [7]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

# num_classes_base = 34
# num_classes_holdout = 34

# Load model
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1   = nn.Linear(16*5*5, 120)
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 34)

    def forward(self, x):
        out = F.relu(self.conv1(x))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out


# model = LeNet()
model = AlexNet3232()

In [8]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [9]:
# epochs = 20
# print_every = 100
# steps_per_epoch = int(base_train.__len__() / 4)

# for epoch in range(epochs):  # loop over the dataset multiple times

#     running_loss = 0.0
#     for i, data in enumerate(base_train_loader, 0):
#         # get the inputs
#         inputs, labels = data

#         # wrap them in Variable
#         inputs, labels = Variable(inputs), Variable(labels)

#         # zero the parameter gradients
#         optimizer.zero_grad()

#         # forward + backward + optimize
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         # print statistics
#         running_loss += loss.data[0]
#         if i % print_every == (print_every-1):    # print every 2000 mini-batches
#             print('epoch[%d/%d, %5d/%d], loss: %.3f' %
#                   (epoch + 1, epochs, i + 1, steps_per_epoch, running_loss / print_every))
#             running_loss = 0.0

# print('Finished Training')

In [10]:
import numpy as np 
from PIL import Image

class FeatureExtractor(object):
    
    def __init__(self, model=None, embed_layer=None, embed_size=256, transform=None):
        self.embed_size = embed_size
        self.transforms = transform
        
        if model is None: 
            self.model = models.alexnet(pretrained=True)
            self.embed_layer = self.model.features
        else: 
            if embed_layer is None: 
                raise ValueError("Need to specify embed_layer if you pass in a model to FeatureExtractor!")
            self.model = model
            self.embed_layer = embed_layer
        
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model.cuda()
            
        # Set model to eval mode so any train-specific things like dropout, etc. don't run:
        self.model.eval()
    
    def embed(self, img):
        """
        project a PIL image into embedded feature space, and return that vector as an np array
        """
        a = self.transforms(img)
        image = Variable(a)
        image = image.unsqueeze(0)
        if self.cuda: image.cuda()
        
        embedding = torch.zeros(self.embed_size)
        def copy_embedding(m, i, o):
            if len(o.size()) > 2:
                o = o.view(o.size(0), -1)
            embedding.copy_(o.data)
            
        h = self.embed_layer.register_forward_hook(copy_embedding)
        h_x = self.model(image)
        h.remove()
        return embedding.numpy()

In [11]:
import os 

weights_dir = './weights/'
if not os.path.exists(weights_dir):
    os.makedirs(weights_dir)
# torch.save(model, os.path.join(weights_dir, 'alexnet_cbas34_baseset_20_epochs.pt'))
model = torch.load(os.path.join(weights_dir, 'alexnet_cbas34_baseset_20_epochs.pt'))

In [12]:
embedder = FeatureExtractor(model=model, transform=transform, embed_layer=model.features)

In [25]:

# get some random training images
dataiter = iter(base_train_loader)
images, labels = dataiter.next()

# Test embedding one image into embedding space. For Alexnet (32x32 images) this should be a 256-d vector: 
img = Image.open('../images/cbas34_train/airplane/156356.jpg')
print(embedder.embed(img).shape)

(256,)




### Check results
Now let's compute cosine similarities for one airplane image against all airplanes, and then similarities for airplane against several other non-airplane categories. The average cosine similarity for airplane vs. airplane should be higher than that for airplane-vs-non-airplanes. 

In [20]:
import glob 

def get_cos_similarities(base_img_path, other_images_path):
    img = Image.open(base_img_path)
    base_embedding = embedder.embed(img)
    other_imgs = glob.glob(other_images_path + '*.jpg')
    similarities = np.zeros((len(other_imgs)))
    
    for idx,img_path in enumerate(other_imgs):
        #print("idx: ", idx, img_path)
        embedding = None
        img = Image.open(img_path)
        try:
            embedding = embedder.embed(img)
        except:
            print("error on image #", idx)
        if embedding is not None: 
            cos = nn.CosineSimilarity(dim=1, eps=1e-6)
            cos_sim = cos(torch.Tensor(base_embedding).unsqueeze(0), torch.Tensor(embedding).unsqueeze(0))
            similarities[idx] = cos_sim.numpy()
    return similarities

other_cats = ["cow", "banana", "sink", "carrot", "umbrella"]
print("Testing airplane cosine similarity against airplanes and against: ", other_cats)

print("Testing cosine similarities for base airplane against other airplane images...")
airplane_vs_airplane_similarities = get_cos_similarities(
    '../images/cbas34_train/airplane/156356.jpg'
    , '../images/cbas34_train/airplane/'
)
print("average airplane similarity vs other airplanes: ", np.average(airplane_vs_airplane_similarities))
print()

for other_cat in other_cats: 
    print("Testing cosine similarities for base airplane against NON-airplane images from category: {}...".format(other_cat))
    airplane_vs_other_similarities = get_cos_similarities(
        '../images/cbas34_train/airplane/156356.jpg'
        , '../images/cbas34_train/{}/'.format(other_cat)
    )
    print("average airplane cos-similarity vs '{}': {}".format(other_cat, np.average(airplane_vs_other_similarities)))
    print()


Testing airplane cosine similarity against airplanes and against:  ['cow', 'banana', 'sink', 'carrot', 'umbrella']
Testing cosine similarities for base airplane against other airplane images...




error on image # 81
error on image # 91
error on image # 316
error on image # 330
error on image # 548
error on image # 872
error on image # 1027
error on image # 1115
error on image # 1171
error on image # 1347
error on image # 1415
error on image # 1442
error on image # 1481
average airplane similarity vs other airplanes:  0.41849434301505484

Testing cosine similarities for base airplane against NON-airplane images from category: cow...
error on image # 689
average airplane cos-similarity vs 'cow': 0.13348377155760924

Testing cosine similarities for base airplane against NON-airplane images from category: banana...
average airplane cos-similarity vs 'banana': 0.11696031159038345

Testing cosine similarities for base airplane against NON-airplane images from category: sink...
average airplane cos-similarity vs 'sink': 0.18486352485930546

Testing cosine similarities for base airplane against NON-airplane images from category: carrot...
average airplane cos-similarity vs 'carrot': 0.