In [1]:
%matplotlib inline
import sys
sys.path.append('../PythonAPI')
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
from cbas import CBAS
from pycocotools.coco import COCO
from pycocotools.cbas import CBAS
import cbas_construction_utils as ccu
import sklearn as sk

### Load COCO

In [2]:
# initialize COCO api for instance annotations and category info
cbas80=CBAS('../annotations/{}.json'.format('cbas80'))

# Get category index so we can go from image ids to category names
idToName={}
for c in cbas80.dataset['categories']:
    idToName[str(c['id'])]=c['name']

print("idToName: ", len(idToName.items()), idToName)
# base set is cbas34, and holdout set is cbas80-cbas34
# holdout is cbas80 - cbas34

loading annotations into memory...
Done (t=0.35s)
creating index...
index created!
idToName:  80 {'1': 'person', '2': 'bicycle', '3': 'car', '4': 'motorcycle', '5': 'airplane', '6': 'bus', '7': 'train', '8': 'truck', '9': 'boat', '10': 'traffic light', '11': 'fire hydrant', '13': 'stop sign', '14': 'parking meter', '15': 'bench', '16': 'bird', '17': 'cat', '18': 'dog', '19': 'horse', '20': 'sheep', '21': 'cow', '22': 'elephant', '23': 'bear', '24': 'zebra', '25': 'giraffe', '27': 'backpack', '28': 'umbrella', '31': 'handbag', '32': 'tie', '33': 'suitcase', '34': 'frisbee', '35': 'skis', '36': 'snowboard', '37': 'sports ball', '38': 'kite', '39': 'baseball bat', '40': 'baseball glove', '41': 'skateboard', '42': 'surfboard', '43': 'tennis racket', '44': 'bottle', '46': 'wine glass', '47': 'cup', '48': 'fork', '49': 'knife', '50': 'spoon', '51': 'bowl', '52': 'banana', '53': 'apple', '54': 'sandwich', '55': 'orange', '56': 'broccoli', '57': 'carrot', '58': 'hot dog', '59': 'pizza', '60': 

### Load cbas_34 base and holdout (AKA "target" classes) sets: 

In [5]:
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

transform = transforms.Compose([
#     transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])

base_train = torchvision.datasets.ImageFolder(root='../images/cbas34_train', transform=transform)
base_train_loader = torch.utils.data.DataLoader(base_train, batch_size=4, shuffle=True, num_workers=4)

base_valid = torchvision.datasets.ImageFolder(root='../images/cbas34_val', transform=transform)
base_valid_loader = torch.utils.data.DataLoader(base_valid, batch_size=4, shuffle=False, num_workers=2)

# get index for curriculum sampling
id2idx = {}
for i,img in enumerate(base_train.imgs):
    img_id_str = img[0].split('/')[4].split('.')[0]
    id2idx[img_id_str] = i


In [6]:
print(base_train.__dict__.keys())
print(base_train.imgs[3])
print("Base classes ({} total): ".format(len(base_train.classes)))
print(base_train.classes)

print("base_train size: ", base_train.__len__())
print("steps per epoch: ", int(base_train.__len__() / 4))

dict_keys(['root', 'imgs', 'classes', 'class_to_idx', 'transform', 'target_transform', 'loader'])
('../images/cbas34_train/airplane/1363089.jpg', 0)
Base classes (34 total): 
['airplane', 'backpack', 'banana', 'bench', 'bicycle', 'bird', 'boat', 'book', 'bottle', 'bowl', 'car', 'carrot', 'chair', 'clock', 'cow', 'cup', 'donut', 'fork', 'handbag', 'horse', 'kite', 'knife', 'person', 'pottedplant', 'sheep', 'sink', 'skateboard', 'spoon', 'surfboard', 'tennisracket', 'trafficlight', 'truck', 'umbrella', 'vase']
base_train size:  51000
steps per epoch:  12750


### Find the holdout categories: 

In [7]:
# all_categories = COCO.loadCats(COCO.getCatIds())
# nms=[cat['name'] for cat in all_categories]
# print('COCO categories: \n{}\n'.format(' '.join(nms)))

# Get all categories: 


### Train network that learns to predict images from base_set: 

In [8]:
import torch.nn as nn

__all__ = ['alexnet3232']


class AlexNet3232(nn.Module):

    def __init__(self, num_classes=34):
        super(AlexNet3232, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=5),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x


def alexnet3232(**kwargs):
    r"""AlexNet model architecture from the
    `"One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.
    """
    model = AlexNet(**kwargs)
    return model

In [9]:
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

# num_classes_base = 34
# num_classes_holdout = 34

# Load model
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1   = nn.Linear(16*5*5, 120)
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 34)

    def forward(self, x):
        out = F.relu(self.conv1(x))
        out = F.max_pool2d(out, 2)
        out = F.relu(self.conv2(out))
        out = F.max_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out


# model = LeNet()
model = AlexNet3232()

In [10]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [11]:
# epochs = 20
# print_every = 100
# steps_per_epoch = int(base_train.__len__() / 4)

# for epoch in range(epochs):  # loop over the dataset multiple times

#     running_loss = 0.0
#     for i, data in enumerate(base_train_loader, 0):
#         # get the inputs
#         inputs, labels = data

#         # wrap them in Variable
#         inputs, labels = Variable(inputs), Variable(labels)

#         # zero the parameter gradients
#         optimizer.zero_grad()

#         # forward + backward + optimize
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         # print statistics
#         running_loss += loss.data[0]
#         if i % print_every == (print_every-1):    # print every 2000 mini-batches
#             print('epoch[%d/%d, %5d/%d], loss: %.3f' %
#                   (epoch + 1, epochs, i + 1, steps_per_epoch, running_loss / print_every))
#             running_loss = 0.0

# print('Finished Training')

In [12]:
import os 

# Save model trained on cbas-LS base set:
weights_dir = './weights/'
if not os.path.exists(weights_dir):
    os.makedirs(weights_dir)
# torch.save(model, os.path.join(weights_dir, 'alexnet_cbas34_baseset_20_epochs.pt'))

In [13]:
import numpy as np 
from PIL import Image

class FeatureExtractor(object):
    
    def __init__(self, model=None, embed_layer=None, embed_size=256, transform=None):
        self.embed_size = embed_size
        self.transforms = transform
        
        if model is None: 
            self.model = models.alexnet(pretrained=True)
            self.embed_layer = self.model.features
        else: 
            if embed_layer is None: 
                raise ValueError("Need to specify embed_layer if you pass in a model to FeatureExtractor!")
            self.model = model
            self.embed_layer = embed_layer
        
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model.cuda()
            
        # Set model to eval mode so any train-specific things like dropout, etc. don't run:
        self.model.eval()
    
    def embed(self, img):
        """
        project a PIL image into embedded feature space, and return that vector as an np array
        """
        a = self.transforms(img)
        image = Variable(a)
        image = image.unsqueeze(0)
        if self.cuda: image.cuda()
        
        embedding = torch.zeros(self.embed_size)
        def copy_embedding(m, i, o):
            if len(o.size()) > 2:
                o = o.view(o.size(0), -1)
            embedding.copy_(o.data)
            
        h = self.embed_layer.register_forward_hook(copy_embedding)
        h_x = self.model(image)
        h.remove()
        return embedding.numpy()

In [14]:
import os 

# Load model pre-trained on cbas-LS base set: 
weights_dir = './weights/'
if not os.path.exists(weights_dir):
    os.makedirs(weights_dir)

model = torch.load(os.path.join(weights_dir, 'alexnet_cbas34_baseset_20_epochs.pt'))

In [15]:
embedder = FeatureExtractor(model=model, transform=transform, embed_layer=model.features)

In [16]:

# get some random training images
dataiter = iter(base_train_loader)
images, labels = dataiter.next()

# Test embedding a couple of images. For Alexnet (32x32 images) this should be a 256-d vector: 
img = Image.open('../images/cbas34_train/airplane/156356.jpg')
emb = embedder.embed(img)
print(emb.shape, emb)

img = Image.open('../images/cbas34_train/cow/69943.jpg')
emb = embedder.embed(img)
print(emb.shape, emb)

(256,) [ 0.          0.          0.          0.21962775  1.69603479  0.          0.
  0.04539941  4.62165022  0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.19562031  0.
  1.13748956  0.          0.          0.52092582  0.          1.02101254
  0.04542714  0.83863664  0.          0.          0.          0.06142612
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          2.1127789   0.          2.32432532  0.          0.
  0.85070014  0.          4.21610785  0.          0.35894951  1.11421156
  0.          0.22727358  0.          0.          0.          0.
  1.37570739  5.07413292  0.          0.          0.          0.
  0.20261103  0.          0.          0.          0.          0.          0.
  0.4270733   0.          0.          0.          0.          0.          0.
  0.          1.55296087  2.93087673  0.

Process Process-4:
Process Process-2:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/gbiamby/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/gbiamby/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process Process-1:
  File "/home/gbiamby/anaconda3/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 36, in _worker_loop
    r = index_queue.get()
Traceback (most recent call last):
Process Process-3:
  File "/home/gbiamby/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/gbiamby/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/home/gbiamby/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/gbiamby/an

### Check Embedding Results
Now let's compute cosine similarities for one airplane image against all airplanes, and then similarities for airplane against several other non-airplane categories. The average cosine similarity for airplane vs. airplane should be higher than that for airplane-vs-non-airplanes, if we are computing good embeddings. 

In [39]:
import glob 
from sklearn.metrics.pairwise import cosine_similarity
np.random.seed(55)

def get_cos_similarities(base_img_path, other_image_paths):
    img = Image.open(base_img_path)
    # Work arround issue of some of the train images not being RGB, (they won't go through the CNN): 
    if img.mode != "RGB":
        print("Hack: converting {} to rgb...".format(base_img_path))
        rgbimg = Image.new("RGB", img.size)
        rgbimg.paste(img)
        img = rgbimg
    base_embedding = embedder.embed(img)
    similarities = np.zeros((len(other_image_paths)))
    
    for idx,img_path in enumerate(other_image_paths):
        #print("idx: ", idx, img_path)
        embedding = None
        img = Image.open(img_path)
        try:
            embedding = embedder.embed(img)
        except:
            print("error on image #", idx)
        if embedding is not None: 
#             cos_sim = cosine_similarity(base_embedding, embedding)
            cos = nn.CosineSimilarity(dim=1, eps=1e-6)
            cos_sim = cos(torch.Tensor(base_embedding).unsqueeze(0), torch.Tensor(embedding).unsqueeze(0))
            similarities[idx] = cos_sim.numpy()
    return similarities

def avg_category_similarities(dataset_path, cat1, cat2, sample_size=50):
    """
    Compute avereage similarities between all imgs in categories cat1 an cat2
    """
    cat1_imgs = glob.glob(dataset_path.format(cat1)+'*.jpg')
    cat2_imgs = glob.glob(dataset_path.format(cat2)+'*.jpg')
    # Sample a limited size to reduce compute time: 
    size1, size2 = min(len(cat1_imgs), sample_size), min(len(cat2_imgs), sample_size)
    cat1_imgs = np.random.choice(cat1_imgs, size=size1)
    cat2_imgs = np.random.choice(cat2_imgs, size=size2)
    sims = np.zeros((size1, size2))
    #print("similarites shape: ", sims.shape)
    for i,img1 in enumerate(cat1_imgs):
        sims[i,:] = get_cos_similarities(img1, cat2_imgs).T
        pass
    return sims


base_cat = "airplane"
other_cats = ["cow", "banana", "sink", "carrot", "umbrella"]

print("Testing intra-category cosine similarities for category: ", base_cat)
intra_cat_sims = avg_category_similarities('../images/cbas34_train/{}/', base_cat, base_cat)
print(
    "average '{}' intra-category similarity: {}\n".format(
        base_cat
        , np.average(intra_cat_sims)
    ))

for other_cat in other_cats: 
    print("Testing inter-category cosine similarities for '{}' vs '{}'..."\
          .format(base_cat, other_cat))
    inter_cat_sims = avg_category_similarities(
        '../images/cbas34_train/{}/', base_cat, other_cat
    )
    print("average {} cos-similarity vs '{}': {}\n".format(
        base_cat, other_cat, np.average(inter_cat_sims)
    ))


## NOTE: the error messages here are mostly due to some of the training 
## images in cbas-34 being 1 channel instead of 3. We should regenerate the 
## cbas set to be 3-channel to fix this. 

Testing intra-category cosine similarities for category:  airplane




average 'airplane' intra-category similarity: 0.40577869000807404

Testing inter-category cosine similarities for 'airplane' vs 'cow'...
average airplane cos-similarity vs 'cow': 0.14739956931676715

Testing inter-category cosine similarities for 'airplane' vs 'banana'...
average airplane cos-similarity vs 'banana': 0.11000944020226598

Testing inter-category cosine similarities for 'airplane' vs 'sink'...
average airplane cos-similarity vs 'sink': 0.14671501477624987

Testing inter-category cosine similarities for 'airplane' vs 'carrot'...
Hack: converting ../images/cbas34_train/airplane/156497.jpg to rgb...
average airplane cos-similarity vs 'carrot': 0.09939418648509309

Testing inter-category cosine similarities for 'airplane' vs 'umbrella'...
Hack: converting ../images/cbas34_train/airplane/247539.jpg to rgb...
average airplane cos-similarity vs 'umbrella': 0.1650637626306154



In [40]:
base_cat = "banana"
other_cats = ["cow", "airplane", "sink", "carrot", "umbrella"]


print("Testing intra-category cosine similarities for category: ", base_cat)
intra_cat_sims = avg_category_similarities('../images/cbas34_train/{}/', base_cat, base_cat)
print(
    "average '{}' intra-category similarity: {}\n".format(
        base_cat
        , np.average(intra_cat_sims)
    ))

for other_cat in other_cats: 
    print("Testing inter-category cosine similarities for '{}' vs '{}'..."\
          .format(base_cat, other_cat))
    inter_cat_sims = avg_category_similarities(
        '../images/cbas34_train/{}/', base_cat, other_cat
    )
    print("average {} cos-similarity vs '{}': {}\n".format(
        base_cat, other_cat, np.average(inter_cat_sims)
    ))

Testing intra-category cosine similarities for category:  banana




average 'banana' intra-category similarity: 0.46289090410619976

Testing inter-category cosine similarities for 'banana' vs 'cow'...
average banana cos-similarity vs 'cow': 0.1393634743908886

Testing inter-category cosine similarities for 'banana' vs 'airplane'...
average banana cos-similarity vs 'airplane': 0.09864027296435088

Testing inter-category cosine similarities for 'banana' vs 'sink'...
average banana cos-similarity vs 'sink': 0.17259873777907342

Testing inter-category cosine similarities for 'banana' vs 'carrot'...
average banana cos-similarity vs 'carrot': 0.22984664037674665

Testing inter-category cosine similarities for 'banana' vs 'umbrella'...
average banana cos-similarity vs 'umbrella': 0.16092501923255623



### What happens when we try embedding images from classes in the holdout set?

In the preceding cells we did a sanity check to verify that embeddings made some kind of sense, but those were all embeddings the network was trained on. What about when we embed images for classes the network was not trained on? Let's try it out by doing similar comparisons for categories in the cbas-LS set (i.e., holdout set). 

In [36]:
base_cat = "mouse"
other_cats = ["orange", "suitcase", "toilet", "elephant", "giraffe"]

print("Testing intra-category cosine similarities for category: ", base_cat)
intra_cat_sims = avg_category_similarities('../images/cbasLS_train/{}/', base_cat, base_cat)
print(
    "average '{}' intra-category similarity: {}\n".format(
        base_cat
        , np.average(intra_cat_sims)
    ))

for other_cat in other_cats: 
    print("Testing inter-category cosine similarities for '{}' vs '{}'..."\
          .format(base_cat, other_cat))
    inter_cat_sims = avg_category_similarities(
        '../images/cbasLS_train/{}/', base_cat, other_cat
    )
    print("average {} cos-similarity vs '{}': {}\n".format(
        base_cat, other_cat, np.average(inter_cat_sims)
    ))


Testing intra-category cosine similarities for category:  mouse




average 'mouse' intra-category similarity: 0.18886170259015633

Testing inter-category cosine similarities for 'mouse' vs 'orange'...
average mouse cos-similarity vs 'orange': 0.15418672954803334

Testing inter-category cosine similarities for 'mouse' vs 'suitcase'...
average mouse cos-similarity vs 'suitcase': 0.1660249532878399

Testing inter-category cosine similarities for 'mouse' vs 'toilet'...
average mouse cos-similarity vs 'toilet': 0.17293947569015436

Testing inter-category cosine similarities for 'mouse' vs 'elephant'...
average mouse cos-similarity vs 'elephant': 0.16750693448325618

Testing inter-category cosine similarities for 'mouse' vs 'giraffe'...
average mouse cos-similarity vs 'giraffe': 0.14247035264549776



In [41]:
base_cat = "orange"
other_cats = ["hotdog", "suitcase", "laptop", "scissors", "giraffe"]

print("Testing intra-category cosine similarities for category: ", base_cat)
intra_cat_sims = avg_category_similarities('../images/cbasLS_train/{}/', base_cat, base_cat)
print(
    "average '{}' intra-category similarity: {}\n".format(
        base_cat
        , np.average(intra_cat_sims)
    ))

for other_cat in other_cats: 
    print("Testing inter-category cosine similarities for '{}' vs '{}'..."\
          .format(base_cat, other_cat))
    inter_cat_sims = avg_category_similarities(
        '../images/cbasLS_train/{}/', base_cat, other_cat
    )
    print("average {} cos-similarity vs '{}': {}\n".format(
        base_cat, other_cat, np.average(inter_cat_sims)
    ))

Testing intra-category cosine similarities for category:  orange




average 'orange' intra-category similarity: 0.28105616680681705

Testing inter-category cosine similarities for 'orange' vs 'hotdog'...
average orange cos-similarity vs 'hotdog': 0.23702849502190948

Testing inter-category cosine similarities for 'orange' vs 'suitcase'...
average orange cos-similarity vs 'suitcase': 0.11700026734277405

Testing inter-category cosine similarities for 'orange' vs 'laptop'...
average orange cos-similarity vs 'laptop': 0.1536182081213221

Testing inter-category cosine similarities for 'orange' vs 'scissors'...
average orange cos-similarity vs 'scissors': 0.1544955761956051

Testing inter-category cosine similarities for 'orange' vs 'giraffe'...
average orange cos-similarity vs 'giraffe': 0.11841108537558903



In [42]:
base_cat = "pizza"
other_cats = ["hotdog", "remote", "laptop", "stopsign", "giraffe"]

print("Testing intra-category cosine similarities for category: ", base_cat)
intra_cat_sims = avg_category_similarities('../images/cbasLS_train/{}/', base_cat, base_cat)
print(
    "average '{}' intra-category similarity: {}\n".format(
        base_cat
        , np.average(intra_cat_sims)
    ))

for other_cat in other_cats: 
    print("Testing inter-category cosine similarities for '{}' vs '{}'..."\
          .format(base_cat, other_cat))
    inter_cat_sims = avg_category_similarities(
        '../images/cbasLS_train/{}/', base_cat, other_cat
    )
    print("average {} cos-similarity vs '{}': {}\n".format(
        base_cat, other_cat, np.average(inter_cat_sims)
    ))

Testing intra-category cosine similarities for category:  pizza




average 'pizza' intra-category similarity: 0.23508656190410257

Testing inter-category cosine similarities for 'pizza' vs 'hotdog'...
average pizza cos-similarity vs 'hotdog': 0.23441241003302857

Testing inter-category cosine similarities for 'pizza' vs 'remote'...
average pizza cos-similarity vs 'remote': 0.16776690724510698

Testing inter-category cosine similarities for 'pizza' vs 'laptop'...
average pizza cos-similarity vs 'laptop': 0.16989592276755722

Testing inter-category cosine similarities for 'pizza' vs 'stopsign'...
average pizza cos-similarity vs 'stopsign': 0.14201583838420775

Testing inter-category cosine similarities for 'pizza' vs 'giraffe'...
average pizza cos-similarity vs 'giraffe': 0.14807520844760583

