In [1]:
# Print the installed torch / torchvision versions.
import torch
import torchvision

print(torch.__version__)
print(torchvision.__version__)

# Remove both module names from the notebook namespace again; within this
# cell they are only used for the version printout above.
del torch
del torchvision

0.4.1
0.2.1


In [2]:
import sys
# NOTE(review): machine-specific absolute path. Presumably this is the checkout
# that provides the `fastai` package imported in the next cell — confirm and
# adjust per machine.
sys.path.append("/home/jupyter/pytorch-codes/fastai-course-2018/")

In [8]:
from fastai.conv_learner import *
from fastai.dataset import *

In [9]:
import json
from pathlib import Path
from PIL import ImageDraw, ImageFont

In [10]:
from matplotlib import patches, patheffects
%matplotlib inline

# helper functions

In [11]:
def convert_bbox(bbox):
    """Convert a VOC box to numpy-ordered corner coordinates.

    We convert VOC's height/width into top-left/bottom-right, and switch
    x/y coords to be consistent with numpy.

    Args:
        bbox: indexable with at least four entries, read as
            `[x, y, width, height]`.

    Returns:
        `np.array([y, x, y + height - 1, x + width - 1])`.
    """
    x, y, width, height = bbox[0], bbox[1], bbox[2], bbox[3]
    bottom = height + y - 1
    right = width + x - 1
    return np.array([y, x, bottom, right])

In [12]:
def bbox_hw(bbox):
    """Convert a corner box back to an anchor-plus-size box.

    This is the inverse of `convert_bbox`.

    Args:
        bbox: indexable with at least four entries, read as
            `[top, left, bottom, right]`.

    Returns:
        `np.array([left, top, right - left + 1, bottom - top + 1])`.
    """
    top, left, bottom, right = bbox[0], bbox[1], bbox[2], bbox[3]
    width = right - left + 1
    height = bottom - top + 1
    return np.array([left, top, width, height])

In [13]:
def show_img(img, figsize=None, ax=None):
    """Draw `img` on a matplotlib axes with both axis rulers hidden.

    Args:
        img: image data handed to `ax.imshow`.
        figsize: forwarded to `plt.subplots`; only used when a new axes has
            to be created.
        ax: axes to draw on. When None, a new figure and axes are created.

    Returns:
        The axes the image was drawn on.
    """
    # Test for None explicitly: a caller-supplied axes must always be used,
    # whatever its truth value happens to be.
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)
    ax.imshow(img)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    return ax

In [14]:
def draw_outline(patch, lw):
    """Set path effects on `patch`: a black stroke of width `lw`, then the normal rendering."""
    stroke = patheffects.Stroke(linewidth=lw, foreground='black')
    normal = patheffects.Normal()
    patch.set_path_effects([stroke, normal])

In [15]:
def draw_rect(ax, bbox):
    """Add an unfilled white rectangle for `bbox` to `ax` and outline it.

    `bbox[:2]` is used as the rectangle's anchor point and `bbox[-2:]` is
    unpacked as its remaining positional arguments (width, height).
    """
    anchor = bbox[:2]
    size = bbox[-2:]
    rectangle = patches.Rectangle(anchor, *size, fill=False, edgecolor='white', lw=2)
    draw_outline(ax.add_patch(rectangle), 4)

In [16]:
def draw_text(ax, xy, category, text_size=14):
    """Write `category` at position `xy` on `ax` in bold white text with an outline.

    Args:
        ax: axes providing a `text` method.
        xy: position, unpacked into the leading positional arguments of `ax.text`.
        category: the string to display.
        text_size: font size passed as `fontsize`.
    """
    text_style = dict(verticalalignment='top', color='white',
                      fontsize=text_size, weight='bold')
    label = ax.text(*xy, category, **text_style)
    draw_outline(label, 1)

In [17]:
def draw_img(img, annotation):
    """Show `img` and overlay every annotated box with its category name.

    Args:
        img: image passed to `show_img`.
        annotation: iterable of `(bbox, label)` pairs. Each `bbox` is
            converted with `bbox_hw` before drawing; each `label` is looked
            up in the module-level `id_to_categories` mapping.
    """
    ax = show_img(img, figsize=(16,8))
    for bbox, label in annotation:
        # The converter defined in this notebook is `bbox_hw`; the previous
        # call to `bb_hw` referenced a name this notebook never defines.
        rect = bbox_hw(bbox)
        draw_rect(ax, rect)
        draw_text(ax, rect[:2], id_to_categories[label], text_size=16)

In [18]:
def draw_id(_id):
    """Open the image with id `_id` and draw it together with its training annotations.

    Reads the module-level `train_annotations`, `image_paths` and
    `id_to_images`.
    """
    annotations = train_annotations[_id]
    image_file = image_paths/id_to_images[_id]
    draw_img(open_image(image_file), annotations)

In [19]:
def get_largest_bbox(bbox):
    """Return the annotation whose box spans the largest area.

    Args:
        bbox: non-empty list of `(box, label)` pairs. Each `box` must support
            slicing and element-wise subtraction (e.g. a numpy array as
            returned by `convert_bbox`); its last two entries minus its
            first two entries give the two side lengths.

    Returns:
        The first `(box, label)` pair with the maximal area.

    Raises:
        ValueError: if `bbox` is empty.
    """
    if not bbox:
        raise ValueError("get_largest_bbox() needs at least one annotation")

    def area(entry):
        box = entry[0]
        # np.prod instead of np.product: the latter alias was removed in NumPy 2.0.
        return np.prod(box[-2:] - box[:2])

    # max() returns the first maximal element, which is the same element a
    # stable descending sort would put first.
    return max(bbox, key=area)

## Pascal VOC

download links

https://storage.googleapis.com/coco-dataset/external/PASCAL_VOC.zip
http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
http://pjreddie.com/media/files/VOCtrainval_11-May-2012.tar

In [None]:
# Root folder of the Pascal VOC data; every later path is built from it.
# NOTE(review): absolute, machine-specific location.
PATH = Path('/home/jupyter/data/pascal')
list(PATH.iterdir())

In [None]:
# Parse the training annotation file. The context manager closes the file
# handle as soon as parsing is done (previously the handle was left open).
with (PATH/'pascal_train2007.json').open() as annotation_file:
    train_json = json.load(annotation_file)
train_json.keys()

In [None]:
# First image record.
train_json['images'][0]

In [None]:
# First annotation record.
train_json['annotations'][0]

In [None]:
# First five category records.
train_json['categories'][:5]

In [None]:
# Lookup table: category id -> category name.
id_to_categories = {category["id"]: category['name'] 
              for category in train_json['categories']}

In [None]:
# Lookup table: image id -> image file name.
id_to_images = {image["id"] : image["file_name"] 
                for image in train_json["images"]}

In [None]:
# Image ids in the order the images appear in the JSON file.
train_ids = [image["id"] for image in train_json["images"]]

In [None]:
# List the entries of the VOC2007 folder.
list((PATH/'VOCdevkit'/'VOC2007').iterdir())

In [None]:
# Image folder relative to PATH. It is kept as a separate string because the
# later `ImageClassifierData.from_csv(PATH, JPEGS, ...)` calls reference
# `JPEGS`, which no cell defined before.
JPEGS = 'VOCdevkit/VOC2007/JPEGImages'
image_paths = PATH/JPEGS

In [None]:
# First five entries of the image folder.
list(image_paths.iterdir())[:5]

In [None]:
# Maps image id -> list of annotations; unseen ids start with an empty list.
train_annotations = collections.defaultdict(list)

In [None]:
# Collect a (box, category id) pair per annotation, grouped by image id.
# Annotations whose "ignore" field is truthy are left out.
for annotation in train_json["annotations"]:
    if annotation["ignore"]:
        continue
    bbox = convert_bbox(annotation["bbox"])
    entry = (bbox, annotation["category_id"])
    train_annotations[annotation["image_id"]].append(entry)

In [None]:
# Number of image ids present in train_annotations.
len(train_annotations)

In [None]:
# each image has a unique ID
img_0 = train_json["images"][0]
img_0["file_name"], img_0["id"]

In [None]:
# First (box, category id) pair stored for that image.
im_0 = train_annotations[img_0["id"]]
im0_bbox , im0_label = im_0[0]

In [None]:
img = open_image(image_paths/img_0["file_name"])
img.shape

In [None]:
ax = show_img(img)
# Convert the stored corner box with bbox_hw before passing it to draw_rect / draw_text.
bbox = bbox_hw(im0_bbox)
draw_rect(ax, bbox)
draw_text(ax, bbox[:2], id_to_categories[im0_label])

In [None]:
# Draw the image with id 17 together with all of its annotations.
draw_id(17)

## Largest item classifier

In [None]:
# Will map image id -> the annotation returned by get_largest_bbox for that image.
train_large_annotations = {}

In [None]:
# For every image id keep only its largest annotation.
# (The keys of train_annotations are image ids and the values are lists of
# annotations, so the loop variables are named accordingly.)
for image_id, annotations in train_annotations.items():
    try:
        train_large_annotations[image_id] = get_largest_bbox(annotations)
    except Exception as err:
        # Report which image failed and why. A bare `except:` would also
        # swallow KeyboardInterrupt and hide the actual error.
        print(image_id, repr(err))

In [None]:
# Show image 23 with only its largest annotated object.
bbox, label = train_large_annotations[23]
# `bbox_hw` is the converter defined in this notebook; `bb_hw` is never defined here.
bbox = bbox_hw(bbox)

ax = show_img(open_image(image_paths/id_to_images[23]), figsize=(5,10))

draw_rect(ax, bbox)
draw_text(ax, bbox[:2], id_to_categories[label], text_size=16)

Now we have a dictionary from image id to a single bounding box - the largest for that image.

In [None]:
# Largest object of image 23, using the names this notebook actually defines
# (train_large_annotations, image_paths, id_to_images, id_to_categories,
# bbox_hw) and draw_text's real keyword `text_size`.
b,c = train_large_annotations[23]
b = bbox_hw(b)
ax = show_img(open_image(image_paths/id_to_images[23]), figsize=(5,10))
draw_rect(ax, b)
draw_text(ax, b[:2], id_to_categories[c], text_size=16)

In [None]:
# Make sure the tmp directory exists (no error if it already does), then
# choose the CSV location inside it.
(PATH/'tmp').mkdir(exist_ok=True)
CSV = PATH/'tmp/lrg.csv'

Often it's easiest to simply create a CSV of the data you want to model, rather than trying to create a custom dataset. Here we use Pandas to help us create a CSV of the image filename and class.

In [None]:
# One row per training image: file name and the category name of its largest
# object. Uses the lookup tables defined in this notebook (id_to_images,
# train_ids, id_to_categories, train_large_annotations).
df = pd.DataFrame({'fn': [id_to_images[image_id] for image_id in train_ids],
    'cat': [id_to_categories[train_large_annotations[image_id][1]] for image_id in train_ids]},
    columns=['fn','cat'])
df.to_csv(CSV, index=False)

In [None]:
# Settings used by the cells below: architecture, image size, batch size.
f_model = resnet34
sz=224
bs=64

From here it's just like Dogs vs Cats!

In [None]:
# Build the transforms and the data object for the largest-object classifier from CSV.
# NOTE(review): relies on a module-level `JPEGS` (image folder relative to PATH) being defined.
tfms = tfms_from_model(f_model, sz, aug_tfms=transforms_side_on, crop_type=CropType.NO)
md = ImageClassifierData.from_csv(PATH, JPEGS, CSV, tfms=tfms, bs=bs)

In [None]:
# Take the first batch from the validation dataloader.
x,y=next(iter(md.val_dl))

In [None]:
# Denormalize the batch and display its first image.
show_img(md.val_ds.denorm(to_np(x))[0]);

In [None]:
learn = ConvLearner.pretrained(f_model, md, metrics=[accuracy])
# Replace the learner's optimizer function with Adam.
learn.opt_fn = optim.Adam

In [None]:
# NOTE(review): the two arguments are presumably the start and end learning rate of the sweep — confirm.
lrf=learn.lr_find(1e-5,100)

When your LR finder graph looks like this, you can ask for more points on each end:

In [None]:
learn.sched.plot()

In [None]:
# NOTE(review): n_skip / n_skip_end presumably control how many points are dropped at each end of the plot — confirm.
learn.sched.plot(n_skip=5, n_skip_end=1)

In [None]:
# Learning rate used by the fit calls below.
lr = 2e-2

In [None]:
learn.fit(lr, 1, cycle_len=1)

In [None]:
# Three learning rates derived from lr: lr/1000, lr/100 and lr itself.
lrs = np.array([lr/1000,lr/100,lr])

In [None]:
learn.freeze_to(-2)

In [None]:
lrf=learn.lr_find(lrs/1000)
learn.sched.plot(1)

In [None]:
learn.fit(lrs/5, 1, cycle_len=1)

In [None]:
learn.unfreeze()

Accuracy isn't improving much - since many images have multiple different objects, it's going to be impossible to be that accurate.

In [None]:
learn.fit(lrs/5, 1, cycle_len=2)

In [None]:
# Save the learner state under the name 'clas_one'.
learn.save('clas_one')

In [None]:
# Load the state saved under the same name.
learn.load('clas_one')

In [None]:
# First validation batch.
x,y = next(iter(md.val_dl))
# Softmax over the last dimension of the model output for this batch.
probs = F.softmax(predict_batch(learn.model, x), -1)
x,preds = to_np(x),to_np(probs)
# Index of the largest value along the last axis, i.e. the predicted class index per item.
preds = np.argmax(preds, -1)

You can use the python debugger `pdb` to step through code.

- `pdb.set_trace()` to set a breakpoint
- `%debug` magic to trace an error

Commands you need to know:

- s / n / c
- u / d
- p
- l

In [None]:
# 3x4 grid: one denormalized image of the batch per axes, labelled with its predicted class name.
fig, axes = plt.subplots(3, 4, figsize=(12, 8))
for i,ax in enumerate(axes.flat):
    ima=md.val_ds.denorm(x)[i]
    # Map the predicted class index to its name.
    b = md.classes[preds[i]]
    ax = show_img(ima, ax=ax)
    draw_text(ax, (0,0), b)
plt.tight_layout()

It's doing a pretty good job of classifying the largest object!

## Bbox only

Now we'll try to find the bounding box of the largest object. This is simply a regression with 4 outputs. So we can use a CSV with multiple 'labels'.

In [None]:
# Location of the CSV holding one bounding box per image.
BB_CSV = PATH/'tmp/bb.csv'

In [None]:
# Largest box per training image, in train_ids order. Uses the names this
# notebook defines (train_large_annotations, train_ids, id_to_images).
bb = np.array([train_large_annotations[image_id][0] for image_id in train_ids])
# Each box becomes one space-separated string of its coordinates.
bbs = [' '.join(str(coord) for coord in box) for box in bb]

df = pd.DataFrame({'fn': [id_to_images[image_id] for image_id in train_ids], 'bbox': bbs},
                  columns=['fn','bbox'])
df.to_csv(BB_CSV, index=False)

In [None]:
# First five lines of the CSV that was just written.
BB_CSV.open().readlines()[:5]

In [None]:
# Same architecture, image size and batch size as for the classifier above.
f_model=resnet34
sz=224
bs=64

Set `continuous=True` to tell fastai this is a regression problem, which means it won't one-hot encode the labels, and will use MSE as the default crit.

Note that we have to tell the transforms constructor that our labels are coordinates, so that it can handle the transforms correctly.

Also, we use CropType.NO because we want to 'squish' the rectangular images into squares, rather than center cropping, so that we don't accidentally crop out some of the objects. (This is less of an issue in something like imagenet, where there is a single object to classify, and it's generally large and centrally located).

In [None]:
# Augmentations created without a tfm_y argument (compare with the augs cell
# further below, which passes tfm_y=TfmType.COORD).
augs = [RandomFlip(), 
        RandomRotate(30),
        RandomLighting(0.1,0.1)]

In [None]:
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, aug_tfms=augs)
# continuous=True and a batch size of 4 for this preview data object.
md = ImageClassifierData.from_csv(PATH, JPEGS, BB_CSV, tfms=tfms, continuous=True, bs=4)

In [None]:
# Draw item `idx` of nine freshly drawn augmented batches with its box.
idx=3
fig,axes = plt.subplots(3,3, figsize=(9,9))
for ax in axes.flat:
    # A new iterator per axes, so each panel shows the first batch of a new pass over md.aug_dl.
    x,y=next(iter(md.aug_dl))
    ima=md.val_ds.denorm(to_np(x))[idx]
    # `bbox_hw` is the converter defined in this notebook; `bb_hw` is never defined here.
    b = bbox_hw(to_np(y[idx]))
    print(b)
    show_img(ima, ax=ax)
    draw_rect(ax, b)

In [None]:
# Same augmentations as before, now each constructed with tfm_y=TfmType.COORD.
augs = [RandomFlip(tfm_y=TfmType.COORD),
        RandomRotate(30, tfm_y=TfmType.COORD),
        RandomLighting(0.1,0.1, tfm_y=TfmType.COORD)]

In [None]:
# tfm_y=TfmType.COORD is also passed to the transforms constructor.
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, tfm_y=TfmType.COORD, aug_tfms=augs)
md = ImageClassifierData.from_csv(PATH, JPEGS, BB_CSV, tfms=tfms, continuous=True, bs=4)

In [None]:
# Draw item `idx` of nine freshly drawn augmented batches with its box.
idx=3
fig,axes = plt.subplots(3,3, figsize=(9,9))
for ax in axes.flat:
    # A new iterator per axes, so each panel shows the first batch of a new pass over md.aug_dl.
    x,y=next(iter(md.aug_dl))
    ima=md.val_ds.denorm(to_np(x))[idx]
    # `bbox_hw` is the converter defined in this notebook; `bb_hw` is never defined here.
    b = bbox_hw(to_np(y[idx]))
    print(b)
    show_img(ima, ax=ax)
    draw_rect(ax, b)

In [None]:
# Augmentation setup used for training: smaller rotation (3, with p=0.5) and
# lighting (0.05) values than in the preview cells above.
tfm_y = TfmType.COORD
augs = [RandomFlip(tfm_y=tfm_y),
        RandomRotate(3, p=0.5, tfm_y=tfm_y),
        RandomLighting(0.05,0.05, tfm_y=tfm_y)]

tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, tfm_y=tfm_y, aug_tfms=augs)
# This data object uses the full batch size `bs` instead of 4.
md = ImageClassifierData.from_csv(PATH, JPEGS, BB_CSV, tfms=tfms, bs=bs, continuous=True)

fastai lets you use a `custom_head` to add your own module on top of a convnet, instead of the adaptive pooling and fully connected net which is added by default. In this case, we don't want to do any pooling, since we need to know the activations of each grid cell.

The final layer has 4 activations, one per bounding box coordinate. Our target is continuous, not categorical, so the MSE loss function used does not do any sigmoid or softmax to the module outputs.

In [None]:
# 512*7*7 = 25088, the input size of the linear layer in the next cell.
512*7*7

In [None]:
# Custom head: flatten, then a single linear layer with 4 outputs.
head_reg4 = nn.Sequential(Flatten(), nn.Linear(25088,4))
learn = ConvLearner.pretrained(f_model, md, custom_head=head_reg4)
learn.opt_fn = optim.Adam
# Use L1 loss as the criterion.
learn.crit = nn.L1Loss()

In [None]:
learn.summary()

In [None]:
learn.lr_find(1e-5,100)
learn.sched.plot(5)

In [None]:
# Learning rate used by the fit calls below.
lr = 2e-3

In [None]:
learn.fit(lr, 2, cycle_len=1, cycle_mult=2)

In [None]:
# Three learning rates derived from lr: lr/100, lr/10 and lr itself.
lrs = np.array([lr/100,lr/10,lr])

In [None]:
learn.freeze_to(-2)

In [None]:
lrf=learn.lr_find(lrs/1000)
learn.sched.plot(1)

In [None]:
learn.fit(lrs, 2, cycle_len=1, cycle_mult=2)

In [None]:
learn.freeze_to(-3)

In [None]:
learn.fit(lrs, 1, cycle_len=2)

In [None]:
# Save the learner state under the name 'reg4'.
learn.save('reg4')

In [None]:
# Load the state saved under the same name.
learn.load('reg4')

In [None]:
# First validation batch; switch the model to eval mode before predicting on it.
x,y = next(iter(md.val_dl))
learn.model.eval()
preds = to_np(learn.model(VV(x)))

In [None]:
# 3x4 grid: one denormalized validation image per axes with its predicted box.
fig, axes = plt.subplots(3, 4, figsize=(12, 8))
for i,ax in enumerate(axes.flat):
    ima=md.val_ds.denorm(to_np(x))[i]
    # `bbox_hw` is the converter defined in this notebook; `bb_hw` is never defined here.
    b = bbox_hw(preds[i])
    ax = show_img(ima, ax=ax)
    draw_rect(ax, b)
plt.tight_layout()

## Single object detection

In [None]:
# Architecture, image size and batch size for the single-object detector.
f_model=resnet34
sz=224
bs=64

# Validation indexes drawn over the number of known images. `id_to_images`
# is the image-id -> file-name table defined in this notebook; the previously
# referenced `trn_fns` is never defined here.
val_idxs = get_cv_idxs(len(id_to_images))

In [None]:
# Bounding-box data object (from BB_CSV) using the explicit validation indexes.
tfms = tfms_from_model(f_model, sz, crop_type=CropType.NO, tfm_y=TfmType.COORD, aug_tfms=augs)
md = ImageClassifierData.from_csv(PATH, JPEGS, BB_CSV, tfms=tfms,
   bs=bs, continuous=True, val_idxs=val_idxs)

In [None]:
# Second data object built from the class CSV; its trn_y / val_y are used below as the extra labels.
# NOTE(review): no val_idxs is passed here — confirm both objects end up with the same train/validation split.
md2 = ImageClassifierData.from_csv(PATH, JPEGS, CSV, tfms=tfms_from_model(f_model, sz))

A dataset can be anything with `__len__` and `__getitem__`. Here's a dataset that adds a 2nd label to an existing dataset:

In [None]:
class ConcatLblDataset(Dataset):
    """Wrap a dataset so that every item carries an additional second label.

    Item `i` is `(x, (y, y2[i]))`, where `(x, y)` is item `i` of the wrapped
    dataset `ds`.
    """

    def __init__(self, ds, y2):
        self.ds = ds
        self.y2 = y2

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, i):
        sample, first_label = self.ds[i]
        second_label = self.y2[i]
        return (sample, (first_label, second_label))

We'll use it to add the classes to the bounding boxes labels.

In [None]:
# Pair the bounding-box datasets of md with the class labels of md2.
trn_ds2 = ConcatLblDataset(md.trn_ds, md2.trn_y)
val_ds2 = ConcatLblDataset(md.val_ds, md2.val_y)

In [None]:
# Label part of the first validation item: a (y, y2) pair.
val_ds2[0][1]

We can replace the dataloaders' datasets with these new ones.

In [None]:
# Point both dataloaders at the wrapped datasets so batches carry both labels.
md.trn_dl.dataset = trn_ds2
md.val_dl.dataset = val_ds2

We have to `denorm`alize the images from the dataloader before they can be plotted.

In [None]:
x,y=next(iter(md.val_dl))
idx=3
# md.val_ds is the ConcatLblDataset wrapper here, so the original dataset (and its denorm) is reached via `.ds`.
ima=md.val_ds.ds.denorm(to_np(x))[idx]
# y[0] is the bounding-box part of the label pair. `bbox_hw` is the converter
# defined in this notebook; `bb_hw` is never defined here.
b = bbox_hw(to_np(y[0][idx])); b

In [None]:
ax = show_img(ima)
draw_rect(ax, b)
# y[1] is the second label of the pair; it is mapped to a class name through md2.classes.
draw_text(ax, b[:2], md2.classes[y[1][idx]])

We need one output activation for each class (for its probability) plus one for each bounding box coordinate. We'll use an extra linear layer this time, plus some dropout, to help us train a more flexible model.

In [None]:
# Head with 4 box outputs plus one output per category. The category count
# comes from `id_to_categories`, the table defined in this notebook; the
# previously referenced `cats` is never defined here.
head_reg4 = nn.Sequential(
    Flatten(),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(25088,256),
    nn.ReLU(),
    nn.BatchNorm1d(256),
    nn.Dropout(0.5),
    nn.Linear(256,4+len(id_to_categories)),
)
models = ConvnetBuilder(f_model, 0, 0, 0, custom_head=head_reg4)

learn = ConvLearner(md, models)
learn.opt_fn = optim.Adam

In [None]:
def detn_loss(input, target):
    """Combined loss: L1 on the box outputs plus 20x cross-entropy on the class outputs.

    Args:
        input: 2-D model output; columns `:4` are the box activations and
            columns `4:` the class scores.
        target: pair `(box_targets, class_targets)`.

    Returns:
        `l1_loss(sigmoid(box activations) * 224, box_targets)
        + 20 * cross_entropy(class scores, class_targets)`.
    """
    box_target, class_target = target
    box_pred = F.sigmoid(input[:, :4]) * 224
    class_scores = input[:, 4:]
    box_term = F.l1_loss(box_pred, box_target)
    # I looked at these quantities separately first then picked a multiplier
    #   to make them approximately equal
    class_term = F.cross_entropy(class_scores, class_target) * 20
    return box_term + class_term

def detn_l1(input, target):
    """Metric: L1 distance between the scaled box outputs and the box targets.

    Only the first four columns of `input` and the first element of the
    `target` pair are used.
    """
    box_target, _unused = target
    box_pred = F.sigmoid(input[:, :4]) * 224
    distance = F.l1_loss(V(box_pred), V(box_target))
    return distance.data

def detn_acc(input, target):
    """Metric: `accuracy` of the class scores (columns `4:` of `input`) against the class targets."""
    _unused, class_target = target
    class_scores = input[:, 4:]
    return accuracy(class_scores, class_target)

# Train with the combined loss; report class accuracy and box L1 as metrics.
learn.crit = detn_loss
learn.metrics = [detn_acc, detn_l1]

In [None]:
learn.lr_find()
learn.sched.plot()

In [None]:
# Learning rate used by the fit calls below.
lr=1e-2

In [None]:
learn.fit(lr, 1, cycle_len=3, use_clr=(32,5))

In [None]:
# Checkpoint after the first training stage.
learn.save('reg1_0')

In [None]:
learn.freeze_to(-2)

In [None]:
# Three learning rates derived from lr: lr/100, lr/10 and lr itself.
lrs = np.array([lr/100, lr/10, lr])

In [None]:
learn.lr_find(lrs/1000)
learn.sched.plot(0)

In [None]:
learn.fit(lrs/5, 1, cycle_len=5, use_clr=(32,10))

In [None]:
# Checkpoint after the second training stage.
learn.save('reg1_1')

In [None]:
learn.load('reg1_1')

In [None]:
learn.unfreeze()

In [None]:
learn.fit(lrs/10, 1, cycle_len=10, use_clr=(32,10))

In [None]:
# Final checkpoint.
learn.save('reg1')

In [None]:
learn.load('reg1')

In [None]:
# NOTE(review): predict() presumably returns predictions for the validation set — confirm.
y = learn.predict()
# First validation batch; only the images are kept.
x,_ = next(iter(md.val_dl))

In [None]:
from scipy.special import expit

In [None]:
# 3x4 grid: one denormalized validation image per axes with its predicted box and class name.
fig, axes = plt.subplots(3, 4, figsize=(12, 8))
for i,ax in enumerate(axes.flat):
    ima=md.val_ds.ds.denorm(to_np(x))[i]
    # Same mapping as in detn_loss: sigmoid of the first four outputs, scaled by 224.
    bb = expit(y[i][:4])*224
    # `bbox_hw` is the converter defined in this notebook; `bb_hw` is never defined here.
    b = bbox_hw(bb)
    # The remaining outputs are the class scores; take the index of the largest one.
    c = np.argmax(y[i][4:])
    ax = show_img(ima, ax=ax)
    draw_rect(ax, b)
    draw_text(ax, b[:2], md2.classes[c])
plt.tight_layout()

## Visual Studio Code

- Command palette (<kbd>Ctrl-shift-p</kbd>)
- Select interpreter (for fastai env)
- Select terminal shell
- Go to symbol (<kbd>Ctrl-t</kbd>)
- Find references (<kbd>Shift-F12</kbd>)
- Go to definition (<kbd>F12</kbd>)
- Go back (<kbd>alt-left</kbd>)
- View documentation
- Hide sidebar (<kbd>Ctrl-b</kbd>)
- Zen mode (<kbd>Ctrl-k,z</kbd>)