In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as tvt
from torch.utils.data.sampler import SubsetRandomSampler
import torch_directml
import matplotlib.pyplot as plt
from pycocotools.coco import COCO
import numpy as np
from PIL import Image
import random
import os
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import functools

# Use DirectML to run the code on an AMD GPU.
dml = torch_directml.device()
# Bare expression: the notebook displays the device object as the cell result.
dml

device(type='privateuseone', index=0)

In [None]:
def plot(imgs, row_title=None, **imshow_kwargs):
    """Show images on a grid of axes with all ticks and tick labels removed.

    Args:
        imgs: Either a flat list of images (drawn as a single row) or a list
            of lists, one inner list per row. The column count is taken from
            the first row. Each image is drawn as-is when it is a PIL image,
            otherwise it is passed through ``np.asarray`` first.
        row_title: Optional sequence of y-axis labels, indexed by row and
            attached to the first axes of each row.
        **imshow_kwargs: Forwarded unchanged to every ``imshow`` call.
    """
    # A flat list is promoted to a one-row grid.
    grid = imgs if isinstance(imgs[0], list) else [imgs]

    n_rows, n_cols = len(grid), len(grid[0])
    _, axs = plt.subplots(nrows=n_rows, ncols=n_cols, squeeze=False)

    for r, images in enumerate(grid):
        for c, image in enumerate(images):
            axis = axs[r, c]
            drawable = image if isinstance(image, Image.Image) else np.asarray(image)
            axis.imshow(drawable, **imshow_kwargs)
            axis.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

    if row_title is not None:
        for r in range(n_rows):
            axs[r, 0].set(ylabel=row_title[r])

    plt.tight_layout()

In [4]:
# Annotation type to work with; the COCO demo offers 'segm', 'bbox' and
# 'keypoints'. This must be a string: the previous empty list made the
# message below print "*[]*" and the 'keypoints' comparison always false.
annType = 'bbox'      #specify type here
catType = ['airplane','bus','cat','dog','pizza']
# Keypoint annotations use a different file prefix than instance annotations.
prefix = 'person_keypoints' if annType=='keypoints' else 'instances'
print('Running demo for *%s* results.'%(annType))
dataDir='./coco'
dataType='train2014'
annFile = '%s/annotations/%s_%s.json'%(dataDir,prefix,dataType)
# Load the annotation file and build the COCO lookup index.
cocoGt=COCO(annFile)

Running demo for *[]* results.
loading annotations into memory...
Done (t=7.45s)
creating index...
index created!


In [5]:
# Load every category record from the annotation index and collect the names.
cats = cocoGt.loadCats(cocoGt.getCatIds())
nms=[cat['name'] for cat in cats]
print('COCO categories: \n{}\n'.format(' '.join(nms)))
# Look up the category id(s) registered under the name 'pizza'; being the last
# expression, the result is displayed as the cell output.
cocoGt.getCatIds(catNms=['pizza'])

COCO categories: 
person bicycle car motorcycle airplane bus train truck boat traffic light fire hydrant stop sign parking meter bench bird cat dog horse sheep cow elephant bear zebra giraffe backpack umbrella handbag tie suitcase frisbee skis snowboard sports ball kite baseball bat baseball glove skateboard surfboard tennis racket bottle wine glass cup fork knife spoon bowl banana apple sandwich orange broccoli carrot hot dog pizza donut cake chair couch potted plant bed dining table toilet tv laptop mouse remote keyboard cell phone microwave oven toaster sink refrigerator book clock vase scissors teddy bear hair drier toothbrush



[59]

In [6]:
# Pick the third image id returned for category id 59 (the id the previous
# cell's output shows for 'pizza').
imgId = cocoGt.getImgIds(catIds=[59])[2]
# NOTE(review): the result of this call is neither stored nor displayed.
cocoGt.getAnnIds(imgIds=[imgId])
# Display the metadata record of the selected image.
cocoGt.loadImgs(imgId)[0]

{'license': 1,
 'file_name': 'COCO_train2014_000000548874.jpg',
 'coco_url': 'http://images.cocodataset.org/train2014/COCO_train2014_000000548874.jpg',
 'height': 480,
 'width': 640,
 'date_captured': '2013-11-19 21:14:26',
 'flickr_url': 'http://farm1.staticflickr.com/139/404544387_58a31e803d_z.jpg',
 'id': 548874}

In [10]:
# Display the fourth annotation record attached to the image held in `imgId`.
cocoGt.loadAnns(cocoGt.getAnnIds(imgIds=imgId))[3]

{'segmentation': [[380.32,
   126.69,
   379.25,
   123.92,
   372.86,
   119.45,
   371.58,
   112.42,
   377.33,
   103.47,
   383.94,
   98.78,
   384.58,
   97.93,
   395.23,
   103.25,
   401.62,
   107.73,
   403.54,
   116.04,
   403.11,
   121.36,
   400.34,
   123.07,
   403.33,
   124.99,
   405.03,
   128.39,
   402.69,
   133.08,
   399.7,
   136.28,
   402.47,
   143.31,
   399.28,
   147.57,
   390.97,
   146.29,
   385.43,
   140.32,
   382.66,
   135.21,
   381.17,
   129.46]],
 'area': 1039.02765,
 'iscrowd': 0,
 'image_id': 548874,
 'bbox': [371.58, 97.93, 33.45, 49.64],
 'category_id': 56,
 'id': 1562108}

In [36]:
# Metadata record for the image with the hard-coded id 548874.
imginfo = cocoGt.loadImgs(548874)[0]
# Display the 'area' field of the third annotation of the image held in `imgId`.
cocoGt.loadAnns(cocoGt.getAnnIds(imgIds=imgId))[2]['area']

1126.0780500000008

In [35]:
# Category ids for the three classes of interest.
catIds = cocoGt.getCatIds(catNms=['bus', 'cat', 'pizza'])
# One set of image ids per category ...
sets = [set(cocoGt.getImgIds(catIds=one_id)) for one_id in catIds]
# ... merged pairwise into the set of images containing at least one of them.
imgIds = functools.reduce(set.union, sets)
len(imgIds)

7799

In [44]:
class DataInfo:
    """Location of a COCO split plus the settings used to build a dataset from it.

    Args:
        dir: Root directory of the COCO data.
        type: Name of the split, used both as the image sub-directory name and
            in the annotation file name (for example ``'train2014'``).
        categories: Category names of interest; stored unchanged in ``ctgs``.
        height: Target image height in pixels (stored as ``h``).
        width: Target image width in pixels (stored as ``w``).
        min_area: Threshold compared against an annotation's ``'area'`` field
            (stored as ``doArea``).

    Attributes:
        annFile: ``<dir>/annotations/instances_<type>.json``.
    """

    def __init__(self, dir='./coco', *, type='train2014', categories=None,
                 height=256, width=256, min_area=40000) -> None:
        self.dir  = dir
        self.type = type
        self.annFile = '%s/annotations/%s_%s.json'%(self.dir,'instances',self.type)
        # target images' information:
        self.ctgs = categories
        self.h = height
        self.w = width
        self.doArea = min_area

def center_to_corner(cbox):
    """Turn a ``[a, b, da, db]`` box into ``[a, a + da, b, b + db]``.

    Note that, despite the name, the first two entries are used directly as
    the starting corner (nothing is halved), and the result interleaves the
    two axes: start/end along the first axis, then start/end along the second.

    Args:
        cbox: Indexable with at least four numeric entries.

    Returns:
        A new four-element list.
    """
    start_a, start_b = cbox[0], cbox[1]
    extent_a, extent_b = cbox[2], cbox[3]
    return [start_a, start_a + extent_a, start_b, start_b + extent_b]

class MyDataset(torch.utils.data.Dataset):
    """Single-object localisation dataset built from COCO annotations.

    For every image containing one of the requested categories, the first
    non-crowd annotation of such a category whose area reaches
    ``data.doArea`` is kept (at most one annotation per image). A copy of the
    image resized to ``data.w`` x ``data.h`` is cached in ``save_dir`` and
    served by ``__getitem__``.

    Args:
        data: ``DataInfo`` describing the COCO split, categories and sizes.
        save_dir: Directory holding the resized image copies.
        update: When true, regenerate resized copies even if they exist.
    """

    xform = tvt.Compose([
        tvt.ToTensor(),
        # transform to range [-1, 1]:
        tvt.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ])

    def __init__(self, data: DataInfo, *, save_dir="./resized", \
            update=False
        ):

        cocoGt   = COCO(data.annFile)
        self.coco= cocoGt
        catType  = data.ctgs
        self.catIds = cocoGt.getCatIds(catNms=catType)
        self.dir = save_dir
        self.dataInfo = data
        # COCO category id -> contiguous class index (position in `catType`).
        self.catId_to_label = {cocoGt.getCatIds(catNms=[name])[0]: label
                               for label, name in enumerate(catType)}
        # Contiguous class index -> category name.
        self.label_to_cat = dict(enumerate(catType))

        # Ids of the selected annotations, one per kept image.
        self.anns = self.gen_data_id(cocoGt, catType, update)

    def gen_data_id(self, cocoGt, catType, update=False):
        """Select one annotation id per image and make sure its resized copy exists.

        Args:
            cocoGt: Loaded ``COCO`` annotation index.
            catType: Category names to look for.
            update: Forwarded to ``gen_resized_image``.

        Returns:
            List of annotation ids, ordered by ascending image id.
        """
        catIds = cocoGt.getCatIds(catNms=catType)
        # Union of the images containing at least one requested category.
        imgIds = set()
        for catId in catIds:
            imgIds.update(cocoGt.getImgIds(catIds=[catId]))

        # Kept separate from the per-image list below: sharing one name made
        # the loop append to the list it was iterating and lose all results.
        ann_ids = []
        # Sorted so that the dataset order is reproducible between runs.
        for imgId in sorted(imgIds):
            img_anns = cocoGt.loadAnns(cocoGt.getAnnIds(imgIds=imgId, iscrowd=False))
            for ann in img_anns:
                if ann['category_id'] in catIds \
                and ann['area'] >= self.dataInfo.doArea:
                    ann_ids.append(ann['id'])
                    self.gen_resized_image(imgId, update)
                    # keep only the first qualifying annotation of this image
                    break
        return ann_ids

    def resize(self, im, bbox):
        """Scale a box by the image size so every entry is relative to it.

        Args:
            im: Image record providing ``'width'`` and ``'height'``.
            bbox: Four numbers; entries 0 and 2 are divided by the width,
                entries 1 and 3 by the height.

        Returns:
            New four-element list of scaled values.
        """
        w_ori = im['width']
        h_ori = im['height']
        return [bbox[0]/w_ori, bbox[1]/h_ori, \
                bbox[2]/w_ori, bbox[3]/h_ori]

    def gen_resized_image(self, imgId, update):
        """Write the resized RGB copy of an image unless a cached copy is reused.

        Args:
            imgId: COCO image id.
            update: When true, overwrite an existing copy.
        """
        im = self.coco.loadImgs(imgId)[0]
        save_path = '%s/%s'%(self.dir, im['file_name'])
        if not update and os.path.exists(save_path):
            # Cached copy is reused; skip opening and decoding the original.
            return
        orig_path = '%s/%s/%s'%(self.dataInfo.dir, self.dataInfo.type, im['file_name'])
        os.makedirs(self.dir, exist_ok=True)
        with Image.open(orig_path) as img:
            # Always store 3-channel images so the normalisation in `xform` applies.
            resized = img.convert('RGB').resize(
                (self.dataInfo.w, self.dataInfo.h), resample=Image.Resampling.LANCZOS)
        resized.save(save_path)

    def __len__(self):
        """Number of selected annotations (one per kept image)."""
        return len(self.anns)

    def __getitem__(self, index):
        """Return ``(image, label, box)`` for the ``index``-th selected annotation.

        Returns:
            image: Resized image passed through ``xform``.
            label: Integer class index from ``catId_to_label``.
            box: Float tensor with the annotation's ``'bbox'`` scaled by the
                original image size and reordered by ``center_to_corner``.
        """
        ann = self.coco.loadAnns(self.anns[index])[0]
        im  = self.coco.loadImgs(ann['image_id'])[0]
        # Read the resized copy written by gen_resized_image, not the original
        # file, so every sample has the same spatial size and three channels.
        path = '%s/%s'%(self.dir, im['file_name'])
        with Image.open(path) as pil_image:
            tensor_img = self.xform(pil_image.convert('RGB'))
        bbox = ann['bbox']
        tensor_lab = self.catId_to_label[ann['category_id']]
        new_bbox = center_to_corner(self.resize(im, bbox))
        new_bbox = torch.tensor(new_bbox, dtype=torch.float)
        return tensor_img, tensor_lab, new_bbox

# Describe the train2014 split restricted to three categories, then build the
# dataset from it. update=False is passed explicitly.
datainfo = DataInfo(type='train2014', categories=['bus', 'cat', 'pizza'])
dataset  = MyDataset(datainfo, update=False)

# Marker printed once dataset construction has returned.
print("done!")

loading annotations into memory...
Done (t=5.84s)
creating index...
index created!


In [None]:
class SkipBlock(nn.Module):
    """Residual block: conv-BN-ReLU, conv-BN, add the shortcut, ReLU.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of output channels.
        ker: Kernel size of the first convolution (the second is always 3x3).
        stride: Stride of the first convolution and of the shortcut projection.
        padding: Padding of the first convolution.

    When ``stride != 1`` or the channel counts differ, the shortcut is a
    strided 1x1 convolution followed by batch norm; otherwise the input is
    added back unchanged.
    """

    def __init__(self, in_ch, out_ch, ker=3, *, stride=1, padding=1) -> None:
        super().__init__()
        self.in_ch, self.out_ch, self.stride = in_ch, out_ch, stride

        # Main path.
        self.conv1 = nn.Conv2d(in_ch, out_ch, ker, stride=stride, padding=padding)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.bn2 = nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU(inplace=True)

        # Shortcut path.
        # would have bugs if ker!=3: the 1x1 projection is only guaranteed to
        # produce the same spatial size as conv1 when 2 * padding == ker - 1.
        shape_preserved = (stride == 1 and in_ch == out_ch)
        self.downsampler = None if shape_preserved else nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 1, stride=stride, padding=0, bias=False),
            nn.BatchNorm2d(out_ch),
        )

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum of both paths."""
        main = self.relu(self.bn1(self.conv1(x)))
        main = self.bn2(self.conv2(main))

        shortcut = x if self.downsampler is None else self.downsampler(x)
        return self.relu(main + shortcut)
    
class HW5Net(nn.Module):
    """Residual backbone with a classification head and a box-regression head.

    The layer sizes noted in the comments assume a (B, 3, 256, 256) input; the
    heads expect the backbone to end with a 256 x 2 x 2 feature map.

    Args:
        in_ch: Number of input image channels.
        out_ch: Number of classes produced by the classification head.
        ngf: Channel count of the first convolution; the second uses ``2 * ngf``.
    """

    def __init__(self, in_ch=3, out_ch=3, ngf=16):
        super(HW5Net, self).__init__()

        # The first convolution layer. Assuming (B, 3, 256, 256) to the input.
        layers = [
            nn.ReflectionPad2d(3),
            nn.Conv2d(in_ch, ngf, 7, padding=0),
            nn.BatchNorm2d(ngf),
            nn.ReLU(inplace=True),
        ]
        # out_size: 256

        # The second convolution layer, downsample only once before skip-block
        layers.extend([
            nn.ReflectionPad2d(2),
            nn.Conv2d(ngf, ngf * 2, 5, stride=3, padding=0),
            nn.BatchNorm2d(ngf * 2),
            nn.ReLU(inplace=True),
        ])
        # out_size: 86

        # The skip-blocks: each group halves the spatial size in its first block.
        new_in_ch = ngf * 2
        num_blocks = [3, 3, 4, 2]
        new_out_chs = [32, 64, 128, 256]
        for num_block, new_out_ch in zip(num_blocks, new_out_chs):
            layers.extend(
                self._gen_skip_blocks(new_in_ch, new_out_ch, num_block, stride=2, padding=1)
            )
            new_in_ch = new_out_ch
        # out_size: 6  (86 -> 43 -> 22 -> 11 -> 6)

        # append, not extend: a single module is not an iterable of modules.
        layers.append(nn.AvgPool2d(3, stride=3, padding=0))
        # out_size: 256x2x2

        # Stored on self so the backbone parameters are registered with the module.
        self.model = nn.Sequential(*layers)

        feat_dim = new_in_ch * 2 * 2

        # The classification head
        self.class_head = nn.Sequential(
            nn.Linear(feat_dim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, out_ch),
        )

        # The regression head: four unbounded outputs, one per box coordinate.
        # NOTE(review): mirrors the classification head; confirm the intended
        # depth and whether the outputs should be squashed to [0, 1].
        self.bbox_head = nn.Sequential(
            nn.Linear(feat_dim, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 4),
        )

    def forward(self, x):
        """Run the backbone and both heads.

        Args:
            x: Image batch; sizes in this class assume (B, in_ch, 256, 256).

        Returns:
            Tuple ``(class_logits, bbox)`` of shapes (B, out_ch) and (B, 4).
        """
        features = torch.flatten(self.model(x), start_dim=1)
        return self.class_head(features), self.bbox_head(features)

    @staticmethod
    def _gen_skip_blocks(in_ch, out_ch, num_layer, *, stride=1, padding=1):
        """Build ``num_layer`` skip-blocks mapping ``in_ch`` to ``out_ch`` channels.

        Only the first block receives ``stride`` and ``padding``; the remaining
        ones use stride 1 and padding 1.

        Returns:
            List of ``SkipBlock`` modules.
        """
        # the first skip-block will downsample the input if necessary.
        layers = [SkipBlock(in_ch, out_ch, stride=stride, padding=padding),]
        for _ in range(1, num_layer):
            # the following skip-blocks will keep the input size unchanged.
            layers.append(SkipBlock(out_ch, out_ch, stride=1, padding=1))
        return layers

        




In [14]:
def calc(x, ker=3, *, stride=1, padding=1):
    """Output length of a convolution along one axis, returned as a float.

    Args:
        x: Input length.
        ker: Kernel size.
        stride: Step between kernel positions.
        padding: Padding added on each side.

    Returns:
        ``(x + 2 * padding - ker) / stride + 1`` rounded down.
    """
    padded_span = x + 2 * padding - ker
    positions = padded_span / stride + 1.0
    return positions // 1

# calc(256, 5, stride=3, padding=1)
# NOTE: only the last expression of a cell is displayed, so the result of the
# next call is computed and then discarded.
calc(256, 5, stride=3, padding=2)
# Displayed as the cell output.
calc(11, 3, stride=2, padding=0)

5.0