https://github.com/google-research/google-research/tree/master/taperception

https://dl.acm.org/doi/fullHtml/10.1145/3491102.3517497#BibPLXBIB0037

Adding notebook to github for version control. 
Download images from google drive to local, or run in Google colab and mount drive.

In [None]:
pip install torchvision numpy pandas scikit-image

In [8]:
#Imports 
import pandas as pd
import json
import numpy as np
import re
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import torchvision
from torchvision import datasets, models, transforms, utils
from torch.utils.data import Dataset, DataLoader
from skimage import io, transform
import torchvision.models as models

In [9]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(42)

<torch._C.Generator at 0x7fab309dec90>

In [10]:
dataset = pd.read_csv('https://raw.githubusercontent.com/google-research-datasets/taperception/main/rico_tap_annotations_idsonly.csv')

json_path = #ADD PATH
img_list_train = #ADD PATH
img_list_test = #ADD PATH
test_image_dir = #ADD PATH
train_image_dir = #ADD PATH

In [11]:
dataset.head()

Unnamed: 0,img_id,object_id,label,raters_marked_tappable,split
0,48694,baf602f,0,5,train
1,57047,7263013,0,4,train
2,8663,5707070,0,5,train
3,57274,4019ce3,1,1,train
4,69212,1e61f08,0,3,train


In [12]:
dataset.describe()

Unnamed: 0,img_id,label,raters_marked_tappable
count,18667.0,18667.0,18667.0
mean,37148.980661,0.32678,3.282316
std,21074.699714,0.469049,1.711987
min,3.0,0.0,0.0
25%,19060.0,0.0,2.0
50%,38054.0,0.0,4.0
75%,55180.0,1.0,5.0
max,72210.0,1.0,5.0


In [13]:
class image:
    
    def __init__(self, dic, object_id):
        self.dic = dic
   
        #image 
        self.image_bounds = self.dic['activity']['root']['rel-bounds']
        self.image_bounds_height = self.image_bounds[3]
        self.image_bounds_width = self.image_bounds[2]
        
        #object
        self.object_bounds = []
        self.object_id = object_id
        self.find_object_bounds(self.dic)
    
    def find_object_bounds(self, dic):
        for key in dic:
            if key == 'pointer' and dic['pointer'] == self.object_id:
                self.object_bounds = dic['rel-bounds']
            if isinstance(dic[key], dict):
                self.find_object_bounds(dic[key])
            elif isinstance(dic[key], list):
                self.find_object_bounds_lst(dic[key])

    def find_object_bounds_lst(self, lst):
        for item in lst:
            if isinstance(item, dict):
                self.find_object_bounds(item)
            elif isinstance(item, list):
                self.find_object_bounds_lst(item)
                
    def get_object_bounds(self):
        return self.object_bounds
    
    def get_image_bounds_height(self):
        return self.image_bounds_height

    def get_image_bounds_width(self):
        return self.image_bounds_width

In [14]:
def get_object_bounds(image_id, object_id):
    json_file = open(json_path + str(image_id) + '.json')
    image_json = json.load(json_file)
    ij = image(image_json, object_id)
    return ij.get_object_bounds(), ij.get_image_bounds_height(), ij.get_image_bounds_width()

In [15]:
class applyMask(object):

    def __call__(self, sample):
        
        image = sample['image']
        
        object_bounds, height, width = get_object_bounds(sample['image_id'], sample['object_id'])
        
        binary_mask = np.zeros(shape=(image.shape[0], image.shape[1]))
        x_ratio_min = object_bounds[0]/width
        x_ratio_max = object_bounds[2]/width
        y_ratio_min = object_bounds[1]/height
        y_ratio_max = object_bounds[3]/height
        
        for x in range(image.shape[0]):
            for y in range(image.shape[1]):
                if x_ratio_min <= x/image.shape[0] < x_ratio_max and y_ratio_min <= y/image.shape[1] < y_ratio_max:
                    binary_mask[x,y] = 1
        concat = np.dstack((image, binary_mask))
    
        return {'image': concat, 'label': sample['label']}
    
class ToTensor(object):
    """Convert ndarrays in sample to Tensors."""

    def __call__(self, sample):
        image, label = sample['image'], sample['label']

        # swap color axis because
        # numpy image: H x W x C
        # torch image: C x H x W
        image = image.transpose((2, 0, 1))
        return {'image': torch.from_numpy(image),
                'label': label}
    

In [16]:
class Tappable(Dataset):

    def __init__(self, img_list, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.img_list = pd.read_csv(img_list)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.img_list)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,
                                self.img_list.iloc[idx, 0])
        
        image = io.imread(img_name)
        image = transform.resize(image, (960, 540))
        
        image_id = re.match(r"(\d+)_*",self.img_list.iloc[idx, 0]).group(1)
        object_id = re.match(r"\d+_(.+).jpg",self.img_list.iloc[idx, 0]).group(1)
        
        label = dataset.loc[(dataset['img_id']==int(image_id)) & (dataset['object_id']==str(object_id))]['label']
        
        sample = {'image': image, 'image_id': image_id, 'object_id': object_id, 'label': int(label) }
        
        if self.transform:
            sample_out = self.transform(sample)

        return sample_out

In [17]:
dataset_train = Tappable(img_list= img_list_train,
                         root_dir= train_image_dir,
                         transform=transforms.Compose([
                         applyMask(),
                         ToTensor()
                         ]))

dataset_test = Tappable(img_list=img_list_test,
                        root_dir= test_image_dir,
                        transform=transforms.Compose([
                        applyMask(),
                        ToTensor()
                        ]))
#batch size should be 1024
dataloader_train = DataLoader(dataset_train,batch_size=20, shuffle=True, pin_memory=True if DEVICE == "cuda" else False)
dataloader_test = DataLoader(dataset_test,batch_size=20, shuffle=True, pin_memory=True if DEVICE == "cuda" else False)

In [19]:
class Block(nn.Module):
    def __init__(self, num_layers, in_channels, out_channels, identity_downsample=None, stride=1):
        super(Block, self).__init__()
        self.num_layers = num_layers
        self.expansion = 1
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels * self.expansion, kernel_size=1, stride=1, padding=0)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.relu = nn.ReLU()
        self.identity_downsample = identity_downsample

    def forward(self, x):
        identity = x
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.conv3(x)
        x = self.bn3(x)

        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        x += identity
        x = self.relu(x)
        return x

class ResNet(nn.Module):
    def __init__(self, num_layers, block, image_channels, num_classes):
        super(ResNet, self).__init__()
        self.expansion = 1
        layers = [2, 2, 2, 2] #resnet 18
        self.in_channels = 64
        self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # ResNetLayers
        self.layer1 = self.make_layers(num_layers, block, layers[0], intermediate_channels=64, stride=1)
        self.layer2 = self.make_layers(num_layers, block, layers[1], intermediate_channels=128, stride=2)
        self.layer3 = self.make_layers(num_layers, block, layers[2], intermediate_channels=256, stride=2)
        self.layer4 = self.make_layers(num_layers, block, layers[3], intermediate_channels=512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * self.expansion, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)
        return x

    def make_layers(self, num_layers, block, num_residual_blocks, intermediate_channels, stride):
        layers = []

        identity_downsample = nn.Sequential(nn.Conv2d(self.in_channels, intermediate_channels*self.expansion, kernel_size=1, stride=stride),
                                            nn.BatchNorm2d(intermediate_channels*self.expansion))
        layers.append(block(num_layers, self.in_channels, intermediate_channels, identity_downsample, stride))
        self.in_channels = intermediate_channels * self.expansion
        for i in range(num_residual_blocks - 1):
            layers.append(block(num_layers, self.in_channels, intermediate_channels)) 
        return nn.Sequential(*layers)

def ResNet18(img_channels=4, num_classes=1000):
    return ResNet(18, Block, img_channels, num_classes)

In [20]:
model = ResNet18()
model = model.to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05, nesterov=True, momentum = 0.9)

lambda_lr = lambda epoch: 0.1 ** epoch if epoch in [100, 500, 1000, 1300] else 1

lr_scheduler = optim.lr_scheduler.LambdaLR(
   optimizer=optimizer,
   lr_lambda = lambda_lr
)

In [24]:
n_epochs = 3 #should be 1500
test_loss = []
train_loss = []
total_step = len(dataloader_train)
valid_loss_min = np.Inf

for epoch in range(1, n_epochs+1):
    model.train()
    running_loss = 0.0
    print(f'Epoch {epoch}\n')
    for batch_idx, item in enumerate(dataloader_train):
        inputs, labels = item['image'].type(torch.FloatTensor).to(DEVICE), item['label'].type(torch.LongTensor).to(DEVICE)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if (batch_idx) % 20 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}'
                   .format(epoch, n_epochs, batch_idx, total_step, loss.item()))
    train_loss.append(running_loss/total_step)
    lr_scheduler.step()

    model.eval()
    batch_loss = 0
    for batch_idx, item in enumerate(dataloader_test):
      data_t, target_t = item['image'].type(torch.FloatTensor).to(DEVICE), item['label'].type(torch.LongTensor).to(DEVICE)
      outputs_t = model(data_t)
      loss_t = criterion(outputs_t, target_t)
      batch_loss += loss_t.item()
      if (batch_idx) % 20 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Test Loss: {:.4f}'
                   .format(epoch, n_epochs, batch_idx, len(dataloader_test), loss_t.item()))

    test_loss.append(batch_loss/len(dataloader_test))

    print(f'\ntrain-loss: {np.mean(train_loss):.4f}, test-loss: {np.mean(test_loss):.4f}')

    network_learned = batch_loss < valid_loss_min

    if network_learned:
          valid_loss_min = batch_loss
          torch.save(model.state_dict(), 'resnet.pt')
          print('Improvement-Detected, save-model')

Epoch 1

Epoch [1/3], Step [0/102], Train Loss: 1.5676
Epoch [1/3], Step [20/102], Train Loss: 2.1983


In [None]:
torch.save(model.state_dict(), 'resnet2.pt')