In [1]:
!pip install timm

Collecting timm
  Downloading timm-0.4.12-py3-none-any.whl (376 kB)
[?25l[K     |▉                               | 10 kB 20.8 MB/s eta 0:00:01[K     |█▊                              | 20 kB 24.1 MB/s eta 0:00:01[K     |██▋                             | 30 kB 9.5 MB/s eta 0:00:01[K     |███▌                            | 40 kB 10.3 MB/s eta 0:00:01[K     |████▍                           | 51 kB 9.6 MB/s eta 0:00:01[K     |█████▏                          | 61 kB 10.8 MB/s eta 0:00:01[K     |██████                          | 71 kB 8.4 MB/s eta 0:00:01[K     |███████                         | 81 kB 9.2 MB/s eta 0:00:01[K     |███████▉                        | 92 kB 9.9 MB/s eta 0:00:01[K     |████████▊                       | 102 kB 10.4 MB/s eta 0:00:01[K     |█████████▋                      | 112 kB 10.4 MB/s eta 0:00:01[K     |██████████▍                     | 122 kB 10.4 MB/s eta 0:00:01[K     |███████████▎                    | 133 kB 10.4 MB/s eta 0:00:01[K

In [2]:
import os
import torchvision
import torch
from torch import nn
import cv2
import numpy as np
import timm
import urllib
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import shutil

In [3]:
class CFG:
    """Notebook-wide configuration constants."""
    # Size every dataset image is resized to. It is passed as `size` to read_img
    # (and from there to cv2.resize) and is also used to build the full-image
    # fallback box [0, 0, img_size[0]-1, img_size[1]-1] when nothing is detected.
    img_size = (1024, 1024)

In [4]:
# Reference table for the toy dataset. Later cells left-merge it onto the
# predictions on `query_id`, drop its `img_folder` column, and compare its
# `reference_id` against the predicted one.
df = pd.read_csv("https://raw.githubusercontent.com/namnv78/similaritymatching/tuha5/facebook_matching/src/df_train.csv")

In [5]:
%%capture
# Copy the toy dataset archive from the mounted Drive folder into the working
# directory, then extract it there (later cells read `reference_images/` and
# `query_images/`).
# NOTE(review): the source path is an absolute, user-specific Drive location;
# consider moving it to a configurable constant.
shutil.copy("/content/drive/MyDrive/Viettel DTalent/ViettelTalentsPhase2/Phan Nhật Minh/fb_toy_ds.zip", "fb_toy_ds.zip")
# NOTE(review): on a re-run the files already exist; unzip presumably asks
# whether to overwrite, and %%capture would hide that prompt -- confirm, and
# consider `unzip -o`.
!unzip "fb_toy_ds.zip"

In [6]:
# Global compute device: CUDA when PyTorch reports it as available, else CPU.
# The helper functions and model classes below move their tensors to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
def draw_box(img, boxes):
    """Draw every box of a detection dict onto ``img`` as a green rectangle.

    Args:
        img: Image array accepted by ``cv2.rectangle``.
        boxes: Detection dict whose 'boxes' entry is an iterable of tensors
            holding ``x0, y0, x1, y1``. Other entries are not needed for drawing.

    Returns:
        The image returned by ``cv2.rectangle`` for the last box, or ``img``
        itself when there are no boxes.
    """
    for box in boxes['boxes']:
        # Detector coordinates are floats, but cv2.rectangle only accepts
        # integer pixel coordinates, so round each one to a Python int.
        x0, y0, x1, y1 = (int(round(v)) for v in box.detach().cpu().tolist())
        img = cv2.rectangle(img, (x0, y0), (x1, y1), (0, 255, 0), 2)
    return img

def url_to_image(url,size=(600, 600)):
    """Download an image and return it resized and converted from BGR to RGB.

    Args:
        url: Address passed to ``urllib.request.urlopen``.
        size: Target size handed to ``cv2.resize``.

    Returns:
        The decoded image after ``cv2.resize`` and BGR->RGB conversion. Pixel
        values are not rescaled.

    Raises:
        ValueError: if the downloaded bytes cannot be decoded as an image.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule is
    # loaded, so import it explicitly here.
    import urllib.request

    # Close the HTTP response as soon as the payload has been read.
    with urllib.request.urlopen(url) as resp:
        payload = resp.read()

    img = np.asarray(bytearray(payload), dtype="uint8")
    img = cv2.imdecode(img, cv2.IMREAD_COLOR)
    if img is None:
        # imdecode reports failure by returning None rather than raising.
        raise ValueError(f"Could not decode an image from {url}")
    img = cv2.resize(img, size)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    return img

def read_img(img_path, size=(600, 600)):
    """Read an image file and return it resized and converted from BGR to RGB.

    Args:
        img_path: Path of the image file to load.
        size: Target size handed to ``cv2.resize``.

    Returns:
        The loaded image after ``cv2.resize`` and BGR->RGB conversion. Pixel
        values are not rescaled here.

    Raises:
        FileNotFoundError: if ``img_path`` is not an existing file.
        ValueError: if the file exists but cannot be decoded as an image.
    """
    # cv2.imread returns None instead of raising, which would otherwise only
    # surface as an opaque error inside cv2.resize; fail early and clearly.
    if not os.path.isfile(img_path):
        raise FileNotFoundError(f"Image file not found: {img_path}")
    img = cv2.imread(img_path)
    if img is None:
        raise ValueError(f"Could not decode image file: {img_path}")
    img = cv2.resize(img, size)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img

# imgs = [url_to_image('http://image.vietnamnews.vn/uploadvnnews/Storage/Images/2015/10/7/21a.jpg'), 
#         url_to_image('https://media-cdn.tripadvisor.com/media/photo-s/15/26/b8/f3/generala-drapsina-street.jpg')]

# img_tensor = torch.stack([torch.tensor(im, dtype=torch.float) for im in imgs])
# img_tensor = img_tensor.permute(0, 3, 1, 2)
# img_tensor.shape # B x C x H x W

In [8]:
def process_bbox(detected, iou_thresh=0.5, score_thresh=0.5):
    '''
        Perform nms and remove low score boxes
        Args:
            detected: Dictionary for boxes detected in an image
                boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format
                labels (Tensor[N]): the predicted label of each box
                scores (Tensor[N]): the score of each box
            iou_thresh: iou threshold for nms use, value range (0, 1)
            score_thresh: box count as valid if its score>score_thresh, value range (0, 1)
        Returns:
            Dict with the same three keys holding only the surviving boxes, moved to
            the global ``device``. When no box survives, a single placeholder entry is
            returned instead: a box covering the whole ``CFG.img_size`` image with
            label -1 and score 0.
    '''
    boxes, labels, scores = detected['boxes'], detected['labels'], detected['scores']

    # nms returns the kept indices ordered by decreasing score, so masking on the
    # score threshold keeps exactly the leading run a "stop at first failure" loop
    # would keep, without a Python-level iteration per box.
    keep_idx = torchvision.ops.nms(boxes, scores, iou_threshold=iou_thresh)
    keep_idx = keep_idx[scores[keep_idx] > score_thresh]

    if keep_idx.numel() > 0:
        return {
            'boxes': boxes[keep_idx].to(device),
            'labels': labels[keep_idx].to(device),
            'scores': scores[keep_idx].to(device),
        }

    # Nothing survived: fall back to one box spanning the full image so that
    # downstream RoI pooling still yields exactly one embedding for this image.
    return {
        'boxes': torch.tensor([[0, 0, CFG.img_size[0]-1, CFG.img_size[1]-1]], dtype=torch.float).to(device),
        'labels': torch.tensor([-1]).to(device),
        # Float dtype, matching the dtype of real detection scores.
        'scores': torch.tensor([0], dtype=torch.float).to(device),
    }

In [9]:
def process_yolo_format(detections):
    """Convert a YOLOv5 detection result into a list of per-image box dicts.

    Args:
        detections: Object supporting ``len()`` (number of images) and exposing
            ``xyxy``, a per-image sequence of ``Tensor[N, 6]`` whose rows are
            ``x1, y1, x2, y2, confidence, class``.

    Returns:
        List with one dict per image, each holding:
            boxes (FloatTensor[N, 4]): boxes in ``[x1, y1, x2, y2]`` format
            labels (Int64Tensor[N]): class index of each box
            scores (FloatTensor[N]): confidence of each box
        All tensors are moved to the global ``device``. An image without any
        detection gets a single placeholder entry: a box covering the whole
        ``CFG.img_size`` image with label -1 and score 0.
    """
    out = []
    for i in range(len(detections)):
        pred = detections.xyxy[i]
        if pred.shape[0] > 0:
            boxes = {
                'boxes': pred[:, :4].to(device),
                # Column 5 is the class id and column 4 the confidence; reading
                # them the other way round swaps labels and scores.
                'labels': pred[:, 5].long().to(device),
                'scores': pred[:, 4].to(device),
            }
        else:
            # No detection: use the full image as the only region so downstream
            # RoI pooling still yields one embedding for this image.
            boxes = {
                'boxes': torch.tensor([[0, 0, CFG.img_size[0]-1, CFG.img_size[1]-1]], dtype=torch.float).to(device),
                'labels': torch.tensor([-1]).to(device),
                'scores': torch.tensor([0], dtype=torch.float).to(device),
            }
        out.append(boxes)
    return out

class ObjectFeatureExtract(nn.Module):
    '''
        Detect objects with a pretrained Faster R-CNN and pool one feature patch per
        detected box from a pretrained EfficientNet-B4 feature map.

        Args:
            x(Tensor[B, C, H, W]): Input images in a batch with B elements.
                Each element contains C channels of dimensions H x W
            names: name of each image, in batch order

        Returns:
            List[B]: one dict per image (the output of ``process_bbox`` plus):
                name: the matching entry of ``names``
                embedding(Tensor[N, E, 3, 3]): RoI-aligned features for the N boxes
                    of that image, E being the channel count of the feature map
    '''
    def __init__(self):
        super().__init__()
        self.model_detection = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        self.model_emb = timm.create_model('efficientnet_b4', pretrained=True)
        # NOTE(review): spatial_scale=1/32 presumably matches the stride of the
        # EfficientNet feature map relative to the input -- confirm.
        self.roi_align = torchvision.ops.RoIAlign(
            output_size=(3,3),
            spatial_scale=1.0/32,
            sampling_ratio=-1,
            aligned=True,
        )

    def forward(self, x, names):
        raw_detections = self.model_detection(x)
        # NOTE(review): x is fed to the embedding backbone exactly as received;
        # confirm whether the pretrained weights expect mean/std-normalised input.
        feature_map = self.model_emb.forward_features(x)
        detected = [process_bbox(det, iou_thresh=0.4, score_thresh=0.3) for det in raw_detections]

        # RoIAlign returns the pooled patches of all images stacked along dim 0,
        # in the same order as the per-image box lists it was given.
        objects_emb = self.roi_align(feature_map, [det['boxes'] for det in detected])

        # Hand each image its own consecutive slice of the stacked patches.
        offset = 0
        for det, name in zip(detected, names):
            n_boxes = det['boxes'].shape[0]
            det['name'] = name
            det['embedding'] = objects_emb.narrow(0, offset, n_boxes).to(device)
            offset += n_boxes

        return detected

class ObjectFeatureExtractYOLO(nn.Module):
    '''
        Detect objects with YOLOv5 (``yolov5l6`` from torch.hub) and pool one feature
        patch per detected box from a pretrained EfficientNet-B4 feature map.

        Args:
            x(Tensor[B, C, H, W]): Input images in a batch with B elements.
                Each element contains C channels of dimensions H x W.
                Used for the embedding backbone only.
            img_raw: the same images in the form handed directly to the YOLOv5
                hub model (the dataset in this notebook passes the raw RGB arrays)
            names: name of each image, in batch order

        Returns:
            List[B]: one dict per image (the output of ``process_yolo_format`` plus):
                name: the matching entry of ``names``
                embedding(Tensor[N, E, 7, 7]): RoI-aligned features for the N boxes
                    of that image, E being the channel count of the feature map
    '''
    def __init__(self):
        super().__init__()
        self.model_detection = torch.hub.load('ultralytics/yolov5', 'yolov5l6')
        self.model_emb = timm.create_model('efficientnet_b4', pretrained=True)
        # NOTE(review): spatial_scale=1/32 presumably matches the stride of the
        # EfficientNet feature map relative to the input -- confirm.
        self.roi_align = torchvision.ops.RoIAlign(
            output_size=(7,7),
            spatial_scale=1.0/32,
            sampling_ratio=-1,
            aligned=True,
        )

    def forward(self, x, img_raw, names):
        yolo_result = self.model_detection(img_raw)
        # NOTE(review): x is fed to the embedding backbone exactly as received;
        # confirm whether the pretrained weights expect mean/std-normalised input.
        feature_map = self.model_emb.forward_features(x)
        detected = process_yolo_format(yolo_result)

        # RoIAlign returns the pooled patches of all images stacked along dim 0,
        # in the same order as the per-image box lists it was given.
        objects_emb = self.roi_align(feature_map, [det['boxes'] for det in detected])

        # Hand each image its own consecutive slice of the stacked patches.
        offset = 0
        for det, name in zip(detected, names):
            n_boxes = det['boxes'].shape[0]
            det['name'] = name
            det['embedding'] = objects_emb.narrow(0, offset, n_boxes).to(device)
            offset += n_boxes

        return detected

In [10]:
# Build the YOLO-based extractor (this downloads/loads the pretrained detector
# and embedding weights) and place it on the selected device in one step.
feature_model = ObjectFeatureExtractYOLO().to(device)

Downloading: "https://github.com/ultralytics/yolov5/archive/master.zip" to /root/.cache/torch/hub/master.zip


[31m[1mrequirements:[0m PyYAML>=5.3.1 not found and is required by YOLOv5, attempting auto-update...


YOLOv5 🚀 2021-8-19 torch 1.9.0+cu102 CUDA:0 (Tesla K80, 11441.1875MB)



Collecting PyYAML>=5.3.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
Installing collected packages: PyYAML
  Attempting uninstall: PyYAML
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed PyYAML-5.4.1

[31m[1mrequirements:[0m 1 package updated per /root/.cache/torch/hub/ultralytics_yolov5_master/requirements.txt
[31m[1mrequirements:[0m ⚠️ [1mRestart runtime or rerun command for updates to take effect[0m

Downloading https://github.com/ultralytics/yolov5/releases/download/v5.0/yolov5l6.pt to /root/.cache/torch/hub/ultralytics_yolov5_master/yolov5l6.pt...


  0%|          | 0.00/148M [00:00<?, ?B/s]




Fusing layers... 
Model Summary: 501 layers, 77218620 parameters, 0 gradients
Adding AutoShape... 
Loading pretrained weights from url (https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b4_ra2_320-7eb33cd5.pth)
Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b4_ra2_320-7eb33cd5.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b4_ra2_320-7eb33cd5.pth


In [11]:
%%capture
# Put the extractor into evaluation mode before running inference; %%capture
# suppresses the output this cell would otherwise display.
feature_model.eval()

In [12]:
# imgs = [url_to_image('http://image.vietnamnews.vn/uploadvnnews/Storage/Images/2015/10/7/21a.jpg'), 
#         url_to_image('https://media-cdn.tripadvisor.com/media/photo-s/15/26/b8/f3/generala-drapsina-street.jpg')]

# img_tensor = torch.stack([torch.tensor(im/255, dtype=torch.float) for im in imgs])
# img_tensor = img_tensor.permute(0, 3, 1, 2)
# img_tensor.shape # B x C x H x W
# detected = feature_model(img_tensor.to(device), imgs, ['a','b'])

# Prepare Data

In [13]:
from torch.utils.data import Dataset, DataLoader

class ImageDataset(Dataset):
    """Dataset over every file of a directory, visited in sorted filename order.

    Each sample is a dict with:
        image_raw: the image returned by ``read_img`` at ``CFG.img_size``
        image: the same image scaled by 1/255 as a float tensor, with the last
            (channel) axis moved first
        name: the file name without its extension
    """
    def __init__(self, root_dir):
        self.root_dir = root_dir
        # Sorted so that sample order is deterministic across runs.
        self.image_paths = sorted(os.listdir(root_dir))

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        image = read_img(os.path.join(self.root_dir, path), size=CFG.img_size)
        image_t = torch.tensor(image/255, dtype=torch.float).permute(2, 0, 1)
        sample={
            "image_raw": image,
            "image": image_t,
            # splitext strips only the final extension, so a name that itself
            # contains dots is no longer cut off at its first dot.
            "name": os.path.splitext(path)[0]
            }
        return sample

class Collate:
    """Batch builder for ImageDataset samples, used as a DataLoader ``collate_fn``."""

    def __call__(self, batch):
        """Return ``(images, images_raw, names)`` for a list of sample dicts.

        ``images`` is the samples' "image" tensors stacked along a new leading
        batch axis; ``images_raw`` and ``names`` stay plain Python lists in
        sample order.
        """
        images = torch.stack([sample["image"] for sample in batch], dim=0)
        images_raw = [sample["image_raw"] for sample in batch]
        names = [sample["name"] for sample in batch]
        return images, images_raw, names

In [14]:
# Reference and query sets come from the two folders extracted from the archive.
ref_ds = ImageDataset("reference_images")
query_ds = ImageDataset("query_images")
# shuffle=False keeps batches in the datasets' sorted filename order, so the
# lists built from these loaders follow that order too.
ref_dl = DataLoader(ref_ds, batch_size=4, shuffle=False, collate_fn=Collate(), num_workers=2)
query_dl = DataLoader(query_ds, batch_size=4, shuffle=False, collate_fn=Collate(), num_workers=2)

In [15]:
# for batch_idx, (image, name) in enumerate(ref_dl):
#     print(image.shape)
#     print(name)
#     break

# Build database

In [17]:
# One entry per reference image: the dict returned by feature_model for that
# image (boxes, labels, scores, name and per-box embedding).
database = []
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch_idx, (images, images_raw, names) in enumerate(tqdm(ref_dl)):
        # Only the tensor batch is moved to the device; the raw images are passed
        # on unchanged for the detector.
        images = images.to(device)
        detected = feature_model(images, images_raw, names)
        database.extend(detected)

100%|██████████| 124/124 [02:28<00:00,  1.20s/it]


# Inference

In [18]:
# One entry per query image: the dict returned by feature_model for that image
# (boxes, labels, scores, name and per-box embedding).
queries = []
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch_idx, (images, images_raw, names) in enumerate(tqdm(query_dl)):
        # Only the tensor batch is moved to the device; the raw images are passed
        # on unchanged for the detector.
        images = images.to(device)
        detected = feature_model(images, images_raw, names)
        queries.extend(detected)

100%|██████████| 124/124 [02:29<00:00,  1.20s/it]


In [19]:
# Cosine similarity between two flat vectors (kept for interactive use).
cos = nn.CosineSimilarity(dim=0)

def compute_max_cosine_dist(query, ref):
    """Return the highest cosine similarity between any query box and any ref box.

    Despite the name, the value is a similarity (higher means more alike), not a
    distance.

    Args:
        query: Detection dict with 'scores' (one entry per box) and 'embedding'
            (one feature block per box along dim 0).
        ref: Detection dict of the same structure.

    Returns:
        0-dim tensor holding the maximum similarity over all box pairs, or
        ``float('-inf')`` when either side has no boxes.
    """
    n_query, n_ref = len(query['scores']), len(ref['scores'])
    max_score = float('-inf')
    if n_query == 0 or n_ref == 0:
        return max_score

    # Flatten every box embedding once, instead of once per (query, ref) pair.
    query_flat = torch.flatten(query['embedding'][:n_query], start_dim=1)
    ref_flat = torch.flatten(ref['embedding'][:n_ref], start_dim=1)

    for query_vec in query_flat:
        # Compare this query box against all reference boxes in a single call;
        # expand_as creates a broadcast view, not a copy.
        sims = nn.functional.cosine_similarity(
            query_vec.unsqueeze(0).expand_as(ref_flat), ref_flat, dim=1
        )
        best = sims.max()
        if best > max_score:
            max_score = best
    return max_score

In [20]:
# Column-wise accumulator that is turned into a DataFrame in the next cell.
result = {"query_id":[], "reference_id":[], "score":[]}
# A match is only reported when its best similarity is strictly above this value.
thresh = 0.0
# Brute-force search: every query is compared against every database entry.
for query in tqdm(queries):
    max_score = float('-inf')
    max_ref = None
    for ref in database:
        # `dist` is the maximum cosine similarity over all box pairs, so a
        # higher value means a better match.
        dist = compute_max_cosine_dist(query, ref)
        if dist > max_score:
            max_ref = ref
            max_score = dist

    result["query_id"].append(query['name'])
    if max_score>thresh:
        result["reference_id"].append(max_ref['name'])
        # NOTE(review): this stores a 0-d numpy array per row rather than a plain
        # float; consider `.item()` if a numeric column is wanted -- confirm
        # downstream expectations first.
        result["score"].append(max_score.detach().cpu().numpy())
    else:
        result["reference_id"].append(None)
        result["score"].append(None)
    

100%|██████████| 496/496 [04:12<00:00,  1.97it/s]


In [21]:
# One row per query: predicted reference id and its similarity score.
res_df = pd.DataFrame.from_dict(result)
# Display only: the sorted copy is not assigned back, so res_df keeps its
# original row order.
res_df.sort_values(by=['query_id'])

Unnamed: 0,query_id,reference_id,score
0,Q00029,R449924,0.8050856
1,Q00037,R426084,0.5255693
2,Q00071,R460297,0.37858889
3,Q00096,R402336,0.39322084
4,Q00117,R414344,0.45486158
...,...,...,...
491,Q24897,R475759,0.9318329
492,Q24929,R449924,0.82845014
493,Q24938,R403105,0.24349506
494,Q24985,R426084,0.6379663


In [22]:
# Attach the reference table's columns to each prediction. After the merge,
# `reference_id_x` is the predicted id (from res_df) and `reference_id_y` is the
# id listed in df.
# NOTE(review): a later cell indexes `queries` with this frame's row index, which
# assumes df has at most one row per query_id -- confirm.
compare = res_df.merge(df, how='left', on='query_id').drop(columns=["img_folder"])
compare.head(10)

Unnamed: 0,query_id,reference_id_x,score,reference_id_y
0,Q00029,R449924,0.8050856,R487691
1,Q00037,R426084,0.5255693,R450692
2,Q00071,R460297,0.37858889,R494641
3,Q00096,R402336,0.39322084,R402336
4,Q00117,R414344,0.45486158,R414344
5,Q00185,R449924,0.9422654,R472314
6,Q00220,R409236,0.85470927,R409236
7,Q00274,R449924,0.7934424,R405525
8,Q00294,R449924,0.93569446,R462833
9,Q00369,R426084,0.60254717,R419725


In [23]:
# Persist the comparison table to the Drive folder.
# NOTE(review): the file name says "yolo_3x3", but the YOLO extractor defined in
# this notebook pools 7x7 patches (3x3 is the Faster R-CNN variant) -- confirm
# the intended name before relying on it.
compare.to_csv("/content/drive/MyDrive/Viettel DTalent/ViettelTalentsPhase2/Phan Nhật Minh/toy_ds_result_yolo_3x3.csv", index=False)

# Stats

In [24]:
# Total number of rows in the comparison table.
print(compare.shape[0])

496


In [25]:
# Rows where the predicted id equals the id from df, ordered from the lowest to
# the highest similarity score.
compare[compare['reference_id_x']==compare['reference_id_y']].sort_values(by="score")

Unnamed: 0,query_id,reference_id_x,score,reference_id_y
23,Q00917,R490086,0.17640966,R490086
44,Q01896,R430516,0.23727703,R430516
299,Q14665,R412010,0.26245463,R412010
225,Q11322,R477836,0.28722432,R477836
127,Q06245,R412892,0.29695788,R412892
...,...,...,...,...
42,Q01849,R494882,0.98508084,R494882
118,Q05789,R425907,0.98669493,R425907
342,Q16658,R431230,0.99186563,R431230
113,Q05628,R470146,0.99189997,R470146


In [26]:
# Number of rows whose predicted id equals the id from df.
n_correct = compare[compare['reference_id_x']==compare['reference_id_y']].shape[0]
print(n_correct)

157


In [27]:
# Share of rows with a correct prediction (correct rows / all rows).
precision = n_correct/compare.shape[0]
print(precision)

0.3165322580645161


In [28]:
# The ten most frequently predicted reference ids; a heavily repeated id means
# many different queries were matched to the same reference image.
compare['reference_id_x'].value_counts()[:10]

R449924    158
R426084     44
R459491     16
R490797     16
R490086      6
R469218      6
R404216      5
R458376      4
R403105      4
R449418      3
Name: reference_id_x, dtype: int64

In [29]:
def get_no_bbox_count(imgs_dict):
    """Count entries whose boxes are exactly the single full-image placeholder box.

    Args:
        imgs_dict: Iterable of detection dicts with a 'boxes' tensor.

    Returns:
        Number of entries whose 'boxes' equal
        ``[[0, 0, CFG.img_size[0]-1, CFG.img_size[1]-1]]``, the value used in this
        notebook when an image has no detection.
    """
    full_image_box = np.array([[0, 0, CFG.img_size[0]-1, CFG.img_size[1]-1]])
    return sum(
        1
        for entry in imgs_dict
        if np.array_equal(entry['boxes'].detach().cpu().numpy(), full_image_box)
    )
# How many query / reference images ended up with only the full-image
# placeholder box.
print("Query: {}".format(get_no_bbox_count(queries)))
print("Reference: {}".format(get_no_bbox_count(database)))

Query: 276
Reference: 199


In [30]:
# Rows where the predicted id differs from the id in df; the original row index
# of `compare` is kept.
incorrect = compare[compare['reference_id_x']!=compare['reference_id_y']]
incorrect

Unnamed: 0,query_id,reference_id_x,score,reference_id_y
0,Q00029,R449924,0.8050856,R487691
1,Q00037,R426084,0.5255693,R450692
2,Q00071,R460297,0.37858889,R494641
5,Q00185,R449924,0.9422654,R472314
7,Q00274,R449924,0.7934424,R405525
...,...,...,...,...
490,Q24890,R449924,0.76888067,R496878
492,Q24929,R449924,0.82845014,R460297
493,Q24938,R403105,0.24349506,R473556
494,Q24985,R426084,0.6379663,R430471


In [31]:
def find_ref_idx(reference_id):
    """Return the position in ``database`` of the first entry named ``reference_id``.

    Returns ``None`` when no entry carries that name.
    """
    positions = (
        pos for pos, entry in enumerate(database) if entry['name'] == reference_id
    )
    return next(positions, None)

In [32]:
# Placeholder box used in this notebook for an image without any detection.
reference_array = np.array([[0,0,CFG.img_size[0]-1, CFG.img_size[1]-1]])

# Look detections up by image name instead of by DataFrame row index, so the
# breakdown stays correct even if the frame's index does not line up with list
# positions. Iterating in reverse makes the first occurrence of a name win.
queries_by_name = {item['name']: item for item in reversed(queries)}
database_by_name = {item['name']: item for item in reversed(database)}

count_both = 0
count_query = 0
count_ref = 0
count_other = 0
count_unmatched = 0
for _, row in incorrect.iterrows():
    query_item = queries_by_name.get(row['query_id'])
    ref_item = database_by_name.get(row['reference_id_x'])
    if query_item is None or ref_item is None:
        # No prediction for this row (or an unknown id): there are no boxes to
        # inspect, so count it separately instead of failing on the lookup.
        count_unmatched += 1
        continue

    query_no_box = np.array_equal(query_item['boxes'].detach().cpu().numpy(), reference_array)
    ref_no_box = np.array_equal(ref_item['boxes'].detach().cpu().numpy(), reference_array)
    if query_no_box and ref_no_box:
        count_both += 1
    elif query_no_box:
        count_query += 1
    elif ref_no_box:
        count_ref += 1
    else:
        count_other += 1

print(f'Wrong count both query and ref img have no box: {count_both}')
print(f'Wrong count query has no box: {count_query}')
print(f'Wrong count ref has no box: {count_ref}')
print(f'Wrong count both query and ref img have at least 1 box: {count_other}')
if count_unmatched:
    print(f'Wrong count without a usable prediction: {count_unmatched}')

Wrong count both query and ref img have no box: 216
Wrong count query has no box: 0
Wrong count ref has no box: 13
Wrong count both query and ref img have at least 1 box: 110


# Check

In [33]:
def show_image_pairs(query, ref):
    """Show a query image and a reference image side by side.

    Args:
        query: Query image id; the file ``query_images/{query}.jpg`` is loaded.
        ref: Reference image id; the file ``reference_images/{ref}.jpg`` is loaded.
    """
    panels = [
        (f'Query {query}', read_img(f'query_images/{query}.jpg')),
        (f'Reference {ref}', read_img(f'reference_images/{ref}.jpg')),
    ]
    _, axs = plt.subplots(1, 2, figsize=(12, 12))
    for (title, im), ax in zip(panels, axs.flatten()):
        ax.imshow(im)
        # Label each panel so the figure can be read without the calling cell.
        ax.set_title(title)
    plt.show()

In [34]:
# Inspect one reference entry. Only the last expression of a cell is displayed,
# so the boxes on the first line are evaluated but not shown.
database[find_ref_idx("R430516")]['boxes']
database[find_ref_idx("R430516")]['embedding'].shape

torch.Size([1, 1792, 7, 7])

In [35]:
# Correct pair low score
show_image_pairs('Q00917', 'R490086')
print(database[find_ref_idx("R490086")]['boxes'])

tensor([[1.64200e+02, 1.74400e+02, 8.48000e+02, 1.02000e+03],
        [7.64800e+02, 8.38400e+02, 8.65600e+02, 1.02400e+03],
        [1.70000e+00, 8.56800e+02, 2.86400e+02, 1.02320e+03],
        [5.73600e+02, 1.00000e+00, 1.02400e+03, 3.27200e+02],
        [1.05000e+00, 6.11200e+02, 1.54200e+02, 8.98400e+02],
        [3.50000e+00, 2.60000e+00, 3.44800e+02, 4.62400e+02]], device='cuda:0')


In [36]:
# Correct pair high score
show_image_pairs('Q14815', 'R400056')
print(database[find_ref_idx("R400056")]['boxes'])

tensor([[   0.,    0., 1023., 1023.]], device='cuda:0')


In [37]:
# Wrong pair
show_image_pairs('Q00294', 'R449924')
print(database[find_ref_idx("R449924")]['boxes'])

tensor([[   0.,    0., 1023., 1023.]], device='cuda:0')
