In [1]:
import torch
import json
import pickle
import os
import numpy as np

# os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

from tqdm import tqdm
from typing import List
from PIL import Image
from pathlib import Path
import torchvision.transforms as tf
from sentence_transformers import SentenceTransformer, util

from model.densecap import densecap_resnet50_fpn, DenseCapModel

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Images per forward pass during inference. describe_images() reads this global
# at call time, and the training cell further down rebinds it to 4.
BATCH_SIZE = 1

In [2]:
def load_model(model_config_path: Path, checkpoint_path: Path, return_features=False, box_per_img=50, verbose=False):
    """Build a DenseCap model from a JSON config and load checkpoint weights into it.

    Args:
        model_config_path: JSON file holding the constructor arguments read
            below (``backbone_pretrained``, ``feat_size``, ``hidden_size``,
            ``max_len``, ``emb_size``, ``rnn_num_layers``, ``vocab_size``,
            ``fusion_type``).
        checkpoint_path: Checkpoint dict with the weights under ``'model'`` and,
            optionally, validation metrics under ``'results_on_val'``.
        return_features: Forwarded to ``densecap_resnet50_fpn``.
        box_per_img: Forwarded as ``box_detections_per_img``.
        verbose: When true, report the loaded checkpoint, any state-dict keys
            that did not match, and the stored validation metrics.

    Returns:
        The constructed model. It is not moved to any device here.

    Raises:
        KeyError: If the config lacks one of the arguments above or the
            checkpoint has no ``'model'`` entry.
    """
    with open(model_config_path, 'r') as f:
        model_args = json.load(f)

    config_keys = ('backbone_pretrained', 'feat_size', 'hidden_size', 'max_len',
                   'emb_size', 'rnn_num_layers', 'vocab_size', 'fusion_type')
    model = densecap_resnet50_fpn(return_features=return_features,
                                  box_detections_per_img=box_per_img,
                                  **{key: model_args[key] for key in config_keys})

    # map_location makes loading independent of the device the checkpoint was
    # saved from, so a CUDA checkpoint also loads on a CPU-only machine.
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # strict=False tolerates key mismatches; keep the report so they are not
    # swallowed silently.
    load_report = model.load_state_dict(checkpoint['model'], strict=False)

    if verbose:
        print('[INFO]: checkpoint {} loaded'.format(checkpoint_path))
        if load_report.missing_keys:
            print('[WARN]: keys missing from checkpoint: {}'.format(load_report.missing_keys))
        if load_report.unexpected_keys:
            print('[WARN]: unexpected keys in checkpoint: {}'.format(load_report.unexpected_keys))

        if 'results_on_val' in checkpoint:
            print('[INFO]: correspond performance on val set:')
            for k, v in checkpoint['results_on_val'].items():
                # Nested dicts are skipped; only scalar metrics are printed.
                if not isinstance(v, dict):
                    print('        {}: {:.3f}'.format(k, v))

    return model

def get_image_paths(parent_folder: Path) -> List[str]:
    """Recursively collect every non-directory entry below ``parent_folder``.

    Directory entries are visited in sorted order so the returned list is
    identical across runs and machines. ``Path.iterdir`` alone yields entries
    in arbitrary order, which would make slices such as ``paths[:N]`` select a
    different subset from run to run.

    No filtering by file type is done: anything that is not a directory is
    returned.

    Args:
        parent_folder: Directory to walk.

    Returns:
        The paths as strings. Within each directory, entries appear in sorted
        order and a sub-directory is expanded in place.
    """
    image_paths = []

    for child in sorted(parent_folder.iterdir()):
        if child.is_dir():
            image_paths.extend(get_image_paths(child))
        else:
            image_paths.append(str(child))

    return image_paths


def img_to_tensor(img_list, device):
    """Load image files as RGB tensors placed on ``device``.

    Args:
        img_list: Iterable of image paths accepted by ``Image.open``.
        device: Target handed to ``Tensor.to`` for every image.

    Returns:
        A list with one tensor per input path, in input order.
    """
    def _load(path):
        # Force three channels regardless of the file's own colour mode.
        rgb = Image.open(path).convert("RGB")
        return tf.ToTensor()(rgb).to(device)

    return [_load(path) for path in img_list]


def describe_images(model: DenseCapModel, img_list: List[str], device: torch.device, batch_size: int = None):
    """Run the model in eval mode over image files and collect its outputs on the CPU.

    Args:
        model: Model to run; it is moved to ``device`` and switched to eval mode.
        img_list: Paths of the images to process, in order.
        device: Device used for the model and the image tensors.
        batch_size: Images per forward pass. When omitted, the module-level
            ``BATCH_SIZE`` is looked up at call time (the previous behaviour).
            Pass it explicitly to stay independent of later cells that rebind
            that global.

    Returns:
        The result dicts returned by the model, concatenated over all batches,
        with every value moved to the CPU.

    Raises:
        ValueError: If the effective batch size is smaller than 1.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got {}".format(batch_size))

    model.to(device)
    model.eval()

    all_results = []

    with torch.no_grad():
        for start in tqdm(range(0, len(img_list), batch_size)):
            image_tensors = img_to_tensor(img_list[start:start + batch_size], device=device)

            # Move outputs off the GPU immediately so memory use does not
            # grow with the number of processed images.
            for result in model(image_tensors):
                all_results.append({k: v.cpu() for k, v in result.items()})

    return all_results

In [4]:
# Vocabulary look-up tables are unpickled from a local file.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
lut_path = Path("./data/VG-regions-dicts-lite.pkl")

with open(lut_path, 'rb') as f:
    look_up_tables = pickle.load(f)

# Both directions of the token <-> index mapping are used further down:
# idx_to_token to decode model output, token_to_idx to re-encode captions.
idx_to_token = look_up_tables['idx_to_token']
token_to_idx = look_up_tables['token_to_idx']

# The config is read from <params_path>/<model_name>/config.json and the
# weights from <params_path>/<model_name>.pth.tar.
params_path = Path("compute_model_params")
model_name = "without_aux"
model = load_model(
    params_path / model_name / "config.json", 
    params_path / (model_name + ".pth.tar"), 
    return_features=False, verbose=True)

# Every non-directory entry below this folder is treated as an image.
img_paths = get_image_paths(Path("../3-dance/data/car_images_model_sort/"))

# === inference ====
# Only the first 4000 paths are run through the model; img_paths itself is
# left unsliced.
all_results = describe_images(model, img_paths[:4000], device)

[INFO]: checkpoint compute_model_params/without_aux.pth.tar loaded
[INFO]: correspond performance on val set:
        map: 0.108
        detmap: 0.264


100%|██████████| 4000/4000 [19:27<00:00,  3.43it/s]


In [4]:
def postprocess_results(results, img_paths: List[str], idx_to_token):
    """Convert raw per-image model output into plain-Python region records.

    Paths and results are paired positionally; pairing stops at the shorter
    of the two sequences.

    Args:
        results: Per-image dicts with ``'boxes'``, ``'caps'`` and ``'scores'``
            entries that are iterated in parallel.
        img_paths: Image paths, in the same order as ``results``.
        idx_to_token: Lookup from token index to token string.

    Returns:
        Dict mapping each paired image path to a list of regions, each a dict
        with ``'box'`` (coordinates rounded to 2 decimals), ``'score'``
        (rounded to 2 decimals) and ``'cap'`` (tokens joined by spaces, with
        ``<pad>``, ``<bos>`` and ``<eos>`` dropped).
    """
    special_tokens = ('<pad>', '<bos>', '<eos>')

    def _decode(token_ids):
        words = (idx_to_token[token_id] for token_id in token_ids)
        return ' '.join(word for word in words if word not in special_tokens)

    decoded = {}
    for path, output in zip(img_paths, results):
        decoded[path] = [
            {
                'box': [round(coord, 2) for coord in box.tolist()],
                'score': round(score.item(), 2),
                'cap': _decode(cap.tolist()),
            }
            for box, cap, score in zip(output['boxes'], output['caps'], output['scores'])
        ]

    return decoded

# zip() inside postprocess_results stops at the shorter input, so only the
# first len(all_results) paths (the ones actually run through the model) are
# decoded even though the full img_paths list is passed.
decoded_results = postprocess_results(all_results, img_paths, idx_to_token)

In [5]:
from typing import Dict


# Minimum weighted similarity a caption needs in filter_results() to be kept.
SCORE_THRESHOLD = 0.6

# Reference sentences per viewpoint label. Despite the name these are raw
# strings; filter_results() encodes them before comparing. Each list has the
# same layout: a generic sentence, a front/back sentence, a left/right
# sentence. filter_results() weights them by position.
GT_EMBEDDINGS = {
    "FRONT_RIGHT": ["A car.", "The front of the car.", "The right side of the car."],
    "FRONT_LEFT": ["A car.", "The front of the car.", "The left side of the car."],
    "BACK_RIGHT": ["A car.", "The back of the car.", "The right side of the car."],
    "BACK_LEFT": ["A car.", "The back of the car.", "The left side of the car."],
}

def get_gt_embeddings(embedding_map: Dict[str, torch.Tensor], img_path: str) -> torch.Tensor:
    """Pick the reference embeddings for the viewpoint named in an image path.

    The viewpoint is detected by substring match on the whole path, checked in
    the order FRONT_RIGHT, FRONT_LEFT, BACK_RIGHT, BACK_LEFT.

    Args:
        embedding_map: Reference embeddings keyed by viewpoint label.
        img_path: Image path expected to contain one of the viewpoint labels.

    Returns:
        ``embedding_map`` entry of the first viewpoint label found in the path.

    Raises:
        ValueError: If the path contains none of the viewpoint labels.
        KeyError: If the matched label is missing from ``embedding_map``.
    """
    viewpoints = ("FRONT_RIGHT", "FRONT_LEFT", "BACK_RIGHT", "BACK_LEFT")

    for viewpoint in viewpoints:
        if viewpoint in img_path:
            return embedding_map[viewpoint]

    # ValueError is still caught by callers using `except Exception`, but now
    # tells the reader which path was the problem.
    raise ValueError(
        "cannot infer viewpoint from image path {!r}; expected one of {}".format(
            img_path, ", ".join(viewpoints)))


def filter_results(result_dict: dict):
    """Keep only regions whose caption is similar to the viewpoint reference sentences.

    Each caption is embedded and compared by cosine similarity with the three
    reference sentences of the image's viewpoint (chosen from the image path).
    The three similarities are combined into one weighted score, and regions
    scoring below ``SCORE_THRESHOLD`` are dropped.

    Args:
        result_dict: Maps image path to a list of region dicts; each region
            must carry its caption text under ``'cap'``.

    Returns:
        Dict mapping image path to its accepted regions, ordered by ascending
        weighted score. Images with no accepted region (including images that
        had no regions at all) are omitted.

    Raises:
        Exception: Propagated from ``get_gt_embeddings`` when an image path
            names no known viewpoint.
    """
    # https://www.sbert.net/
    sbert = SentenceTransformer('all-MiniLM-L6-v2')
    embedding_map = {
        k: sbert.encode(v, convert_to_tensor=True) for k, v in GT_EMBEDDINGS.items()
    }
    # One weight per reference sentence, in GT_EMBEDDINGS order: the generic
    # "A car." sentence counts fully, the two viewpoint sentences 0.25 each.
    sentence_weights = np.array([1, 0.25, 0.25])

    filtered_dict = {}

    for img, results in result_dict.items():
        # Resolved before the empty check so an unrecognised path still raises.
        gt_embeddings = get_gt_embeddings(embedding_map, img)

        # Nothing to score; also avoids handing an empty batch to the encoder
        # and to cos_sim.
        if not results:
            continue

        captions = [region['cap'] for region in results]

        cap_embedding: torch.Tensor = sbert.encode(captions, convert_to_tensor=True)
        # (reference sentences x captions) similarities, collapsed to one
        # weighted score per caption.
        similarities = util.cos_sim(gt_embeddings, cap_embedding).cpu().numpy()
        cosine_scores = sentence_weights.dot(similarities)

        accepted_regions = [
            results[idx]
            for idx in np.argsort(cosine_scores)
            if not cosine_scores[idx] < SCORE_THRESHOLD
        ]

        if accepted_regions:
            filtered_dict[img] = accepted_regions

    return filtered_dict

# Keep only regions whose caption scores at least SCORE_THRESHOLD against the
# viewpoint reference sentences; images left with no region are dropped.
filtered_results = filter_results(decoded_results)

In [6]:
from copy import deepcopy
import string


def words_preprocess(phrase):
    """Normalise a phrase into a list of lowercase, punctuation-free words.

    A handful of non-ASCII characters are first mapped to ASCII stand-ins (or
    removed), then the text is lowercased, stripped of every character in
    ``string.punctuation`` and split on whitespace. The em dash becomes a
    hyphen, which the punctuation step then removes.
    """
    # Single-pass equivalent of replacing each character in turn: none of the
    # replacement strings contains a character that is itself a key.
    ascii_standins = str.maketrans({
        '½': 'half',
        '—': '-',
        '™': '',
        '¢': 'cent',
        'ç': 'c',
        'û': 'u',
        'é': 'e',
        '°': ' degree',
        'è': 'e',
        '…': '',
    })
    drop_punctuation = str.maketrans('', '', string.punctuation)

    normalised = phrase.translate(ascii_standins)
    lowered = str(normalised).lower()
    return lowered.translate(drop_punctuation).split()


def encode_caption(tokens, token_to_idx, max_token_length=15):
    """Encode a token list as a fixed-length index array ``<bos> tokens <eos> <pad>...``.

    Args:
        tokens: Sequence of token strings. Anything beyond
            ``max_token_length`` tokens is truncated so the result always fits
            the fixed-size array (previously this raised ``IndexError``).
        token_to_idx: Vocabulary lookup. It must contain ``'<pad>'``,
            ``'<bos>'`` and ``'<eos>'``, plus ``'<unk>'`` if any token is
            outside the vocabulary.
        max_token_length: Maximum number of caption tokens kept.

    Returns:
        ``np.int64`` array of length ``max_token_length + 2``.
    """
    tokens = list(tokens)[:max_token_length]

    encoded = np.ones(max_token_length+2, dtype=np.int64) * token_to_idx['<pad>']
    encoded[0] = token_to_idx['<bos>']

    for position, token in enumerate(tokens, start=1):
        # Out-of-vocabulary tokens fall back to <unk>; it is looked up lazily
        # so a vocabulary without it still works for known tokens.
        if token in token_to_idx:
            encoded[position] = token_to_idx[token]
        else:
            encoded[position] = token_to_idx['<unk>']

    encoded[len(tokens)+1] = token_to_idx['<eos>']

    return encoded


# Re-encode every surviving caption from text back into a fixed-length array
# of token indices, keeping the image -> regions layout of filtered_results.
encoded_results = {}
for img in filtered_results:
    encoded_results[img] = []
    for region in filtered_results[img]:
        # Work on a copy so filtered_results keeps its text captions.
        region = deepcopy(region)
        caption = words_preprocess(region['cap'])        
        region['cap'] = encode_caption(caption, token_to_idx)
        encoded_results[img].append(region)

# Persist the encoded regions; later cells read this file back.
with open("filtered_car_data.pkl", "wb") as file:
    pickle.dump(encoded_results, file)

In [40]:
# Reload the encoded regions from disk, replacing any in-memory encoded_results.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("filtered_car_data.pkl", "rb") as file:
    encoded_results = pickle.load(file)

dict_keys(['../3-dance/data/car_images_model_sort/3/FRONT_LEFT/15_-0.572_0.347_-1.092_1687447750165.png', '../3-dance/data/car_images_model_sort/3/FRONT_LEFT/7_-0.993_0.230_-0.020_1687447750032.png', '../3-dance/data/car_images_model_sort/3/FRONT_LEFT/14_-0.508_0.224_-0.582_1687447750149.png', '../3-dance/data/car_images_model_sort/3/FRONT_LEFT/12_-0.403_0.062_-1.256_1687447750116.png'])


In [9]:
from dataset import DenseCapDataset
from utils.filtered_car_data_loader import FilteredCarClassImageDataset
from torch.utils.data import DataLoader

# Training hyper-parameters.
# NOTE(review): this rebinds the global BATCH_SIZE that describe_images() reads
# at call time, so re-running inference after this cell uses 4 instead of 1.
# The recorded run of this cell ended in a CUDA out-of-memory error on a
# 5.93 GiB GPU with this batch size; lower it if the error recurs.
BATCH_SIZE = 4
CAP_LR = 1e-3
LR = 1e-4
WEIGHT_DECAY = 0
NUM_EPOCHS = 10


# Start fine-tuning from a freshly loaded checkpoint rather than from whatever
# `model` currently refers to in the kernel.
params_path = Path("compute_model_params")
model_name = "without_aux"
model = load_model(
    params_path / model_name / "config.json",
    params_path / (model_name + ".pth.tar"),
    return_features=False)

dataset = FilteredCarClassImageDataset("filtered_car_data.pkl")
# NOTE(review): batches are drawn in dataset order (no shuffle) - confirm that
# this is intended for fine-tuning.
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=DenseCapDataset.collate_fn)

model.to(device)

# Two parameter groups: the caption head (box_describer) trains with CAP_LR,
# every other trainable parameter with the default LR.
base_params = [para for name, para in model.named_parameters()
               if para.requires_grad and 'box_describer' not in name]
caption_params = [para for para in model.roi_heads.box_describer.parameters()
                  if para.requires_grad]
optimizer = torch.optim.Adam(
    [{'params': base_params},
     {'params': caption_params, 'lr': CAP_LR}],
    lr=LR, weight_decay=WEIGHT_DECAY)

# Nothing inside the loop leaves train mode, so switching once is enough.
model.train()

for epoch in range(NUM_EPOCHS):
    for img, targets in data_loader:
        img = [img_tensor.to(device) for img_tensor in img]
        targets = [{k: v.to(device) for k, v in target.items()} for target in targets]

        # Release the previous step's gradients *before* the forward pass so
        # they do not sit in GPU memory next to the new activations.
        optimizer.zero_grad(set_to_none=True)

        losses = model(img, targets)

        detect_loss = losses['loss_objectness'] + losses['loss_rpn_box_reg'] + \
                        losses['loss_classifier'] + losses['loss_box_reg']
        caption_loss = losses['loss_caption']

        # Detection and caption losses are weighted equally.
        total_loss = 1.0 * detect_loss + 1.0 * caption_loss

        total_loss.backward()
        optimizer.step()

tensor(2.0050, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.7942, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.9281, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.5365, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.5921, device='cuda:0', grad_fn=<AddBackward0>)
tensor(1.5513, device='cuda:0', grad_fn=<AddBackward0>)


OutOfMemoryError: CUDA out of memory. Tried to allocate 130.00 MiB (GPU 0; 5.93 GiB total capacity; 4.31 GiB already allocated; 135.00 MiB free; 4.44 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF