# Imports

In [1]:
import numpy as np

In [2]:
import os
import os.path as osp
from glob import glob

In [3]:
from PIL import Image
from copy import deepcopy
from sacred import Experiment

In [4]:
from utils import pickle_load, pickle_save, json_save, ReadSolution

In [5]:
from copy import deepcopy
from functools import partial
from pprint import pprint
import os.path as osp

In [6]:
def extract_resolution(data_dir, records, gnd=None, split_char=','):
    """Append the width and height of every image to its record line.

    Args:
        data_dir: Directory that the image names in ``records`` are relative to.
        records: Sequence of strings of the form ``name<split_char>label``.
        gnd: Optional ground-truth dict. When given, the size of record ``i`` is
            derived from ``gnd['gnd'][i]['bbx']`` (indices 0/2 give the width,
            1/3 the height, both corners counted) and no image file is opened.
        split_char: Separator used to parse each record and to build the output.

    Returns:
        A list of strings ``name, label, width, height`` joined by
        ``split_char``. A record whose image raises a ``Warning`` while being
        opened is reported on stdout and left out, because no reliable size is
        available for it.

    Raises:
        ValueError: If a record does not contain ``split_char``.
    """
    outs = []
    for i, entry in enumerate(records):
        # Split on the LAST separator only so that a separator occurring inside
        # the file name does not break the (name, label) unpacking.
        name, label = entry.rsplit(split_char, 1)
        size = None
        if gnd is not None:
            bbx = gnd['gnd'][i]['bbx']
            size = (int(bbx[2] - bbx[0] + 1), int(bbx[3] - bbx[1] + 1))
        else:
            path = osp.join(data_dir, name)
            try:
                # The context manager closes the file handle once the size is read.
                with Image.open(path) as img:
                    size = img.size
            except Warning:
                # Previously execution fell through to `img.size`, which used the
                # image of the preceding record (or raised NameError on the first).
                print('corrupted image:', i, name)
        if size is not None:
            width, height = size
            outs.append(split_char.join([name, label, str(width), str(height)]))
        if i % 1000 == 0:
            # Coarse progress indicator.
            print(i)
    return outs

In [7]:
# Sacred experiment that the `@ex8.config` / `@ex8.main` functions further down
# register with, and that `ex8.run()` executes.
ex8 = Experiment('Prepare ViQuAE For Training RRT', interactive=True)

In [8]:
def read_file(filename):
    """Return the content of the text file ``filename`` as a list of lines.

    The whole file is read at once and split with ``str.splitlines``, so the
    returned strings carry no line terminators.
    """
    with open(filename) as handle:
        content = handle.read()
    return content.splitlines()

In [9]:
# Folder name of the dataset and its full location on disk.
# NOTE(review): the root is an absolute, machine-specific path; consider making
# it configurable so the notebook runs elsewhere.
dataset_name = 'viquae_for_rrt'
data_dir = osp.join('/mnt/beegfs/home/smessoud/RerankingTransformer/models/research/delf/delf/python/delg/data', dataset_name)

In [10]:
# feature_name is not referenced in the visible part of this notebook.
feature_name = 'r50_gldv1'
# Name of the split; it is embedded in the ground-truth file names below.
set_name = 'tuto'
# Ground-truth pickle used for training, and the original one it is checked against.
gnd_name = 'training_gnd_'+ set_name+'.pkl'
origin_gnd_name = 'gnd_'+ set_name+'.pkl'

In [11]:
# Load the training ground truth; later cells read its 'qimlist', 'simlist' and 'gnd' entries.
gnd_data = pickle_load(osp.join(data_dir, gnd_name))

In [12]:
# Sanity check: shape of the candidate lists (one row per query).
np.array(gnd_data['simlist']).shape #, s_categories.reshape(np.array(gnd_data['simlist']).shape).shape

(1071, 100)

In [13]:
def map_nnids_labels(data_dir, gnd_file, s_categories):
    """Build, for every query, a dict mapping candidate position -> category.

    Args:
        data_dir: Not used by this function.
        gnd_file: Path handed to ``pickle_load``; the loaded object must expose
            the per-query candidate lists under ``'simlist'``.
        s_categories: Array of candidate categories; it is reshaped to the
            shape of ``np.array(gnd['simlist'])``.

    Returns:
        A list with one dict per query; dict ``i`` maps each position ``k`` of
        query ``i``'s candidate list to ``s_categories[i][k]`` after reshaping.
    """
    candidates = pickle_load(gnd_file)['simlist']
    labels = s_categories.reshape(np.array(candidates).shape)

    per_query = []
    for row in range(len(candidates)):
        positions = range(len(candidates[row]))
        per_query.append({col: labels[row][col] for col in positions})
    return per_query

In [14]:
# Load the original ground truth and count how many queries an expansion would
# produce: 1 for a query whose 'junk' list has 100 entries, otherwise 1 plus the
# number of its 'hard' candidates.
origin_gnd_data = pickle_load(osp.join(data_dir, origin_gnd_name))
origin_gnd_gnd = origin_gnd_data['gnd']
checksum = np.sum([1 if (len(origin_gnd_gnd[i]['junk']) == 100) else len(origin_gnd_gnd[i]['hard'])+1 for i in range(len(origin_gnd_gnd))])
checksum

1071

In [15]:
# Number of queries that have 100 'junk' entries or no 'hard' candidate at all.
np.sum([1 if (len(origin_gnd_gnd[i]['junk']) == 100) or (len(origin_gnd_gnd[i]['hard']) == 0)  else 0 for i in range(len(origin_gnd_gnd))])

24

In [37]:
# NOTE(review): this cell (In [37]) needs `s_categories`, which is only created
# by the load_viquae_rrt_training call in a cell further down (In [29]); it
# fails on a fresh kernel when the notebook is run top to bottom.
selection_ids_to_cat_dict = map_nnids_labels(data_dir, osp.join(data_dir, gnd_name), s_categories)
np.array(selection_ids_to_cat_dict).shape

(1071,)

In [38]:
# NOTE(review): these are two separate one-element tuples; the first line's value
# is discarded and only the second is displayed. If both were meant to be shown
# together, wrap them in a single pair of parentheses.
gnd_data['gnd'][1]['hard'], 
selection_ids_to_cat_dict[1][16], 

(1,)

In [17]:
import random
# Seed Python's global RNG; load_viquae_rrt_training re-seeds it with the same value.
random.seed(42)

In [18]:
# Preview of one output line: image path under 'jpg', then ';;', then the label.
';;'.join([osp.join('jpg', gnd_data['simlist'][0][0]), str(0)])

'jpg/512px-Naples_Players_Performance_of_"42nd_Street"_in_Blackburn_Hall,_July_2011..jpg;;0'

In [19]:
# Top-level entries of the ground-truth dict.
gnd_data.keys()

dict_keys(['imlist', 'qimlist', 'simlist', 'gnd'])

In [20]:
# Number of queries in the ground truth.
len(gnd_data['qimlist'])

1071

In [21]:
def prepare_gnd_for_rrt_training(data_dir, gnd_file):
    """Expand a ground truth by promoting every 'hard' candidate to a query.

    Each original query is copied over unchanged. Unless its 'junk' list has
    100 entries, every candidate position listed in its 'hard' entry yields one
    extra query: the candidate image becomes the query, while the candidate list
    and the ground-truth entry of the originating query are reused.

    Args:
        data_dir: Not used by this function.
        gnd_file: Path handed to ``pickle_load``; the loaded dict must provide
            'qimlist', 'imlist', 'simlist' and 'gnd'.

    Returns:
        A tuple ``(new_gnd, count, categories)``: the expanded ground-truth dict
        (same four keys), the number of queries in it, and a list giving, for
        every new query, the index of the original query it stems from.
    """
    # NOTE(review): pickle_load presumably unpickles the file; only use trusted files.
    gnd = pickle_load(gnd_file)
    query_names = gnd['qimlist']
    selection_gallery = gnd['simlist']
    gnd_gnd = gnd['gnd']

    new_gnd = {
        'imlist': gnd['imlist'],
        'qimlist': [],
        'simlist': [],
        'gnd': [],
    }
    categories = []

    for i in range(len(query_names)):
        new_gnd['qimlist'].append(query_names[i])
        new_gnd['simlist'].append(selection_gallery[i])
        new_gnd['gnd'].append(gnd_gnd[i])
        categories.append(i)

        # Use the ground truth loaded above, not the notebook-global `gnd_data`,
        # so the result depends only on `gnd_file`.
        # NOTE(review): 100 presumably equals the candidate-list length, i.e.
        # "every candidate is junk" - confirm.
        if len(gnd_gnd[i]['junk']) == 100:
            continue

        for j in gnd_gnd[i]['hard']:
            new_gnd['qimlist'].append(selection_gallery[i][j])
            categories.append(i)
            # The candidate list is shared as-is with the originating query.
            # NOTE(review): the previous code also re-assigned slot j with the
            # value it already held (a no-op); if a swap with the original query
            # was intended, it has to be implemented explicitly on a copy.
            new_gnd['simlist'].append(selection_gallery[i])
            new_gnd['gnd'].append(gnd_gnd[i])

    count = len(new_gnd['qimlist'])
    return new_gnd, count, categories

In [22]:
# Expand the training ground truth; `categories` is reused by later cells.
new_gnd, count, categories = prepare_gnd_for_rrt_training(data_dir, osp.join(data_dir, gnd_name))

In [23]:
# Sanity check: the returned count must equal the number of expanded queries.
len(new_gnd['qimlist']), count

(28939, 28939)

In [24]:
# NOTE(review): repeats the gnd_name assignment made earlier with the same value.
gnd_name = 'training_gnd_'+ set_name+'.pkl'

In [25]:
#pickle_save(osp.join(data_dir, gnd_name), new_gnd)

In [26]:
# Total number of candidate slots: 100 per query.
len(gnd_data['qimlist'])*100

107100

In [27]:
# Check that enough distinct labels can be drawn from the range used in
# load_viquae_rrt_training. Note that this call advances the global RNG state.
len(random.sample(range(len(gnd_data['qimlist']), 5000000), len(gnd_data['qimlist'])*100))

107100

In [28]:
def load_viquae_rrt_training(data_dir, gnd_file, categories):
    """Build the ``path;;label`` lines for every query and its 100 candidates.

    For each query one line is emitted for the query itself, followed by one
    line per candidate position 0..99. A candidate listed in the query's 'hard'
    entry gets the query's label; every other candidate gets its own label drawn
    without replacement from ``range(len(queries), 5000000)``.

    Args:
        data_dir: Not used by this function.
        gnd_file: Path handed to ``pickle_load``; the loaded dict must provide
            'qimlist', 'simlist' and 'gnd'.
        categories: Indexable giving the label of query ``i`` at position ``i``.

    Returns:
        A tuple ``(outs, q_categories, s_categories)``: the list of lines, the
        array of query labels and the flat array of candidate labels, in the
        order the lines were emitted.

    Note:
        The global ``random`` generator is re-seeded with 42 on every call.
    """
    prefix = 'jpg'
    # NOTE(review): pickle_load presumably unpickles the file; only use trusted files.
    gnd = pickle_load(gnd_file)
    query_names = gnd['qimlist']
    selection_gallery = gnd['simlist']

    random.seed(42)
    # Labels for non-matching candidates. They start at len(query_names),
    # presumably so they cannot collide with query labels - confirm.
    # Consumed through an iterator instead of list.pop(0), which is O(n) per call.
    filler_labels = iter(random.sample(range(len(query_names), 5000000), len(query_names) * 100))

    outs = []
    q_categories = []
    s_categories = []
    for i in range(len(query_names)):
        q_cat = categories[i]
        q_categories.append(q_cat)
        outs.append(';;'.join([osp.join(prefix, query_names[i]), str(q_cat)]))
        # Build the membership set once per query rather than scanning per candidate.
        hard = set(gnd['gnd'][i]['hard'])
        for j in range(100):
            # A hard candidate shares the query's label. The returned array now
            # records that same label (it used to record the loop index `i`,
            # which disagreed with the emitted line whenever categories[i] != i).
            cat = q_cat if j in hard else next(filler_labels)
            s_categories.append(cat)
            outs.append(';;'.join([osp.join(prefix, selection_gallery[i][j]), str(cat)]))

    return outs, np.stack(q_categories, axis=0), np.stack(s_categories, axis=0)

In [29]:
# Build the training lines and label arrays from the training ground truth,
# using the `categories` list returned by prepare_gnd_for_rrt_training.
train, q_categories, s_categories = load_viquae_rrt_training(data_dir, osp.join(data_dir, gnd_name), categories)

In [30]:
# Sanity check of the sizes.
# NOTE(review): 125700+1257 does not match the recorded len(train)
# (108171 = 1071*100 + 1071); it looks like a leftover from another split - confirm.
q_categories.shape, s_categories.shape, len(train), 125700+1257

((1071,), (107100,), 108171, 126957)

In [31]:
# dtype of the candidate labels before they are written to disk.
s_categories.dtype

dtype('int64')

In [32]:
# Persist the candidate labels as integers in a text file.
# NOTE(review): the 'tuto' prefix is hard-coded here instead of using set_name.
np.savetxt(data_dir+'/tuto_s_categories.txt', s_categories, fmt='%i')

In [33]:
# Read the saved labels back and check that the dtype survives the round trip.
ss_categories = np.loadtxt(data_dir+'/tuto_s_categories.txt', dtype='int64')
ss_categories.dtype

dtype('int64')

In [34]:
# Spot check for candidate j of the first query: train[1:101] are the lines that
# follow the first query line, so the three values should describe the same image.
j = 44
train[1:101][j], gnd_data['simlist'][0][j], s_categories[j]

('jpg/512px-Becky_Gulsvig.jpg;;1048188', '512px-Becky_Gulsvig.jpg', 1048188)

In [35]:
# Configuration of the `ex8` experiment. The names assigned here match the
# parameters of generate_train_viquae; presumably Sacred injects them by name - confirm.
@ex8.config
def config():
    # NOTE(review): absolute, machine-specific root folder.
    data = '/mnt/beegfs/home/smessoud/RerankingTransformer/models/research/delf/delf/python/delg/data'
    data_dir   = osp.join(data, 'viquae_for_rrt')
    #data_dir   = osp.join('data', 'viquae_for_rrt')
    # File names below are joined onto data_dir by generate_train_viquae.
    train_file  = 'tuto.txt'
    gnd_file   = 'gnd_tuto.pkl'
    # When True, width and height are appended to every line via extract_resolution.
    require_resolution = True

@ex8.main
def generate_train_viquae(data_dir, train_file, gnd_file, require_resolution):
    """Write the training list of the ViQuAE split to ``train_file``.

    Args:
        data_dir: Folder that ``train_file`` and ``gnd_file`` are relative to.
        train_file: Name of the text file to write, one line per image.
        gnd_file: Name of the ground-truth pickle to build the lines from.
        require_resolution: When true, each line is extended with the image
            width and height by ``extract_resolution`` (this opens every image).
    """
    train_file = osp.join(data_dir, train_file)
    gnd_file = osp.join(data_dir, gnd_file)
    # NOTE(review): `categories` is not a parameter; it is read from the
    # notebook's global namespace and must have been created by an earlier cell.
    train, _, _ = load_viquae_rrt_training(data_dir, gnd_file, categories)
    # The ground truth is already loaded inside load_viquae_rrt_training; the
    # second, unused pickle_load of the same file was removed.

    if require_resolution:
        train = extract_resolution(data_dir, train, split_char=';;')

    with open(train_file, 'w') as f:
        f.write('\n'.join(train))

In [36]:
# Run the experiment's main function (generate_train_viquae) with the config above.
# NOTE(review): the recorded output ends in a KeyboardInterrupt, so this run did not finish.
ex8.run()

INFO - Prepare ViQuAE For Training RRT - Running command 'generate_train_viquae'
INFO - Prepare ViQuAE For Training RRT - Started


0




KeyboardInterrupt: 

In [39]:
import sacred
import torch
import torch.nn as nn
from sacred import SETTINGS
from sacred.utils import apply_backspaces_and_linefeeds
from torch.backends import cudnn

In [40]:
from torch.utils.data import DataLoader, RandomSampler, BatchSampler
from typing import NamedTuple, Optional

In [41]:
from models.matcher import MatchERT
from sacred import Ingredient

In [42]:
from models.ingredient import model_ingredient, get_model
from utils.data.dataset_ingredient import data_ingredient, get_loaders
from utils.data.dataset import FeatureDataset
# from utils.training import evaluate_time as evaluate
from utils.training import evaluate
# Second Sacred experiment, for evaluation, built on the data and model ingredients.
ex = sacred.Experiment('RRT Evaluation', ingredients=[data_ingredient, model_ingredient], interactive=True)



In [43]:
# Evaluation switches; none of them is read in the visible part of the notebook.
# NOTE(review): "aqe" presumably stands for a query-expansion step with
# parameters k and alpha - confirm against the evaluation code.
use_aqe = False
aqe_params = {'k': 2, 'alpha': 0.3}

save_nn_inds = True

In [44]:
# Filter backspaces and linefeeds
SETTINGS.CAPTURE_MODE = 'sys'
ex.captured_out_filter = apply_backspaces_and_linefeeds

In [45]:
# Run settings for the evaluation part.
cpu = False  # Force training on CPU
cudnn_flag = 'benchmark'
temp_dir = osp.join('logs', 'temp')
# NOTE(review): the None assignment is immediately overwritten by the absolute,
# machine-specific checkpoint path on the next line.
resume = None
resume = '/mnt/beegfs/home/smessoud/RerankingTransformer/RRT_GLD/rrt_gld_ckpts/r50_gldv1.pt'
seed = 0

In [46]:
# Use the first CUDA device when one is available and `cpu` is False, else the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() and not cpu else 'cpu')
# Seed torch's RNG; the call's return value (the generator) is what the cell displays.
torch.manual_seed(seed)

<torch._C.Generator at 0x7f098356a550>

In [47]:
# Show the ground-truth keys and the first query's candidate list as an array.
gnd_data.keys(), np.stack(gnd_data['simlist'], axis=0)[0]

(dict_keys(['imlist', 'qimlist', 'simlist', 'gnd']),
 array(['512px-Naples_Players_Performance_of_"42nd_Street"_in_Blackburn_Hall,_July_2011..jpg',
        '512px-MatildaNightShubertsign.JPG',
        '512px-MatildaNightShubertsign.JPG', '512px-Chicago3_(MdB).jpg',
        '512px-Chicago3_(MdB).jpg', '512px-Helen_Westley.jpg',
        '512px-Laurie_Beechman_1984.jpg', '512px-Laurie_Beechman_1984.jpg',
        '512px-Ernani_Libretto_1859.jpg', '512px-JLT2017.jpg',
        '512px-Andrea_McArdle_Reid_Shelton_Annie_1977.JPG',
        '512px-Hairspray_the_Musical_2012_Aberystwyth_Arts_Centre_Summer_Season_production_poster.png',
        '512px-Hairspray_the_Musical_2012_Aberystwyth_Arts_Centre_Summer_Season_production_poster.png',
        '512px-Hairspray_the_Musical_2012_Aberystwyth_Arts_Centre_Summer_Season_production_poster.png',
        '512px-Hairspray_the_Musical_2012_Aberystwyth_Arts_Centre_Summer_Season_production_poster.png',
        '512px-Hairspray_the_Musical_2012_Aberystwyth_Ar

In [48]:
# Shape of the stacked candidate lists (queries x candidates).
np.stack(gnd_data['simlist'], axis=0).shape

(1071, 100)