## Imports

In [1]:
import json
import numpy as np
from datasets import load_from_disk, set_caching_enabled
set_caching_enabled(False)

# os.getcwd() = ~/ViQuAE
## Loading Data

In [2]:
dataset = load_from_disk("data/viquae_dataset")

In [3]:
kb = load_from_disk('data/viquae_passages/')

In [4]:
wiki = load_from_disk('data/viquae_wikipedia')

In [5]:
train_set, dev_set, test_set = dataset['train'], dataset['validation'], dataset['test']

In [6]:
humans_with_faces, humans_without_faces, non_humans = wiki['humans_with_faces'], wiki['humans_without_faces'], wiki['non_humans']

## Article to Passage Mapping

In [7]:
n_h_article2passage = None
h_w_f_article2passage = None
h_wo_f_article2passage = None

In [8]:
# Load the article -> passages mapping of the non_humans split.
# The context manager closes the file even if json.load raises.
with open('data/viquae_wikipedia/non_humans/article2passage.json', encoding='utf-8') as f:
    n_h_article2passage = json.load(f)

In [9]:
# Invert article -> [passages] into passage -> [articles] for the non_humans split.
inv_n_h_article2passage = {}
for article_key, passage_ids in n_h_article2passage.items():
    for passage_id in passage_ids:
        if passage_id not in inv_n_h_article2passage:
            inv_n_h_article2passage[passage_id] = []
        inv_n_h_article2passage[passage_id].append(article_key)

# Number of distinct passages referenced by this split.
len_n_h = len(inv_n_h_article2passage)
len_n_h

7175529

In [10]:
# Load the article -> passages mapping of the humans_without_faces split.
# The context manager closes the file even if json.load raises.
with open('data/viquae_wikipedia/humans_without_faces/article2passage.json', encoding='utf-8') as f:
    h_wo_f_article2passage = json.load(f)

In [11]:
# Invert article -> [passages] into passage -> [articles] for the humans_without_faces split.
inv_h_wo_f_article2passage = {}
for article_key, passage_ids in h_wo_f_article2passage.items():
    for passage_id in passage_ids:
        if passage_id not in inv_h_wo_f_article2passage:
            inv_h_wo_f_article2passage[passage_id] = []
        inv_h_wo_f_article2passage[passage_id].append(article_key)

# Number of distinct passages referenced by this split.
len_h_wo_f = len(inv_h_wo_f_article2passage)
len_h_wo_f

298698

In [12]:
# Load the article -> passages mapping of the humans_with_faces split.
# The context manager closes the file even if json.load raises.
with open('data/viquae_wikipedia/humans_with_faces/article2passage.json', encoding='utf-8') as f:
    h_w_f_article2passage = json.load(f)

In [13]:
# Invert article -> [passages] into passage -> [articles] for the humans_with_faces split.
inv_h_w_f_article2passage = {}
for article_key, passage_ids in h_w_f_article2passage.items():
    for passage_id in passage_ids:
        if passage_id not in inv_h_w_f_article2passage:
            inv_h_w_f_article2passage[passage_id] = []
        inv_h_w_f_article2passage[passage_id].append(article_key)

# Number of distinct passages referenced by this split.
len_h_w_f = len(inv_h_w_f_article2passage)
len_h_w_f

4411741

In [14]:
len_n_h + len_h_w_f + len_h_wo_f

11885968

## Some exploration

In [15]:
item = train_set[0]

In [16]:
item['id']

'a79b04bb9ea4c1e17edf31d544fd17bd'

In [17]:
item['image']

'512px-John_R._Neill_-_Les_Misérables_-_Cosette_in_front_of_the_doll_shop.jpg'

In [18]:
len(item['search_indices']), len(item['search_irrelevant_indices'])

(100, 100)

In [19]:
# Relevant passages = retrieved passages that are not flagged as irrelevant.
# Going through a set drops duplicates and does not keep the retrieval order.
item_relevant_indices = list(set(item['search_indices']) - set(item['search_irrelevant_indices']))
item_irrelevant_indices = item['search_irrelevant_indices']

In [20]:
item_irrelevant_indices[0]

11121680

In [21]:
len_h_w_f, len_h_wo_f, len_n_h, (len_h_w_f + len_h_wo_f + len_n_h)

(4411741, 298698, 7175529, 11885968)

In [22]:
# Label every passage index with the name of the wiki split it belongs to.
# dict.fromkeys assigns the same label to every key without first building a
# label list as long as the key set, and does not rely on the len_* counters
# being in sync with the inverted mappings.
h_w_f_passage_split = dict.fromkeys(inv_h_w_f_article2passage, 'humans_with_faces')
h_wo_f_passage_split = dict.fromkeys(inv_h_wo_f_article2passage, 'humans_without_faces')
n_h_passage_split = dict.fromkeys(inv_n_h_article2passage, 'non_humans')

# Merge the three per-split maps; on a key collision the right-most map would win.
passage_wiki_split = {**h_w_f_passage_split, **h_wo_f_passage_split, **n_h_passage_split}
len(passage_wiki_split)

11885968

In [23]:
inv_article2passage = {**inv_h_w_f_article2passage, **inv_h_wo_f_article2passage, **inv_n_h_article2passage}

len(inv_article2passage)

11885968

In [24]:
int(inv_article2passage[item_irrelevant_indices[0]][0])

883133

In [25]:
#n_h_ids, h_w_f_ids, h_wo_f_ids = non_humans['passage_index'], humans_with_faces['passage_index'], humans_without_faces['passage_index']

In [26]:
inv_article2passage[433]

['0']

In [27]:
# h_wo_f_ids.index('157646')

In [28]:
dataset

DatasetDict({
    train: Dataset({
        features: ['BM25_indices', 'BM25_scores', 'arcface_indices', 'arcface_scores', 'clip-RN50', 'document_arcface_indices', 'document_arcface_scores', 'document_provenance_indices', 'document_resnet_indices', 'document_resnet_scores', 'document_search_indices', 'document_search_scores', 'face', 'face_box', 'face_embedding', 'face_landmarks', 'face_prob', 'id', 'image', 'image_embedding', 'image_hash', 'input', 'keep_clip-RN50', 'keep_face_embedding', 'keep_image_embedding', 'kilt_id', 'meta', 'original_answer_provenance_indices', 'original_question', 'output', 'provenance_indices', 'resnet_indices', 'resnet_scores', 'search_indices', 'search_irrelevant_indices', 'search_provenance_indices', 'search_scores', 'semi-oracle_irrelevant_indices', 'semi-oracle_provenance_indices', 'url', 'wikidata_id'],
        num_rows: 1190
    })
    validation: Dataset({
        features: ['BM25_indices', 'BM25_scores', 'arcface_indices', 'arcface_scores', 'clip-RN50

In [29]:
train_set

Dataset({
    features: ['BM25_indices', 'BM25_scores', 'arcface_indices', 'arcface_scores', 'clip-RN50', 'document_arcface_indices', 'document_arcface_scores', 'document_provenance_indices', 'document_resnet_indices', 'document_resnet_scores', 'document_search_indices', 'document_search_scores', 'face', 'face_box', 'face_embedding', 'face_landmarks', 'face_prob', 'id', 'image', 'image_embedding', 'image_hash', 'input', 'keep_clip-RN50', 'keep_face_embedding', 'keep_image_embedding', 'kilt_id', 'meta', 'original_answer_provenance_indices', 'original_question', 'output', 'provenance_indices', 'resnet_indices', 'resnet_scores', 'search_indices', 'search_irrelevant_indices', 'search_provenance_indices', 'search_scores', 'semi-oracle_irrelevant_indices', 'semi-oracle_provenance_indices', 'url', 'wikidata_id'],
    num_rows: 1190
})

In [30]:
dev_set

Dataset({
    features: ['BM25_indices', 'BM25_scores', 'arcface_indices', 'arcface_scores', 'clip-RN50', 'clip-RN50_indices', 'clip-RN50_scores', 'document_arcface_indices', 'document_arcface_scores', 'document_provenance_indices', 'document_resnet_indices', 'document_resnet_scores', 'document_search_indices', 'document_search_scores', 'face', 'face_box', 'face_embedding', 'face_landmarks', 'face_prob', 'id', 'image', 'image_embedding', 'image_hash', 'input', 'keep_clip-RN50', 'keep_face_embedding', 'keep_image_embedding', 'kilt_id', 'meta', 'original_answer_provenance_indices', 'original_question', 'output', 'provenance_indices', 'resnet_indices', 'resnet_scores', 'search_indices', 'search_irrelevant_indices', 'search_provenance_indices', 'search_scores', 'semi-oracle_irrelevant_indices', 'semi-oracle_provenance_indices', 'url', 'wikidata_id'],
    num_rows: 1250
})

In [31]:
len(dev_set['search_indices'][10]), len(dev_set['search_irrelevant_indices'][10]),

(100, 100)

In [32]:
test_set

Dataset({
    features: ['BM25_indices', 'BM25_scores', 'arcface_indices', 'arcface_scores', 'clip-RN50', 'clip-RN50_indices', 'clip-RN50_scores', 'document_BM25_indices', 'document_BM25_scores', 'document_arcface_indices', 'document_arcface_scores', 'document_provenance_indices', 'document_resnet_indices', 'document_resnet_scores', 'document_search_indices', 'document_search_scores', 'face', 'face_box', 'face_embedding', 'face_landmarks', 'face_prob', 'id', 'image', 'image_embedding', 'image_hash', 'input', 'keep_clip-RN50', 'keep_face_embedding', 'keep_image_embedding', 'kilt_id', 'meta', 'original_answer_provenance_indices', 'original_question', 'output', 'provenance_indices', 'resnet_indices', 'resnet_scores', 'search_indices', 'search_irrelevant_indices', 'search_provenance_indices', 'search_scores', 'semi-oracle_irrelevant_indices', 'semi-oracle_provenance_indices', 'url', 'wikidata_id'],
    num_rows: 1257
})

In [33]:
wiki

DatasetDict({
    non_humans: Dataset({
        features: ['anchors', 'categories', 'clip-RN50', 'document', 'history', 'image', 'image_embedding', 'image_hash', 'kilt_id', 'passage_index', 'text', 'url', 'wikidata_info', 'wikipedia_id', 'wikipedia_title'],
        num_rows: 953379
    })
    humans_with_faces: Dataset({
        features: ['anchors', 'categories', 'clip-RN50', 'document', 'face_box', 'face_embedding', 'face_landmarks', 'face_prob', 'history', 'image', 'image_embedding', 'image_hash', 'keep_face_embedding', 'kilt_id', 'passage_index', 'text', 'url', 'wikidata_info', 'wikipedia_id', 'wikipedia_title'],
        num_rows: 506237
    })
    humans_without_faces: Dataset({
        features: ['anchors', 'categories', 'clip-RN50', 'document', 'face_embedding', 'history', 'image', 'image_embedding', 'image_hash', 'kilt_id', 'passage_index', 'text', 'url', 'wikidata_info', 'wikipedia_id', 'wikipedia_title'],
        num_rows: 35736
    })
})

In [34]:
# For every training question, walk its retrieved passages and record the image
# of the wiki entry each passage belongs to.
# img_list is (re)initialised here; the dev and test cells below append to it.
img_list = []

# Dataset to read from for each split label; any other label falls back to non_humans.
wiki_by_split = {
    'humans_with_faces': humans_with_faces,
    'humans_without_faces': humans_without_faces,
}

for item in train_set:
    for passage in item['search_indices']:
        # The first article key recorded for the passage is used as the row index in its split.
        wiki_id = int(inv_article2passage[passage][0])
        wiki_split = passage_wiki_split[passage]
        wiki_item = wiki_by_split.get(wiki_split, non_humans)[wiki_id]
        img_list.append(wiki_item['image'])

In [35]:
# For every validation question, walk its retrieved passages and append the image
# of the wiki entry each passage belongs to onto the existing img_list.

# Dataset to read from for each split label; any other label falls back to non_humans.
wiki_by_split = {
    'humans_with_faces': humans_with_faces,
    'humans_without_faces': humans_without_faces,
}

for item in dev_set:
    for passage in item['search_indices']:
        # The first article key recorded for the passage is used as the row index in its split.
        wiki_id = int(inv_article2passage[passage][0])
        wiki_split = passage_wiki_split[passage]
        wiki_item = wiki_by_split.get(wiki_split, non_humans)[wiki_id]
        img_list.append(wiki_item['image'])

In [36]:
# For every test question, walk its retrieved passages and append the image
# of the wiki entry each passage belongs to onto the existing img_list.

# Dataset to read from for each split label; any other label falls back to non_humans.
wiki_by_split = {
    'humans_with_faces': humans_with_faces,
    'humans_without_faces': humans_without_faces,
}

for item in test_set:
    for passage in item['search_indices']:
        # The first article key recorded for the passage is used as the row index in its split.
        wiki_id = int(inv_article2passage[passage][0])
        wiki_split = passage_wiki_split[passage]
        wiki_item = wiki_by_split.get(wiki_split, non_humans)[wiki_id]
        img_list.append(wiki_item['image'])

In [37]:
len(img_list), len(list(set(img_list)))

(369700, 96560)

In [None]:
img_list[:5]

In [None]:
entire_dataset_img_list = img_list + train_set['image'] + dev_set['image'] + test_set['image']

In [None]:
# Sanity check: the combined list should hold every gallery image plus one query
# image per question. The expected size is computed rather than hard-coded so it
# stays valid if the splits or the retrieval depth change.
len(entire_dataset_img_list), len(img_list) + len(train_set) + len(dev_set) + len(test_set)

In [None]:
import random

# Draw one distinct six-digit integer per unique image name.
# random.sample draws without replacement; the fixed seed makes the draw repeatable.
random.seed(42)
all_img_ids = random.sample(range(100000, 999999), len(set(entire_dataset_img_list)))

# Total images, number of IDs drawn, number of distinct IDs.
len(entire_dataset_img_list), len(all_img_ids), len(set(all_img_ids))

In [None]:
# Pair each unique image name with one of the pre-drawn IDs.
# Iteration order of a set of strings changes between interpreter sessions
# (string hashes are randomised per process), so the names are sorted first to
# make the name -> ID assignment reproducible across kernel restarts.
# NOTE(review): sorting assumes every image name is a str (no None entries) - confirm.
all_img_ids_dict = dict(zip(sorted(set(entire_dataset_img_list)), all_img_ids))

In [None]:
[all_img_ids_dict[i] for i in img_list[:5]]

In [None]:
# np.savetxt('data/Commons_image_list.txt', list(set(img_list)), fmt="%s") 

In [None]:
# np.savetxt('data/entire_dataset_image_list.txt', list(set(entire_dataset_img_list)), fmt="%s")

In [None]:
# Per-split lookup from query image name to its numeric ID.
# No membership filter is applied, so an image name missing from
# all_img_ids_dict raises KeyError.
test_query_img_ids = {img_name: all_img_ids_dict[img_name] for img_name in test_set['image']}
train_query_img_ids = {img_name: all_img_ids_dict[img_name] for img_name in train_set['image']}
dev_query_img_ids = {img_name: all_img_ids_dict[img_name] for img_name in dev_set['image']}

In [None]:
#import shutil
#src_dir = 'data/Commons'
#dst_dir = 'data/local_Commons'

#for img in list(set(img_list)):
#    shutil.copyfile(src_dir + '/' + img, dst_dir + '/' + img)

In [None]:
#!tar -czvf data/local_Commons.tar.gz data/local_Commons

In [None]:
def prepare_data_for_rrt(test_set, humans_with_faces,
                         humans_without_faces,
                         non_humans,
                         inv_article2passage,
                         passage_wiki_split, 
                         return_img_list=False):
    """Collect the wiki image attached to every retrieved passage of every question.

    Args:
        test_set: iterable of question items; each item must expose
            ``item['search_indices']``, an iterable of passage indices.
        humans_with_faces: integer-indexable collection whose rows expose ``row['image']``.
        humans_without_faces: same contract as ``humans_with_faces``.
        non_humans: same contract; also used for any split label other than
            ``'humans_with_faces'`` / ``'humans_without_faces'``.
        inv_article2passage: mapping from passage index to a sequence of article
            keys; the first key is converted with ``int`` and used as the row index.
        passage_wiki_split: mapping from passage index to the split label.
        return_img_list: when true, return the collected list of images.

    Returns:
        The list of images, in question order then retrieval order, when
        ``return_img_list`` is true; otherwise ``None``.

    Raises:
        KeyError: if a passage index is missing from either mapping.
    """
    # Collection to read from for each split label; other labels fall back to non_humans.
    wiki_by_split = {
        'humans_with_faces': humans_with_faces,
        'humans_without_faces': humans_without_faces,
    }

    img_list = []
    for item in test_set:
        for passage in item['search_indices']:
            wiki_id = int(inv_article2passage[passage][0])
            wiki_rows = wiki_by_split.get(passage_wiki_split[passage], non_humans)
            img_list.append(wiki_rows[wiki_id]['image'])

    # Previously the flag was ignored and the list was discarded on return.
    if return_img_list:
        return img_list
    return None

In [None]:
# Scratch check of zero-padding: widen the decimal text of 123 to five characters.
number_str = "%d" % 123
zero_filled_number = number_str.rjust(5, "0")
zero_filled_number

## Save json files

- n_h_article2passage -> data/viquae_wikipedia/non_humans/article2passage.json
- inv_n_h_article2passage -> SKIP
- h_wo_f_article2passage -> data/viquae_wikipedia/humans_without_faces/article2passage.json
- inv_h_wo_f_article2passage -> SKIP
- h_w_f_article2passage -> data/viquae_wikipedia/humans_with_faces/article2passage.json
- inv_h_w_f_article2passage -> SKIP
- inv_article2passage ( -> data/passage2article.json )
- h_w_f_passage_split -> SKIP
- h_wo_f_passage_split -> SKIP
- n_h_passage_split -> SKIP
- passage_wiki_split ( -> data/passage_wiki_split.json )
- img_ids ( -> data/gallery_img_ids.json )
- all_img_ids_dict ( -> data/all_img_ids.json )
- test_query_img_ids ( -> data/test_query_img_ids.json )
- train_query_img_ids ( -> data/train_query_img_ids.json )
- dev_query_img_ids ( -> data/dev_query_img_ids.json )

In [None]:
#with open("data/passage2article.json", "w") as outfile:
#    json.dump(inv_article2passage, outfile)

In [None]:
#with open("data/passage_wiki_split.json", "w") as outfile:
#    json.dump(passage_wiki_split, outfile)

In [None]:
#with open("data/gallery_img_ids.json", "w") as outfile:
#    json.dump(img_ids, outfile)

In [None]:
#with open("data/all_img_ids.json", "w") as outfile:
#    json.dump(all_img_ids_dict, outfile)

In [None]:
#with open("data/test_query_img_ids.json", "w") as outfile:
#    json.dump(test_query_img_ids, outfile)

In [None]:
#with open("data/train_query_img_ids.json", "w") as outfile:
#    json.dump(train_query_img_ids, outfile)

In [None]:
#with open("data/dev_query_img_ids.json", "w") as outfile:
#    json.dump(dev_query_img_ids, outfile)