In [1]:
import os
import configparser
from tqdm import tqdm
import json

In [2]:
cp = configparser.ConfigParser()
cp.read('config.ini')
processed_data_path = cp['DATA_PATH']['processed_data_path']
lsc2021_final_metadata = os.path.join(processed_data_path, 'lsc2021-metadata.json')

In [3]:
lsc2021_metadata = json.load(open(lsc2021_final_metadata, 'r'))

In [18]:
def generate_dictionary_by_fields(fields, output_file_path):
    dictionary = []
    for annotation in tqdm(lsc2021_metadata):
        for field_name in fields:
            data = annotation[field_name]
            if type(data) == int: 
                dictionary.append((data, field_name))
            elif type(data) == str:
                if len(data) > 0:
                    dictionary.append((data, field_name))
            elif type(data) == list:
                if len(data) > 0:
                    dictionary += [(item, field_name) for item in data]
        dictionary += [(annotation[field_name], field_name) for field_name in fields if type(annotation[field_name]) != int and len(annotation[field_name]) > 0]
    print(dictionary[:10])
    dictionary = sorted(list(set(dictionary)), key = lambda x: (x[1], x[0]))
    with open(output_file_path, 'w') as f:
        for term in dictionary:
            print(f'{term[0]} --> {term[1]}', file = f)

---

# Generate date time dictionary

In [19]:
time_dictionary_path = os.path.join(os.getcwd(), 'dictionaries', 'time_dictionary.txt')
fields = ['date', 'local_time', 'day_of_week', 'month', 'year', 'part_of_day']
generate_dictionary_by_fields(fields, time_dictionary_path)

100%|██████████| 183261/183261 [00:00<00:00, 539844.99it/s]


[('2015-02-25', 'date'), ('05:54', 'local_time'), ('wednesday', 'day_of_week'), ('february', 'month'), ('early morning', 'part_of_day'), ('2015-02-25', 'date'), ('05:55', 'local_time'), ('wednesday', 'day_of_week'), ('february', 'month'), ('early morning', 'part_of_day')]


---

# Generate location dictionary

In [20]:
location_dictionary_path = os.path.join(os.getcwd(), 'dictionaries', 'location_dictionary.txt')
fields = ['location_name', 'location_type', 'place_category']
generate_dictionary_by_fields(fields, location_dictionary_path)

100%|██████████| 183261/183261 [00:00<00:00, 731713.96it/s]

[('home', 'location_name'), ('home', 'location_type'), ('home', 'location_name'), ('home', 'location_type'), ('home', 'location_name'), ('home', 'location_type'), (['jail cell'], 'place_category'), ('home', 'location_name'), ('home', 'location_type'), (['sky'], 'place_category')]





TypeError: unhashable type: 'list'

---

# Generate concept dictionary

In [None]:
concept_dictionary_path = os.path.join(os.getcwd(), 'dictionaries', 'concept_dictionary.txt')
fields = ['microsoft_tags', 'yolo_concept', 'visual_genome', 'place_category', 'activity_type', 'activity_name']
generate_dictionary_by_fields(fields, concept_dictionary_path)