In [1]:
import configparser
import os
from collections import Counter
import inflect
import json

In [2]:
# Read the project configuration. ConfigParser.read() silently skips files it
# cannot open and returns the list of files it did parse, so an empty result
# means config.ini was not loaded. Fail here with a clear message instead of
# an opaque KeyError on the section lookup below.
cp = configparser.ConfigParser()
if not cp.read('config.ini'):
    raise FileNotFoundError(
        f"config.ini not found or unreadable (working directory: {os.getcwd()})"
    )

# Directory that holds the pre-processed inputs; the merged metadata file is
# written into the same directory.
processed_data_path = cp['DATA_PATH']['processed_data_path']
output_file_path = os.path.join(processed_data_path, 'lsc2021-metadata.json')

# inflect engine, used further down to spell out the number of detected people.
p = inflect.engine()

In [3]:
# Resolve the full path of every pre-processed input file. The targets on the
# left are matched positionally with the file names on the right.
(
    white_list_id_path,
    white_list_img_path,
    place_category_path,
    microsoft_tags_path,
    yolo_concepts_path,
    visual_genome_path,
    ocr_path,
    date_time_path,
) = (
    os.path.join(processed_data_path, file_name)
    for file_name in (
        'white_list_image_ids.txt',
        'white_list_image_paths.txt',
        'combined_place_categories.json',
        'MicrosoftTags_concepts.json',
        'ScaledYOLOv4_concepts.json',
        'VisualGenomeRes101_concepts.json',
        'OCR_concepts.json',
        'combined_date_time_gps_activity.json',
    )
)

In [4]:
# Load all kinds of data.
# Every file is read inside a `with` block so its handle is closed as soon as
# the content has been consumed instead of lingering until garbage collection.
def _read_lines(path):
    """Return the lines of the text file at `path`, trailing whitespace stripped."""
    with open(path, 'r') as text_file:
        return [line.rstrip() for line in text_file]


def _load_json(path):
    """Parse and return the JSON document stored at `path`."""
    with open(path, 'r') as json_file:
        return json.load(json_file)


white_list_image_id = _read_lines(white_list_id_path)
white_list_image_path = _read_lines(white_list_img_path)
place_categories = _load_json(place_category_path)
microsoft_tags = _load_json(microsoft_tags_path)
yolo_concepts = _load_json(yolo_concepts_path)
visual_genomes = _load_json(visual_genome_path)
ocrs = _load_json(ocr_path)
date_times = _load_json(date_time_path)

In [5]:
# Merge every per-image source into one metadata record per white-listed image.
# Images for which any source lookup fails are skipped and reported.
lsc2021_metadata = []
for i, image_id in enumerate(white_list_image_id):
    try:
        # Image ids and image paths are paired by position in their files.
        image_path = white_list_image_path[i]
        place_category = place_categories[image_id]

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the generated JSON is identical from one run to the next (iterating
        # a set of strings is not, because of hash randomisation).
        microsoft_tag = list(dict.fromkeys(microsoft_tags[image_id]))

        # Copy before appending so the list stored in yolo_concepts is left
        # untouched and re-running this cell starts from the same input.
        yolo_concept = list(yolo_concepts[image_id])
        person_count = yolo_concept.count('person')
        if person_count > 1:
            # Add a spelled-out count, e.g. "two people", as an extra concept.
            yolo_concept.append(f'{p.number_to_words(person_count)} people')
        yolo_concept = list(dict.fromkeys(yolo_concept))

        visual_genome = list(dict.fromkeys(visual_genomes[image_id]))
        ocr = ocrs[image_id]
        date_time = date_times[image_id]

        # Key order matters: entries unpacked from date_time override '_id' and
        # 'image_path', and are themselves overridden by the keys listed after.
        data = {
            '_id': image_id,
            'image_path': image_path,
            **date_time,
            'place_category': place_category,
            'microsoft_tag': microsoft_tag,
            'yolo_concept': yolo_concept,
            'visual_genome': visual_genome,
            'ocr': ocr,
        }
        lsc2021_metadata.append(data)
    except Exception as error:
        # Best effort: skip this image, but report why it was skipped so that
        # missing or malformed source data can be tracked down.
        print(image_id, repr(error))


In [6]:
# Write the merged records to the output file as JSON indented by four spaces.
with open(output_file_path, mode='w') as metadata_file:
    json.dump(
        lsc2021_metadata,
        metadata_file,
        indent=4,
    )