In [None]:
import json
import pandas as pd

def data_json_to_joined_df(path_prefix_to_json, percent = 100, random_sample=False, random_state=None):
    """Load the metadata tables and join object annotations with images and categories.

    Reads ``category.json``, ``object_ann.json`` and ``sample_data.json``. The
    file names are appended directly to ``path_prefix_to_json``, so the prefix
    should end with a path separator.

    Args:
        path_prefix_to_json: String prepended to each metadata file name.
        percent: Percentage (0-100) of the kept images to use.
        random_sample: If True the images are drawn at random; otherwise the
            first ``percent`` percent of rows (in file order) are taken.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so that a
            random subset can be reproduced. Only used when ``random_sample``
            is True.

    Returns:
        A DataFrame with one row per object annotation whose image was kept.
        Columns that exist in both ``object_ann`` and ``sample_data`` get the
        ``_left`` / ``_right`` suffixes. ``category_name`` is the renamed
        ``name`` column of the category table and ``category_id`` is the
        category's position in alphabetical ``category_name`` order.

    Raises:
        ValueError: If ``percent`` is outside the range 0-100.
    """
    # A negative percent would silently slice from the end of the table and a
    # value above 100 cannot be sampled without replacement, so reject both.
    if not 0 <= percent <= 100:
        raise ValueError("percent must be between 0 and 100, got {!r}".format(percent))

    category = pd.read_json(path_prefix_to_json + 'category.json').rename(columns={"name": "category_name"})
    # Ids are assigned after sorting so they depend only on the category names,
    # not on the order of the records in the file.
    category = category.sort_values(by='category_name')
    category['category_id'] = range(len(category))

    obj_ann = pd.read_json(path_prefix_to_json + 'object_ann.json')
    sample_data = pd.read_json(path_prefix_to_json + 'sample_data.json')

    # Keep only keyframe (annotated) images: per the original author their file
    # names start with 'samples', while 'sweeps' images carry no annotations.
    sample_data = sample_data[sample_data['filename'].str.startswith('sample')]

    n_rows = int(len(sample_data) * (percent / 100))
    if random_sample:
        sample_data = sample_data.sample(n=n_rows, random_state=random_state)
    else:
        sample_data = sample_data.iloc[:n_rows]

    # Inner join: annotations whose image was filtered out above are dropped.
    merged = obj_ann.merge(
        sample_data,
        how='inner',
        left_on='sample_data_token',
        right_on='token',
        suffixes=('_left', '_right')
    )

    # Left join: every remaining annotation is kept and gets its category columns.
    merged = merged.merge(
        category,
        how='left',
        left_on='category_token',
        right_on='token',
        suffixes=('_from_merged', '_from_category')
    )

    return merged

def create_image_filename_list_txt(data_split, merged_df):
    """Append the unique image file names of ``merged_df`` to ``<data_split>.txt``.

    The text file holds one file name per line so it can be fed to cp/rsync
    when copying the images of a train/val/test split.

    Args:
        data_split: One of 'train', 'val' or 'test'; used as the file stem.
        merged_df: DataFrame with a 'filename' column.

    Returns:
        -1 if ``data_split`` is not a recognised split (nothing is written),
        otherwise None.

    Note:
        The file is opened in append mode, so calling this twice for the same
        split adds the names a second time.
    """
    if data_split not in ['train', 'val', 'test']:
        return -1

    # Sorting makes the output order the same on every run.
    filenames = sorted(merged_df['filename'].unique())

    # Open the file once instead of re-opening it for every single line.
    with open(data_split + ".txt", 'a') as f:
        f.writelines(fn + "\n" for fn in filenames)


def df_to_coco_format(merged_df, path_prefix, split): # DETR uses COCO format
    """Write the annotations in ``merged_df`` to ``<split>.json`` in COCO layout.

    Args:
        merged_df: DataFrame with the columns 'filename', 'bbox',
            'category_id', 'width' and 'height'. 'bbox' is read as
            ``[xmin, ymin, xmax, ymax]``.
        path_prefix: String prepended to 'category.json' to locate the
            category table.
        split: 'train' or 'val'; used as the stem of the output file, which is
            written relative to the current working directory.

    Returns:
        -1 if ``split`` is not 'train' or 'val' (nothing is written),
        otherwise None.
    """
    if split not in ['val', 'train']:
        return -1

    df = merged_df[['filename', 'bbox', 'category_id', 'width', 'height']]

    coco_dict = {"images": [], "annotations": [], "categories": []}
    filename_to_id = {}

    # build images: one entry per distinct file, ids in order of first appearance
    for filename, width, height in zip(df['filename'], df['width'], df['height']):
        if filename in filename_to_id:
            continue
        image_id = len(filename_to_id)
        filename_to_id[filename] = image_id
        coco_dict["images"].append({
            "id": image_id,
            # int() turns numpy integers into plain ints that json can serialise
            "width": int(width),
            "height": int(height),
            "file_name": filename.split("/")[-1],
        })

    # build annotations; ids are a running counter so they stay unique even if
    # the DataFrame index is not
    rows = zip(df['filename'], df['bbox'], df['category_id'])
    for annotation_id, (filename, bbox, category_id) in enumerate(rows):
        xmin, ymin, xmax, ymax = bbox[:4]
        box_width = xmax - xmin
        box_height = ymax - ymin
        coco_dict["annotations"].append({
            "id": annotation_id,
            "image_id": filename_to_id[filename],
            "category_id": int(category_id),
            "area": box_width * box_height,
            # COCO boxes are [top-left x, top-left y, width, height]; the
            # top-left corner is (xmin, ymin)
            "bbox": [xmin, ymin, box_width, box_height],
            "iscrowd": 0,
        })

    # build category: the id is the position in alphabetical name order, the
    # same numbering data_json_to_joined_df assigns to 'category_id'
    category = pd.read_json(path_prefix + 'category.json').rename(columns={"name": "category_name"})
    category = category.sort_values(by='category_name')

    for category_id, category_name in enumerate(category['category_name']):
        coco_dict["categories"].append({
            "id": category_id,
            "name": category_name,
        })

    with open(split + ".json", "w") as file:
        json.dump(coco_dict, file)


In [5]:
# Join the validation metadata for a random 1% of the kept images and preview it.
val = data_json_to_joined_df(
    '/home/akbar/Downloads/nuimages-v1.0-all-metadata/v1.0-val/',
    percent=1,
    random_sample=True,
)
val.head()

Unnamed: 0,token_left,category_token,bbox,mask,attribute_tokens,sample_data_token,token_right,sample_token,ego_pose_token,calibrated_sensor_token,...,width,height,timestamp,is_key_frame,prev,next,token,category_name,description,category_id
0,0000efb0e1c841d49ddfb26734c38fb1,6021b5187b924d64be64a702e5570edf,"[292, 454, 405, 508]","{'size': [900, 1600], 'counts': 'ZVNRODhkazA+Q...",[abc0f113547848a9baaa62096fca37f5],88203d2859ab4dac9870d5559a99ce9e,88203d2859ab4dac9870d5559a99ce9e,0391cfee6088407fa3bb4e416ab030f4,fd6751aa12a84b9d9ff69dbb2d9fdab3,dbff59d5c7ad575090bb13541507daae,...,1600,900,2018-08-21 06:30:19.262407,True,d20d37ba67ec4236b434417be69bb451,31f251c7970d4be38c0a5fb128219879,6021b5187b924d64be64a702e5570edf,vehicle.truck,Vehicles primarily designed to haul cargo incl...,24
1,00af54f5961d4c4fa9a6c92a8b2e44ff,653f7efbb9514ce7b81d44070d6208c1,"[1287, 459, 1333, 487]","{'size': [900, 1600], 'counts': 'YVBgUzEzUmwwM...",[],64ef47d5001f4fc898db81a66ddff437,64ef47d5001f4fc898db81a66ddff437,701edc492abd4e348de7e0cc81415b34,beeb80e92c584ab98c0e90e78818d389,992fb1a4038d59b9a4dedad8c6965eff,...,1600,900,2018-07-31 02:45:24.287005,True,44cbed56544a46d1ad6d9f35e678fce1,80f9bc994e07440298fce41009e167a3,653f7efbb9514ce7b81d44070d6208c1,movable_object.barrier,Temporary road barrier placed in the scene in ...,9
2,00e8e034537a4c1ba3f0f46f10d91166,fd69059b62a3469fbaef25340c0eab7f,"[314, 482, 501, 559]","{'size': [900, 1600], 'counts': 'ZlpiOTFTbDAwM...",[9f65c1eaa74e4d5db46e87a34811e994],6578fb45138743bfae01555ec16f59b3,6578fb45138743bfae01555ec16f59b3,eb223288d5c24cd8871b0e83181c422c,d44b482391f449589d97ddaf6fe348a8,bc48b9875aab5db4b9cb09e5f0f23367,...,1600,900,2018-07-05 09:21:40.037551,True,00c2214cca1a468e8c1a2c7d4f23ddea,81489c386bc9457289bf3ac997477e07,fd69059b62a3469fbaef25340c0eab7f,vehicle.car,"Vehicle designed primarily for personal use, e...",17
3,011a40843fc04274a1e8e96e757e471b,1fa93b757fc74fb197cdd60001ad8abf,"[30, 463, 57, 537]","{'size': [900, 1600], 'counts': 'WmtqMDRQMU1ra...",[60346f6d0fcd4d9eadd6f64c77dd1e93],8ac03b05cc764847826b9ca3571e0205,8ac03b05cc764847826b9ca3571e0205,264c9042e2264f3795e3b2debf581d2f,0bd8f1db12414221ae13c55c6708e385,306a07e312a95bd19b8363450c44eade,...,1600,900,2018-07-12 20:25:02.197158,True,3f5d15ec50b74765a03eb36ce55b7c99,d27d7480d4a5423c97414f3657ca20a5,1fa93b757fc74fb197cdd60001ad8abf,human.pedestrian.adult,Adult subcategory.,2
4,0157192019094b0783586f8084772f62,fd69059b62a3469fbaef25340c0eab7f,"[579, 464, 617, 480]","{'size': [900, 1600], 'counts': 'WFduPzRuazA0T...",[abc0f113547848a9baaa62096fca37f5],4279c39de30144b9984dd5c0735a8527,4279c39de30144b9984dd5c0735a8527,0fb93d1975cb4d389de86836d7e6ec36,a29ee351b81b476d92425a276fb395bc,aad797a273fe5050a9ade08890499675,...,1600,900,2018-09-11 08:56:05.112757,True,ff42c42f67744f05ba0df12e3a7a883e,9c781e7511c945cfb199a6ae1ed6f742,fd69059b62a3469fbaef25340c0eab7f,vehicle.car,"Vehicle designed primarily for personal use, e...",17


In [7]:
# Export the sampled frame as COCO-style JSON. df_to_coco_format writes
# '<split>.json' relative to the working directory and has no return statement
# on the success path, so `a` is None here (or -1 for an unsupported split).
# NOTE(review): the metadata path is v1.0-val but split is 'train', so this
# writes train.json from validation data — confirm this is intended.
a = df_to_coco_format(val, '/home/akbar/Downloads/nuimages-v1.0-all-metadata/v1.0-val/', 'train')