In [1]:
#!pip install -U pandas==1.1.4 --user
#!pip install swifter --user

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import json
import pandas as pd
import math
import swifter

In [20]:
def getImageDataTable(annotationsJsonPath, longSide=1333, shortSide=800):
    """Build a per-image table of resize and padding metadata.

    Reads the ``"images"`` list of the annotations JSON file and, for every
    image, computes the scale factor that fits it inside a
    ``longSide`` x ``shortSide`` canvas (landscape orientation when
    ``width >= height``, portrait otherwise) without changing the aspect
    ratio, plus the padding needed to centre the scaled image on that canvas.

    Parameters
    ----------
    annotationsJsonPath : str or path-like
        Path to a JSON file whose top-level ``"images"`` entry is a list of
        records carrying at least ``"width"`` and ``"height"``.
    longSide : int, optional
        Canvas extent along the image's longer dimension (default 1333).
    shortSide : int, optional
        Canvas extent along the image's shorter dimension (default 800).

    Returns
    -------
    pandas.DataFrame
        The image records, minus the ``license``, ``coco_url``,
        ``date_captured`` and ``flickr_url`` columns when present, plus
        ``width_bigger``, ``ratio``, ``scaled_width``, ``scaled_height``,
        ``padding_left``, ``padding_right``, ``padding_top`` and
        ``padding_bottom`` (in that order).

    Raises
    ------
    ZeroDivisionError
        If any image has a width or height of zero.
    """
    with open(annotationsJsonPath, encoding="utf-8") as json_file:
        data = json.load(json_file)
    table = pd.DataFrame.from_records(data["images"])

    width = table["width"]
    height = table["height"]

    # Both dimensions are used as divisors below. pandas would silently turn a
    # division by zero into inf/NaN, so fail loudly with the same exception
    # type plain Python division raises.
    if ((width == 0) | (height == 0)).any():
        raise ZeroDivisionError("image width and height must be non-zero")

    widthBigger = width >= height

    # Landscape images are fitted into longSide x shortSide, portrait images
    # into shortSide x longSide; the smaller of the two candidate factors keeps
    # both dimensions inside the canvas.
    longerEdge = width.where(widthBigger, height)
    shorterEdge = height.where(widthBigger, width)
    longRatio = longSide / longerEdge
    shortRatio = shortSide / shorterEdge
    ratio = longRatio.where(longRatio <= shortRatio, shortRatio)

    scaledWidth = (width * ratio).map(math.floor)
    scaledHeight = (height * ratio).map(math.floor)

    paddedWidth = (longSide - scaledWidth).where(widthBigger, shortSide - scaledWidth)
    paddedHeight = (shortSide - scaledHeight).where(widthBigger, longSide - scaledHeight)

    table["width_bigger"] = widthBigger
    table["ratio"] = ratio
    table["scaled_width"] = scaledWidth
    table["scaled_height"] = scaledHeight
    # When the padding is odd, the extra pixel goes to the right / bottom.
    table["padding_left"] = paddedWidth // 2
    table["padding_right"] = paddedWidth - paddedWidth // 2
    table["padding_top"] = paddedHeight // 2
    table["padding_bottom"] = paddedHeight - paddedHeight // 2

    # errors="ignore": annotation files that omit these optional metadata
    # fields are still accepted.
    return table.drop(
        columns=["license", "coco_url", "date_captured", "flickr_url"],
        errors="ignore",
    )

def getLabelDataTable(annotationsJsonPath, imagesTable, longSide=1333, shortSide=800):
    """Build a per-annotation table of raw, scaled and normalized boxes.

    Reads the ``"annotations"`` and ``"categories"`` lists of the annotations
    JSON file, expands each ``bbox`` into x / y / width / height columns,
    applies the owning image's scale factor and left/top padding, and divides
    the scaled values by the canvas extent (``longSide`` x ``shortSide`` for
    images flagged ``width_bigger``, ``shortSide`` x ``longSide`` otherwise).

    Parameters
    ----------
    annotationsJsonPath : str or path-like
        Path to a JSON file with top-level ``"annotations"`` and
        ``"categories"`` lists.
    imagesTable : pandas.DataFrame
        Must provide ``id``, ``ratio``, ``padding_left``, ``padding_top`` and
        ``width_bigger`` columns. If an id occurs more than once, its first
        row is used.
    longSide : int, optional
        Canvas extent along the image's longer dimension (default 1333).
    shortSide : int, optional
        Canvas extent along the image's shorter dimension (default 800).

    Returns
    -------
    pandas.DataFrame
        One row per annotation whose ``category_id`` matches a category (inner
        join), with the annotation ``id``, the ``box_*``, ``scaled_box_*`` and
        ``normalized_box_*`` columns, ``supercategory`` (when the categories
        provide it) and ``category`` (the category name).

    Raises
    ------
    IndexError
        If an annotation references an image id that is absent from
        ``imagesTable``.
    """
    with open(annotationsJsonPath, encoding="utf-8") as json_file:
        data = json.load(json_file)
    annotations = pd.DataFrame.from_records(data["annotations"])
    categories = pd.DataFrame.from_records(data["categories"])

    table = annotations.drop(columns=["segmentation", "area", "iscrowd"], errors="ignore")

    # Index the image metadata by id once instead of scanning the whole image
    # table for every annotation. drop_duplicates keeps the first row per id,
    # which matches a "first match wins" lookup.
    imageLookup = imagesTable.drop_duplicates(subset="id").set_index("id")

    unknownImage = ~table["image_id"].isin(imageLookup.index)
    if unknownImage.any():
        raise IndexError(
            "{} annotation(s) reference image ids missing from imagesTable "
            "(first missing id: {!r})".format(
                int(unknownImage.sum()), table.loc[unknownImage, "image_id"].iloc[0]
            )
        )

    def imageColumn(name):
        # Per-annotation view of an image column, aligned with table's index
        # and kept in the column's own dtype.
        values = imageLookup[name]
        return table["image_id"].map(values).astype(values.dtype)

    ratio = imageColumn("ratio")
    paddingLeft = imageColumn("padding_left")
    paddingTop = imageColumn("padding_top")
    widthBigger = imageColumn("width_bigger").astype(bool)

    bboxes = table["bbox"].tolist()
    for position, column in enumerate(("box_x", "box_y", "box_width", "box_height")):
        table[column] = [bbox[position] for bbox in bboxes]

    table["box_center_x"] = table["box_x"] + table["box_width"] // 2
    table["box_center_y"] = table["box_y"] + table["box_height"] // 2

    # Centres are shifted by the left/top padding; sizes are only scaled.
    table["scaled_box_center_x"] = (ratio * table["box_center_x"]).map(math.floor) + paddingLeft
    table["scaled_box_center_y"] = (ratio * table["box_center_y"]).map(math.floor) + paddingTop
    table["scaled_box_width"] = (ratio * table["box_width"]).map(math.floor)
    table["scaled_box_height"] = (ratio * table["box_height"]).map(math.floor)

    # Canvas extent per annotation: long side horizontal for landscape images,
    # vertical for portrait ones.
    shortExtent = pd.Series(shortSide, index=table.index)
    canvasWidth = shortExtent.where(~widthBigger, longSide)
    canvasHeight = shortExtent.where(widthBigger, longSide)

    table["normalized_box_center_x"] = table["scaled_box_center_x"] / canvasWidth
    table["normalized_box_center_y"] = table["scaled_box_center_y"] / canvasHeight
    table["normalized_box_width"] = table["scaled_box_width"] / canvasWidth
    table["normalized_box_height"] = table["scaled_box_height"] / canvasHeight

    table = table.drop(columns=["bbox"])

    # Both frames carry an "id" column, so the merge suffixes them: id_x is the
    # annotation id (kept), id_y the category id (duplicate of category_id).
    table = table.merge(categories, left_on="category_id", right_on="id").drop(columns="id_y")

    return table.rename(columns={"id_x": "id", "name": "category"})



In [22]:
# Derive the per-image resize/padding table for the train split and save it as CSV.
trainImagesTable = getImageDataTable(
    annotationsJsonPath="../data/annotations/instances_train2017.json"
)
trainImagesTable.to_csv(
    path_or_buf="../data/annotations/cleaned_train_image_data.csv",
    index=False,
)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=118287.0, style=ProgressStyle(descript…




In [23]:
# Derive the per-annotation box table for the train split (needs the train
# image table built in the previous cell) and save it as CSV.
trainLabelsTable = getLabelDataTable(
    annotationsJsonPath="../data/annotations/instances_train2017.json",
    imagesTable=trainImagesTable,
)
trainLabelsTable.to_csv(
    path_or_buf="../data/annotations/cleaned_train_labels_data.csv",
    index=False,
)
# Unbind both train tables; presumably done to release memory before the
# validation split is processed.
del trainImagesTable, trainLabelsTable

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=860001.0, style=ProgressStyle(descript…




In [18]:
# Derive the per-image resize/padding table for the validation split and save it as CSV.
valImagesTable = getImageDataTable(
    annotationsJsonPath="../data/annotations/instances_val2017.json"
)
valImagesTable.to_csv(
    path_or_buf="../data/annotations/cleaned_val_image_data.csv",
    index=False,
)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=5000.0, style=ProgressStyle(descriptio…




In [21]:
# Derive the per-annotation box table for the validation split (needs the
# validation image table built in the previous cell) and save it as CSV.
valLabelsTable = getLabelDataTable(
    annotationsJsonPath="../data/annotations/instances_val2017.json",
    imagesTable=valImagesTable,
)
valLabelsTable.to_csv(
    path_or_buf="../data/annotations/cleaned_val_labels_data.csv",
    index=False,
)
# Unbind both validation tables; presumably done to release memory.
del valImagesTable, valLabelsTable

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=36781.0, style=ProgressStyle(descripti…


