In [None]:
# (Final Version)
# converting 4 individual JSON files into COCO format JSON files and combining them into 1 single COCO format JSON file

# importing required modules to work with json and directories
import os
import json

# Directory that holds the per-image annotation JSON files to convert.
directory = '/input/test-data-noman/train-input-images-with-json/train-json/'

# Skeleton of the COCO document; "images" and "annotations" are filled in below.
coco_dict = {
    "info": {
        "year": "2023",
        "version": "1",
        "description": "Converted to COCO json format using custom python script",
        "contributor": "Md. Mutasim Billah Abu Noman Akanda",
        "date_created": "2023/03/20"
    },
    "licenses": [],  # no licensing information: we are not the owner of the data
    "categories": [  # seeded with the label types found during the basic EDA below
        {
            "id": 0,
            "name": "TextBlock"
        },
        {
            "id": 1,
            "name": "paragraph"
        },
        {
            "id": 2,
            "name": "TableRegion"
        }
    ],
    "images": [],
    "annotations": [],
}

# name -> id lookup so each segment does not have to rescan the category list
category_ids = {cat['name']: cat['id'] for cat in coco_dict['categories']}

image_id = 0
segment_id = 0

# os.listdir returns entries in arbitrary order; sorting keeps the generated
# image/annotation ids stable from one run to the next.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.json'):
        continue

    with open(os.path.join(directory, filename), 'r') as f:
        data = json.load(f)

    # NOTE(review): "file_name" is the name of the annotation JSON, not of the
    # image it describes — confirm whether consumers expect the image name.
    coco_dict['images'].append({
        "id": image_id,
        "file_name": filename,
        "width": data['width'],
        "height": data['height']
    })

    for segment_key, segment_value in data['segments'].items():
        segment_type = segment_value['type']

        # register a category on the fly when a segment has an unseen type
        category_id = category_ids.get(segment_type)
        if category_id is None:
            category_id = len(coco_dict['categories'])
            coco_dict['categories'].append({
                "id": category_id,
                "name": segment_type
            })
            category_ids[segment_type] = category_id

        # assumes points[0] is the top-left corner, points[1] carries ymax and
        # points[2] carries xmax — TODO confirm against the annotation tool
        points = segment_value['points']
        x_min = points[0]['x']
        y_min = points[0]['y']
        box_width = points[2]['x'] - x_min    # xmax - xmin
        box_height = points[1]['y'] - y_min   # ymax - ymin

        coco_dict['annotations'].append({
            "id": segment_id,
            "image_id": image_id,
            "category_id": category_id,
            # COCO boxes are [x, y, width, height], not [xmin, ymin, xmax, ymax]
            "bbox": [x_min, y_min, box_width, box_height],
            "width": box_width,
            "height": box_height,
            "area": box_width * box_height,
            "iscrowd": 0
        })

        segment_id += 1

    image_id += 1

# Save the COCO dictionary as a JSON file
with open('converted_coco_format.json', 'w') as f:
    json.dump(coco_dict, f)

In [None]:
# draft version for 1 single json to coco format json

import json

# Load the single annotation JSON file that this draft converts
with open('/input/test-data-noman/train-input-images-with-json/train-json/0004b1c3-6260-49f1-bc82-6e6ae94b98f0.json', 'r') as f:
    data = json.load(f)

# Skeleton of the COCO document; "images" and "annotations" are filled in below
coco_dict = {
    "info": {
        "description": "Converted to COCO json using python script",
        "version": "1.0",
        "year": 2023,
        "contributor": "Md. Mutasim Billah Abu Noman Akanda",
        "date_created": "2023/03/20"
    },
    "licenses": [],
    "categories": [
        {
            "id": 0,
            "name": "TextBlock"
        },
        {
            "id": 1,
            "name": "paragraph"
        },
        {
            "id": 2,
            "name": "TableRegion"
        }
    ],
    "images": [],
    "annotations": [],
}

# Image-level information comes straight from the source JSON
filename = data['name']
image_id = 0  # Change this if you have multiple images
coco_dict['images'].append({
    "id": image_id,
    "width": data['width'],
    "height": data['height'],
    "file_name": filename
})

# name -> id lookup for the categories declared above
category_ids = {cat['name']: cat['id'] for cat in coco_dict['categories']}

# Convert every segment into one COCO annotation
segment_id = 0
for segment_key, segment_value in data['segments'].items():
    segment_type = segment_value['type']

    # register unseen types instead of failing with a bare IndexError
    category_id = category_ids.get(segment_type)
    if category_id is None:
        category_id = len(coco_dict['categories'])
        coco_dict['categories'].append({
            "id": category_id,
            "name": segment_type
        })
        category_ids[segment_type] = category_id

    # assumes points[0] is the top-left corner, points[1] carries ymax and
    # points[2] carries xmax — TODO confirm against the annotation tool
    points = segment_value['points']
    x_min = points[0]['x']
    y_min = points[0]['y']
    box_width = points[2]['x'] - x_min
    box_height = points[1]['y'] - y_min

    coco_dict['annotations'].append({
        "id": segment_id,
        "image_id": image_id,
        "category_id": category_id,
        # COCO boxes are [x, y, width, height], not [xmin, ymin, xmax, ymax]
        "bbox": [x_min, y_min, box_width, box_height],
        "width": box_width,
        "height": box_height,
        "area": box_width * box_height,
        "iscrowd": 0
    })

    segment_id += 1

# Save the COCO dictionary as a JSON file
with open('0004b1c3-6260-49f1-bc82-6e6ae94b98f0.json', 'w') as f:
    json.dump(coco_dict, f)

In [None]:
# draft version of basic EDA for checking unique types of labels

import json
import os

# path to the directory containing the annotation JSON files
json_dir = '/input/test-data-noman/train-input-images-with-json/train-json'

# set of distinct segment 'type' values seen across all files
document_types = set()

# full paths of all JSON files in the directory
json_files = [os.path.join(json_dir, f) for f in os.listdir(json_dir) if f.endswith('.json')]

# collect the 'type' of every segment in every file
for json_file in json_files:
    with open(json_file, 'r') as f:
        data = json.load(f)
    # report the path that was read (printing the file handle would only
    # show its TextIOWrapper repr)
    print(json_file, " is loaded")
    for segment_data in data['segments'].values():
        document_types.add(segment_data['type'])

# printing the set of document types
print("\nUnique document types are: ", document_types)

<_io.TextIOWrapper name='/kaggle/input/test-data-noman/train-input-images-with-json/train-json/0004b1c3-6260-49f1-bc82-6e6ae94b98f0.json' mode='r' encoding='UTF-8'>  is loaded
<_io.TextIOWrapper name='/kaggle/input/test-data-noman/train-input-images-with-json/train-json/00b6fc4e-0263-456a-a7a8-d183d4877c49.json' mode='r' encoding='UTF-8'>  is loaded
<_io.TextIOWrapper name='/kaggle/input/test-data-noman/train-input-images-with-json/train-json/003ba068-116f-461d-9d70-204204e3b61a.json' mode='r' encoding='UTF-8'>  is loaded
<_io.TextIOWrapper name='/kaggle/input/test-data-noman/train-input-images-with-json/train-json/025c9854-c6dd-442b-be91-33434fd3f603.json' mode='r' encoding='UTF-8'>  is loaded

Unique document types are:  {'TableRegion', 'paragraph', 'TextBlock'}


In [None]:
# draft version for counting the number of annotations per label

# list of every segment type encountered (one entry per annotation)
document_types = []
count_TextBlock = 0
count_paragraph = 0
count_TableRegion = 0
count_other = 0  # types outside the three expected labels

# full paths of all JSON files in the directory
json_files = [os.path.join(json_dir, f) for f in os.listdir(json_dir) if f.endswith('.json')]

# tally the 'type' of every segment in every file
for json_file in json_files:
    with open(json_file, 'r') as f:
        data = json.load(f)
    for segment_data in data['segments'].values():
        document_type = segment_data['type']
        document_types.append(document_type)
        if document_type == "TextBlock":
            count_TextBlock += 1
        elif document_type == "paragraph":
            count_paragraph += 1
        elif document_type == "TableRegion":
            count_TableRegion += 1
        else:
            # keep unknown labels out of the TableRegion count
            count_other += 1

# printing the total and the per-label counts
print("\nTotal annotations: ", len(document_types))
print("\nTextblock : ", count_TextBlock)
print("\nParagraph : ", count_paragraph)
print("\nTableRegion : ", count_TableRegion)
if count_other:
    print("\nOther (unexpected) types : ", count_other)


Total annotations:  910

Textblock :  903

Paragraph :  6

TableRegion :  1


In [None]:
# draft version for merging the JSON files into 1 single JSON file

# dictionary that accumulates the segments of every input file
merged_data = {"documents": []}

# the merged file is written relative to the current working directory
output_path = "train_merged_data.json"

# sorted so the merged order does not depend on os.listdir's arbitrary order
for filename in sorted(os.listdir(json_dir)):
    if not filename.endswith(".json"):
        continue

    # loading the contents of the current JSON file into a dictionary
    with open(os.path.join(json_dir, filename), "r") as f:
        data = json.load(f)

    # adding every segment of this file to the merged data
    merged_data["documents"].extend(data["segments"].values())

# writing the merged data to a new JSON file
with open(output_path, "w") as f:
    json.dump(merged_data, f)

# report the location the file was actually written to
print("Merged data saved to", os.path.abspath(output_path))

Merged data saved to /kaggle/input/merged-data/train_merged_data.json


In [None]:
import json

# Read the merged segments back from disk
with open("/working/train_merged_data.json") as f:
    data = json.load(f)

# Tally how many merged segments carry each of the three known labels;
# segments with any other label are not counted.
textblock_count = sum(1 for document in data['documents'] if document['type'] == 'TextBlock')
paragraph_count = sum(1 for document in data['documents'] if document['type'] == 'paragraph')
tableregion_count = sum(1 for document in data['documents'] if document['type'] == 'TableRegion')

for summary_line in (
    f"Number of TextBlock segments: {textblock_count}",
    f"Number of Paragraph segments: {paragraph_count}",
    f"Number of TableRegion segments: {tableregion_count}",
):
    print(summary_line)

Number of TextBlock segments: 903
Number of Paragraph segments: 6
Number of TableRegion segments: 1
