In [38]:
from doctr.models import ocr_predictor
from doctr.io import DocumentFile
from datetime import datetime
import json
from PIL import Image
import os
import random
import shutil
import math

# on windows machine potentially necessary
#import os
# insert the GTK3 Runtime folder at the beginning. Can be bin or lib, depending on path you choose while installing.
#GTK_FOLDER = r'C:\Program Files\GTK3-Runtime Win64\bin'
#os.environ['PATH'] = GTK_FOLDER + os.pathsep + os.environ.get('PATH', '')

# Root folder for all data read and written by this notebook.
data_dir = 'data'

# Folder holding the annotation images together with their labels.json.
annotations_dir = f'{data_dir}/annotations'
# Create it if needed. exist_ok=True makes a separate existence check
# unnecessary, matching how the train/val folders are created further down.
os.makedirs(annotations_dir, exist_ok=True)

# Full paths of the original images. os.listdir returns entries in arbitrary
# order, so sort them to get the same sequence on every run.
orig_images_path = f'{data_dir}/orig_images'
image_paths = [os.path.join(orig_images_path, i) for i in sorted(os.listdir(orig_images_path))]

# JSON file mapping annotation image filenames to their labels.
labels_file = os.path.join(annotations_dir, 'labels.json')

In [39]:
# # Initialize a dictionary to hold the JSON structure for labels
# labels = {}

# # Load model(s)
# model = ocr_predictor(det_arch='db_resnet50', reco_arch='parseq', pretrained=True)

# for image_path in image_paths: 
#     name_of_label = image_path.split("/")[-1].split("_")[0]
#     img = Image.open(image_path)
#     # Check if the image is in landscape (width > height)
#     if img.width > img.height:
#         # Rotate the image by 270 degrees counter-clockwise (i.e. 90 degrees clockwise)
#         img = img.transpose(Image.Transpose.ROTATE_270)
#     doc = DocumentFile.from_images(image_path)
#     # Analyze
#     result = model(doc)
#     data = result.export()
#     data = data["pages"][0]

#     # Loop through each block, line, and word to extract the words and create labels
#     for block in data['blocks']:
#         for line in block['lines']:
#             for word in line['words']:
#                 # Get the word coordinates
#                 coords = word['geometry']
#                 # Calculate the coordinates on the image
#                 left = img.width * coords[0][0]
#                 top = img.height * coords[0][1]
#                 right = img.width * coords[1][0]
#                 bottom = img.height * coords[1][1]
                
#                 # Crop the image to the word
#                 word_img = img.crop((left, top, right, bottom))
                
#                 # Generate a timestamp string from the current hour, minute, second and microseconds
#                 # Get the current time
#                 current_time_precise = datetime.now()
                
#                 # Format the time as hour, minute, second and microseconds (no date component)
#                 timestamp_precise = current_time_precise.strftime("%H%M%S%f")
                
#                 # Define the filename with suffix
#                 filename = f"img_{name_of_label}_{timestamp_precise}.png"
                
#                 # Save the cropped image with the new filename
#                 word_img.save(f"{annotations_dir}/{filename}")
                
#                 # Add the entry to the labels dictionary
#                 labels[filename] = name_of_label
                
# # Save the updated labels dictionary back to the JSON file
# with open(labels_file, 'w') as f:
#     json.dump(labels, f)

In [40]:
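# Manual step: review the images in the annotations directory and delete the bad ("dirty") ones by hand.
# The next cell then removes the labels of the deleted files from labels.json.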

In [41]:
# Drop label entries whose image file no longer exists in the annotations
# directory, then write the pruned mapping back to labels.json.
with open(labels_file, 'r') as f:
    labels = json.load(f)

# Everything currently present in the 'annotations' directory.
existing_files = os.listdir(annotations_dir)
# Membership is tested once per label, so use a set for O(1) lookups.
existing_file_set = set(existing_files)

# Keep only the entries whose file is still on disk.
labels = {filename: word for filename, word in labels.items() if filename in existing_file_set}

# Persist the pruned dictionary; labels.json now only references existing files.
with open(labels_file, 'w') as f:
    json.dump(labels, f)


In [42]:
# Create the train/validation folders and their 'images' subfolders if they
# don't exist yet. makedirs also creates the parent folders.
train_folder = os.path.join(data_dir, 'train_path')
val_folder = os.path.join(data_dir, 'val_path')
train_folder_images = os.path.join(train_folder, 'images')
val_folder_images = os.path.join(val_folder, 'images')
for folder in (train_folder_images, val_folder_images):
    os.makedirs(folder, exist_ok=True)

# Collect the image files to split. os.listdir returns entries in arbitrary
# order, so sort them to make the split reproducible across runs. Skip the
# labels file and anything that is not a regular file (copy2 would fail on a
# directory).
image_files = [
    entry for entry in sorted(os.listdir(annotations_dir))
    if entry != "labels.json"
    and os.path.isfile(os.path.join(annotations_dir, entry))
]

# Per-label running counts of the images sent to each split.
train_counts = {}
val_counts = {}

# Desired fraction of each label's images that goes to the training set.
split_ratio = 0.8

for image_file in image_files:
    # The commented-out cropping cell names files "img_<label>_<timestamp>.png",
    # so the label is the second '_'-separated field. Field 0 is the constant
    # "img" prefix, which would lump every image into a single category.
    # Fall back to field 0 for names without an underscore.
    parts = image_file.split('_')
    name = parts[1] if len(parts) > 1 else parts[0]

    # First time this label is seen: start both counters at zero.
    train_counts.setdefault(name, 0)
    val_counts.setdefault(name, 0)

    # Send the image to train while the label's train share (with this image
    # counted in the denominator) is still below split_ratio, otherwise to val.
    seen = train_counts[name] + val_counts[name]
    if train_counts[name] / (seen + 1) < split_ratio:
        train_counts[name] += 1
        destination_folder = train_folder_images
    else:
        val_counts[name] += 1
        destination_folder = val_folder_images

    # Copy (not move) the image into the chosen split folder.
    source_path = os.path.join(annotations_dir, image_file)
    destination_path = os.path.join(destination_folder, image_file)
    shutil.copy2(source_path, destination_path)

In [26]:
# for i in [(train_folder, train_folder_images), (val_folder, val_folder_images)]:
#     # Load the labels from the labels.json file
#     with open(labels_file, 'r') as f:
#         labels = json.load(f)
    
#     # Get a list of all filenames in the 'annotations' directory
#     existing_files = os.listdir(i[1])
    
#     # Remove keys from the dictionary for files that don't exist
#     labels = {filename: word for filename, word in labels.items() if filename in existing_files}
    
#     labels_file_new = os.path.join(i[0], 'labels.json')
    
#     # Save the updated labels dictionary back to the JSON file
#     with open(labels_file_new, 'w') as f:
#         json.dump(labels, f)
#     # The labels dictionary is now updated with only existing files.


## Labels from user entries

In [43]:
# Folder with the annotation images that come from user entries.
user_entry_annotations_dir = f'{data_dir}/user_entries/annotations'
# Path of the labels.json that sits in that folder.
labels_file_user = os.path.join(user_entry_annotations_dir, 'labels.json')

In [44]:
# Collect the user-entry image files to split. os.listdir returns entries in
# arbitrary order, so sort them to make the split reproducible across runs.
# Skip the labels file and anything that is not a regular file (copy2 would
# fail on a directory).
image_files = [
    entry for entry in sorted(os.listdir(user_entry_annotations_dir))
    if entry != "labels.json"
    and os.path.isfile(os.path.join(user_entry_annotations_dir, entry))
]

# Per-label running counts of the images sent to each split. They start from
# zero here, so user entries are balanced independently of earlier copies.
train_counts = {}
val_counts = {}

# Desired fraction of each label's images that goes to the training set.
split_ratio = 0.8

for image_file in image_files:
    # The label is the second '_'-separated field of the filename. Fall back
    # to the first field when there is no underscore, instead of raising
    # IndexError.
    parts = image_file.split('_')
    name = parts[1] if len(parts) > 1 else parts[0]

    # First time this label is seen: start both counters at zero.
    train_counts.setdefault(name, 0)
    val_counts.setdefault(name, 0)

    # Send the image to train while the label's train share (with this image
    # counted in the denominator) is still below split_ratio, otherwise to val.
    seen = train_counts[name] + val_counts[name]
    if train_counts[name] / (seen + 1) < split_ratio:
        train_counts[name] += 1
        destination_folder = train_folder_images
    else:
        val_counts[name] += 1
        destination_folder = val_folder_images

    # Copy (not move) the image into the chosen split folder.
    source_path = os.path.join(user_entry_annotations_dir, image_file)
    destination_path = os.path.join(destination_folder, image_file)
    shutil.copy2(source_path, destination_path)

In [45]:
# Inputs: the user-entry labels and the annotation labels.
first_json_path = f'{user_entry_annotations_dir}/labels.json'
second_json_path = f'{annotations_dir}/labels.json'
# Outputs: one labels.json per split folder.
combined_json_path_train = f'{train_folder}/labels.json'
combined_json_path_val = f'{val_folder}/labels.json'

# Load both label mappings.
with open(first_json_path, 'r') as user_fh:
    first_data = json.load(user_fh)
with open(second_json_path, 'r') as annotations_fh:
    second_data = json.load(annotations_fh)

# Merge them; when a key occurs in both, the entry from second_data wins.
combined_data = {**first_data, **second_data}

# Write the same merged mapping into the train folder and the val folder.
for out_path in (combined_json_path_train, combined_json_path_val):
    with open(out_path, 'w') as out_fh:
        json.dump(combined_data, out_fh, indent=4)



In [46]:
# For each split, keep only the labels whose image is present in that split's
# images folder, and write the pruned mapping back to the split's labels.json.
for split_folder, split_images_folder in [(train_folder, train_folder_images), (val_folder, val_folder_images)]:
    # Load the split's current labels.json.
    labels_file_new = os.path.join(split_folder, 'labels.json')
    with open(labels_file_new, 'r') as f:
        labels = json.load(f)

    # Filenames present in this split's images folder; the set gives O(1)
    # membership tests for the per-label check below.
    existing_files = os.listdir(split_images_folder)
    existing_file_set = set(existing_files)

    # Drop entries for files that are not part of this split.
    labels = {filename: word for filename, word in labels.items() if filename in existing_file_set}

    # Overwrite the split's labels.json with the pruned mapping.
    with open(labels_file_new, 'w') as f:
        json.dump(labels, f)
