In [1]:
import json
import csv
from os.path import join, dirname
import unicodedata

def load_disambiguation_mapping(mapping_path):
    """Load the character disambiguation table from a CSV file.

    The CSV must have a header row with at least the columns ``char`` and
    ``replacement``; each data row maps one ``char`` value to its
    ``replacement``. If the same ``char`` appears more than once, the last
    row wins.

    Args:
        mapping_path: Path to the CSV file.

    Returns:
        dict mapping each ``char`` string to its ``replacement`` string.

    Raises:
        OSError: if the file cannot be opened.
        KeyError: if a row lacks the ``char`` or ``replacement`` column.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale; newline='' lets the csv module handle line endings
    # itself, as its documentation requires.
    with open(mapping_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.DictReader(file)
        return {row['char']: row['replacement'] for row in reader}

def disambiguate_label(label, disambiguation_mapping, verbose=True):
    """Apply the disambiguation table to a transcription label.

    The label is first normalised to Unicode NFC, then every code point of
    the normalised string is looked up in ``disambiguation_mapping``; code
    points that have no entry are kept unchanged.

    Args:
        label: The transcription string to clean.
        disambiguation_mapping: dict mapping single characters to their
            replacement strings.
        verbose: When true (the default, matching the historical behaviour)
            print the label before and after the substitution. Pass
            ``False`` to keep notebook output quiet on large datasets.

    Returns:
        The label with all mapped characters replaced.
    """
    if verbose:
        print('Before:', label)
    normalized = unicodedata.normalize('NFC', label)
    result = ''.join(disambiguation_mapping.get(c, c) for c in normalized)
    if verbose:
        print('After: ', result)
    return result

def disambiguate_line(v, disambiguation_mapping):
    """Disambiguate the ``'label'`` field of one annotation entry.

    The entry ``v`` is updated in place (its ``'label'`` value is
    overwritten with the cleaned text) and the same object is returned.
    """
    cleaned_label = disambiguate_label(v['label'], disambiguation_mapping)
    v['label'] = cleaned_label
    return v

def extract_post_init(annotation, disambiguation_mapping):
    """Disambiguate every annotation entry and build the sample list.

    Every entry is first passed through ``disambiguate_line`` (which
    rewrites its ``'label'`` in place); only after all entries have been
    cleaned is each one split into symbols with ``split_combining_``.

    Args:
        annotation: dict mapping a sample key to an entry that provides at
            least the ``'label'`` and ``'split'`` fields.
        disambiguation_mapping: character replacement table forwarded to
            ``disambiguate_line``.

    Returns:
        A list of ``(key, symbols, split)`` tuples in the iteration order of
        ``annotation``.
    """
    cleaned = {}
    for key, entry in annotation.items():
        cleaned[key] = disambiguate_line(entry, disambiguation_mapping)

    samples = []
    for key, entry in cleaned.items():
        samples.append((key, split_combining_(entry), entry['split']))
    return samples

def split_combining_(v):
    """Split an annotation entry's label into a list of NFC symbols.

    ``process_transcription`` returns either a plain string (no separator
    configured) or a list of tokens (label split on the separator). The
    original code passed the result straight to ``unicodedata.normalize``,
    which raises ``TypeError`` for a list; both shapes are handled here.

    Args:
        v: Annotation entry providing a ``'label'`` string.

    Returns:
        A list of strings: the individual code points of the NFC-normalised
        label when the transcription is a string, or the NFC-normalised
        tokens when it was split on a separator.
    """
    label = process_transcription(v['label'])
    if isinstance(label, str):
        return list(unicodedata.normalize('NFC', label))
    # Separator mode: each token is one symbol, normalised on its own.
    return [unicodedata.normalize('NFC', token) for token in label]

def process_transcription(raw_transcription):
    """Strip the space marker from a transcription and optionally tokenise it.

    Relies on two module-level globals that must be defined before the
    first call: ``space`` (every occurrence is removed from the text) and
    ``sep`` (when non-empty, the text is split on it).

    Returns:
        The stripped string when ``sep`` is the empty string, otherwise the
        list of pieces produced by splitting on ``sep``.
    """
    stripped = raw_transcription.replace(space, '')
    return stripped if sep == '' else stripped.split(sep)

In [4]:

# Load your annotation data
annotation_path = '/home/vlachoum/PROJECT/learnable-scriber/datasets/textualis_formata/annotation.json'
# The labels contain non-ASCII characters, so decode explicitly as UTF-8
# instead of relying on the platform's locale default.
with open(annotation_path, encoding='utf-8') as f:
    annotation_data = json.load(f)

# Load your disambiguation mapping
mapping_path = '/home/vlachoum/PROJECT/learnable-scriber/learnable_typewriter/data/disambiguation_table.csv'
disambiguation_mapping = load_disambiguation_mapping(mapping_path)

# Globals read by process_transcription(); they must be set before
# extract_post_init() is called below.
space = ' '  # removed from every label; or whatever your actual definition is
sep = ''  # '' means "do not split"; or whatever your actual definition is

# Extract and disambiguate labels (note: this rewrites the 'label' fields of
# annotation_data in place)
extracted_data = extract_post_init(annotation_data, disambiguation_mapping)

Before: au sarrazins pour ce quen sa terre fu la ba
After:  au sarrazins pour ce quen sa terre fu la ba
Before: taille . ⁊ quil deuoit avoir la premiere poĩ
After:  taille . ⁊ quil deuoit avoir la premiere poĩ
Before: te . li quens de triple point seur les sarrazĩs
After:  te . li quens de triple point seur les sarrazĩs
Before: et si point en un pendant contreual . li sarra
After:  et si point en un pendant contreual . li sarra
Before: zin tantost cõme il fu outre se reclostrent et
After:  zin tantost cõme il fu outre se reclostrent et
Before: corurent sus le roi qui demorez estoit si le pri
After:  corurent sus le roi qui demorez estoit si le pri
Before: strent ⁊ touz ceus qui auec lui estoient fors
After:  strent ⁊ touz ceus qui auec lui estoient fors
Before: seulement ceus qui larrere garde fesoient
After:  seulement ceus qui larrere garde fesoient
Before: qui sen eschaperent. quant li quens de tͥple
After:  qui sen eschaperent. quant li quens de tͥple
Before: ot point ⁊ il uit que 

In [None]:
# Show the size and a short preview only: printing the whole list floods the
# notebook output with one tuple per annotated line.
print(f'{len(extracted_data)} entries')
for entry in extracted_data[:5]:
    print(entry)

In [13]:
import os
import json
import random

def split_data(json_data, train_percentage=0.85, seed=None):
    """Assign a random train/val split to every entry, group by group.

    A key's group is the part of the key before its first underscore. Within
    each group the keys are shuffled; the first
    ``int(group_size * train_percentage)`` get ``"split": "train"`` and the
    remainder get ``"split": "val"``. Any existing ``"split"`` value is
    overwritten.

    Keys are bucketed by their exact group name. The previous
    ``key.startswith(group)`` test also matched keys of any other group
    whose name merely began with the same characters (e.g. group ``"a"``
    swallowing ``"ab_..."`` keys), so those entries were reassigned several
    times and the per-group ratio was not respected.

    Args:
        json_data: dict mapping string keys to mutable entry dicts. It is
            modified in place.
        train_percentage: Fraction of each group assigned to training.
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random`` state is
            used, as before.

    Returns:
        The same ``json_data`` object, with the ``"split"`` fields updated.
    """
    # Bucket keys by exact group name in a single pass; dict insertion order
    # keeps the processing order deterministic for a given input.
    groups = {}
    for key in json_data:
        groups.setdefault(key.split('_')[0], []).append(key)

    rng = random if seed is None else random.Random(seed)

    for group_keys in groups.values():
        train_size = int(len(group_keys) * train_percentage)

        # Shuffle so that the train/val membership is random within the group
        rng.shuffle(group_keys)

        for key in group_keys[:train_size]:
            json_data[key]["split"] = "train"
        for key in group_keys[train_size:]:
            json_data[key]["split"] = "val"

    return json_data

def process_directory(directory_path):
    """Re-split every ``.json`` file found under ``directory_path``.

    The directory tree is walked recursively. Each file whose name ends in
    ``.json`` is loaded, passed through ``split_data`` and written back to
    the same path with a two-space indent. Failures are reported on stdout
    and do not stop the walk.

    The new content is written to a sibling ``<name>.tmp`` file and then
    swapped in with ``os.replace``, so an error or interruption while
    writing can no longer leave a truncated annotation file behind.

    Args:
        directory_path: Root directory to scan.
    """
    for root, _dirs, files in os.walk(directory_path):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue

            file_path = os.path.join(root, file_name)
            tmp_path = file_path + '.tmp'
            print(f"Processing file: {file_path}")

            try:
                # Load JSON data
                with open(file_path, 'r', encoding='utf-8') as file:
                    json_data = json.load(file)

                # Apply the split_data function
                json_data = split_data(json_data)

                # Write the result next to the original, then replace the
                # original in one step once the write has fully succeeded.
                with open(tmp_path, 'w', encoding='utf-8') as file:
                    json.dump(json_data, file, indent=2)
                os.replace(tmp_path, file_path)

            except Exception as e:
                # Best effort: report the problem and continue with the
                # remaining files; the original file is left as it was.
                print(f"Error processing file {file_path}: {e}")
                try:
                    if os.path.exists(tmp_path):
                        os.remove(tmp_path)
                except OSError:
                    # Leftover temp file is harmless; nothing more to do.
                    pass

# Re-split every .json file under the current working directory. The files
# are rewritten in place; change './' to target a different directory.
process_directory('./')

Processing file: ./cremma/annotation.json
Processing file: ./textualis_formata/annotation.json
Processing file: ./south_north_textualis/annotation.json
