# Import

In [1]:
import os
from PIL import Image
import re
import shutil
import json
import random
import pandas as pd

# Folder renaming

In [2]:
# Root folder whose files and sub-directories are renamed by the code below.
base_path = 'VatLib.Borg.Copt'
# Substring that is removed from every file and directory name found under base_path.
to_replace = 'Borg.copt.'
def rename_files_and_dirs(base_path, substring=None):
    """Strip a substring from every file and directory name under ``base_path``.

    The tree is walked bottom-up so that the contents of a directory are
    renamed before the directory itself; renaming a parent first would leave
    the paths of its children stale.

    Args:
        base_path: Root of the tree to process. The root itself is not renamed.
        substring: Text to remove from each name. When omitted, the
            module-level ``to_replace`` value is used.

    Every rename is reported with ``print``. An entry is left untouched (and
    reported as skipped) when stripping the substring would give an empty
    name or a name that already exists, because ``os.rename`` may silently
    replace an existing destination.
    """
    target = to_replace if substring is None else substring
    if not target:
        # Nothing to strip; avoid "renaming" every entry onto itself.
        return

    for root, dirs, files in os.walk(base_path, topdown=False):
        # Files are handled before the directories of the same level.
        for label, names in (('file', files), ('directory', dirs)):
            for name in names:
                if target not in name:
                    continue

                new_name = name.replace(target, '')
                old_path = os.path.join(root, name)
                new_path = os.path.join(root, new_name)

                if not new_name or os.path.lexists(new_path):
                    print(f'Skipped {label}: {old_path} -> {new_path}')
                    continue

                os.rename(old_path, new_path)
                print(f'Renamed {label}: {old_path} -> {new_path}')
                
# Apply the renaming in place to the tree rooted at base_path.
rename_files_and_dirs(base_path)

# Dataset explorer

In [3]:
# Earlier idea, kept for reference: enumerate every expected page label explicitly.
# matching_strs = ["{:05d}{}".format(i, side) for i in range(1000) for side in ['r', 'v']]
# Matches three consecutive digits immediately followed by 'r' or 'v'.
# It is applied with .search(), so the match may sit anywhere in a filename.
# NOTE(review): presumably 'r'/'v' mark the recto/verso side of a page - confirm.
pattern = re.compile(r"[0-9]\d{2}[rv]")

In [None]:
# Destination roots: files whose name matches `pattern` go to the first one,
# every other file goes to the second one.
to_keep_dir = 'coptic_dataset'
to_delete_dir = 'trash'

for out_root in (to_keep_dir, to_delete_dir):
    os.makedirs(out_root, exist_ok=True)

# Source paths of the files copied to each destination.
to_keep = []
to_delete = []

for collection in os.listdir(base_path):
    collection_path = os.path.join(base_path, collection)

    # Only directories located directly under base_path are processed.
    if os.path.isdir(collection_path):
        for coll_path, _, files in os.walk(collection_path):
            # The folder layout relative to base_path is mirrored in the destination.
            relative_path = os.path.relpath(coll_path, base_path)

            for file in files:
                src_file = os.path.join(coll_path, file)
                keep = pattern.search(file) is not None

                dest_dir = os.path.join(to_keep_dir if keep else to_delete_dir, relative_path)
                (to_keep if keep else to_delete).append(src_file)

                # Created on demand, so folders without files are never mirrored.
                os.makedirs(dest_dir, exist_ok=True)
                shutil.copy2(src_file, os.path.join(dest_dir, file))

print(f"Copied {len(to_keep)} files to 'to_keep' directory.")
print(f"Copied {len(to_delete)} files to 'to_delete' directory.")

len(to_keep), len(to_delete)

# Create Annotations

In [6]:
# Root of the curated dataset; each sub-directory is one collection.
to_keep_dir = '../coptic_dataset'
# Keep directories only: a stray file in the root would make the per-collection
# listing below fail, and non-directory entries are skipped when the
# annotations are built as well.
collections = [
    entry for entry in os.listdir(to_keep_dir)
    if os.path.isdir(os.path.join(to_keep_dir, entry))
]
n_collections = len(collections)
n_collections

# Count files with os.walk (recursively) rather than os.listdir, so the total
# matches the number of image ids handed out when the annotations are built,
# which walks every collection the same way.
len_x_collection = {
    collection: sum(
        len(files)
        for _, _, files in os.walk(os.path.join(to_keep_dir, collection))
    )
    for collection in collections
}
n_images = sum(len_x_collection.values())
# Per-collection image counts as a table (Document, N images).
df = pd.DataFrame(list(len_x_collection.items()), columns=['Document', 'N images'])
# df.to_csv("collection_occurrences.csv", index=False)

n_images

6677

## Split on images

In [7]:
def split_dataset(n_images, split_percentages):
    """Randomly partition the ids ``0 .. n_images - 1`` into three groups.

    Args:
        n_images: Number of ids to distribute.
        split_percentages: Mapping with the fractions under the keys
            ``'train'`` and ``'val'``. Any other key is not read; the third
            group simply receives whatever is left over.

    Returns:
        A tuple ``(train_ids, val_ids, test_ids)`` of lists. The group sizes
        for train and val are the fractions of ``n_images`` truncated to int.

    The shuffle uses the module-level ``random`` state, so seed it beforehand
    for a repeatable result.
    """
    shuffled = list(range(n_images))
    random.shuffle(shuffled)

    # Slice boundaries inside the shuffled id list.
    train_end = int(n_images * split_percentages['train'])
    val_end = train_end + int(n_images * split_percentages['val'])

    return shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]

# Fraction of the images assigned to each split.
split_percentages = dict(train=0.6, val=0.1, test=0.3)

# Seed the generator first so the shuffle inside split_dataset is repeatable.
random.seed(42)

train_ids, val_ids, test_ids = split_dataset(n_images, split_percentages)
# Size of each split, followed by the number of distinct ids over all three.
len(train_ids), len(val_ids), len(test_ids), len({*train_ids, *val_ids, *test_ids})

(4006, 667, 2004, 6677)

In [10]:
annotation_path = 'annotations/coptic_dataset.json'
image_id = 0

data = {
    'images': []
}

# Sets give O(1) membership tests; the id lists are scanned once per image otherwise.
train_id_set = set(train_ids)
val_id_set = set(val_ids)
test_id_set = set(test_ids)

# os.listdir() and os.walk() yield entries in arbitrary order. Everything is
# sorted so that the id -> file assignment (and therefore the split of each
# image) is the same on every machine, which the fixed random seed alone does
# not guarantee.
# NOTE(review): ids may differ from an annotation file produced before sorting
# was introduced - regenerate dependent artefacts if needed.
for collection in sorted(os.listdir(to_keep_dir)):
    collection_path = os.path.join(to_keep_dir, collection)

    if not os.path.isdir(collection_path):
        continue  # Skip non-directory entries

    for coll_path, subdirs, files in os.walk(collection_path):
        # Sorting in place also fixes the order in which sub-folders are visited.
        subdirs.sort()
        for file in sorted(files):
            if image_id in train_id_set:
                split = 'train'
            elif image_id in val_id_set:
                split = 'val'
            elif image_id in test_id_set:
                split = 'test'
            else:
                # More files on disk than ids handed out by the split.
                raise ValueError(
                    f"Image id {image_id} ({os.path.join(coll_path, file)}) "
                    "does not belong to any split"
                )

            data['images'].append(
                {
                    'id': image_id,
                    'filename': file,
                    'collection': collection,
                    # Relative to to_keep_dir; equals collection/file for a flat
                    # collection and stays correct for nested folders.
                    'filepath': os.path.relpath(os.path.join(coll_path, file), to_keep_dir),
                    'split': split
                }
            )

            image_id += 1


# Make sure the output folder exists before writing.
annotation_dir = os.path.dirname(annotation_path)
if annotation_dir:
    os.makedirs(annotation_dir, exist_ok=True)

with open(annotation_path, 'w') as f:
    json.dump(data, f)

# Dataset statistics

In [2]:
# Read the annotation file back from disk; the statistics below work on this object.
annotation_path = 'annotations/coptic_dataset.json'
with open(annotation_path, 'r') as f:
    # The code below reads data['images'], a list of dicts with at least the
    # keys 'filename', 'collection' and 'split'.
    data = json.load(f)

In [3]:
import re

# Page label such as "_0001r": an underscore, four digits and a side letter,
# which must be followed by "_" or "." (checked by the lookahead, not consumed).
_PAGE_SIDE_RE = re.compile(r'_(\d{4})([rv])(?=[_.])')
# Number of characters in front of the page label that are part of the grouping key.
_KEY_PREFIX_LEN = 3


def find_recto_verso_pairs_single_collection(filenames):
    """Pair up the 'r' and 'v' files that share the same page label.

    Two names belong to the same page when they agree on the (up to) three
    characters preceding the page label plus the label's digits; only the
    side letter may differ.

    Args:
        filenames: Iterable of file names. Names without a page label are
            ignored. If several names share a page and a side, the last one
            wins.

    Returns:
        A list of ``(r_name, v_name)`` tuples, one per page for which both
        sides are present, in order of first appearance of the page.
    """
    page_map = {}

    for name in filenames:
        match = _PAGE_SIDE_RE.search(name)
        if match is None:
            continue

        side = match.group(2)
        # Clamp at 0: a negative start would wrap around to the end of the
        # string and collapse unrelated short-prefixed names onto one key.
        key_start = max(0, match.start() - _KEY_PREFIX_LEN)
        # The end excludes the side letter so both sides share the key.
        key = name[key_start:match.end() - 1]
        page_map.setdefault(key, {})[side] = name

    return [
        (sides['r'], sides['v'])
        for sides in page_map.values()
        if 'r' in sides and 'v' in sides
    ]

def find_recto_verso_pairs(data):
    """Return the recto/verso pairs of all collections as index pairs.

    Args:
        data: Annotation dict; ``data['images']`` is a list of dicts with the
            keys ``'collection'`` and ``'filename'``.

    Returns:
        A list of ``(r_index, v_index)`` tuples, where each index is the
        position of the image inside ``data['images']``. Pairs are only formed
        between files of the same collection.
    """
    filenames_by_collection = {}
    index_by_collection = {}
    for idx, img in enumerate(data['images']):
        collection = img['collection']
        filenames_by_collection.setdefault(collection, []).append(img['filename'])
        # The lookup is kept per collection: the same filename appearing in two
        # collections must not resolve to the wrong image.
        index_by_collection.setdefault(collection, {})[img['filename']] = idx

    index_pairs = []
    for collection, names in filenames_by_collection.items():
        lookup = index_by_collection[collection]
        for recto, verso in find_recto_verso_pairs_single_collection(names):
            index_pairs.append((lookup[recto], lookup[verso]))

    return index_pairs

In [4]:
def get_statistics(data, split=None, include_train_couples=False, include_val_couples=False, verbose=True):
    """Count same-collection and different-collection image couples in a split.

    A couple ``(i, j)`` with ``i < j`` refers to positions in
    ``data['images']``. It is positive when both images have the same
    ``'collection'`` and negative otherwise.

    Args:
        data: Annotation dict whose ``'images'`` entries carry the keys
            ``'collection'`` and ``'split'``.
        split: Split name, or a list/tuple/set of split names, to take images
            from. ``None`` selects every image.
        include_train_couples: Also draw images from ``'train'`` when forming
            couples, then drop the couples made of supplementary-split images
            only, and drop recto/verso couples.
        include_val_couples: Only read when ``include_train_couples`` is true;
            adds ``'val'`` to the supplementary splits.
        verbose: When true, print the counts and return ``None``. When false,
            print nothing and return the couples.

    Returns:
        ``None`` if ``verbose`` is true, otherwise the tuple
        ``(positive_pairs, negative_pairs, data)``.

    NOTE(review): recto/verso couples are removed only when
    ``include_train_couples`` is true, as in the original code - confirm this
    is intended for the plain train statistics as well.
    """
    from itertools import combinations

    # Normalise to a fresh list so a caller-supplied list is never modified.
    if split is None:
        splits = None  # no filtering
    elif isinstance(split, (list, tuple, set, frozenset)):
        splits = list(split)
    else:
        splits = [split]

    if include_train_couples:
        splits_supp = ['train']
        if include_val_couples:
            splits_supp.append('val')
        # Couples that exist within the supplementary splits alone.
        train_positive_couples, train_negative_couples, _ = get_statistics(
            data, split=splits_supp, include_train_couples=False, verbose=False
        )
        if splits is not None:
            splits = splits + splits_supp

    images = data['images']
    categories = [img['collection'] for img in images]
    # Positions of the images that take part, in ascending order.
    selected = [
        idx for idx, img in enumerate(images)
        if splits is None or img['split'] in splits
    ]

    # One pass over the unordered couples of selected images.
    positive_pairs = []
    negative_pairs = []
    for i, j in combinations(selected, 2):
        if categories[i] == categories[j]:
            positive_pairs.append((i, j))
        else:
            negative_pairs.append((i, j))

    if include_train_couples:
        positive_pairs = list(set(positive_pairs) - set(train_positive_couples))
        negative_pairs = list(set(negative_pairs) - set(train_negative_couples))

        # deleting recto and verso couples
        positive_pairs = list(set(positive_pairs) - set(find_recto_verso_pairs(data)))

    # Total number of images in data, independent of the selected split.
    n_images = len(images)

    if verbose:
        print(f"N. images: {n_images}")
        print(f"N. positives: {len(positive_pairs)}")
        print(f"N. negatives: {len(negative_pairs)}")
        print(f"N. Total {len(positive_pairs) + len(negative_pairs)}")
        return None

    return positive_pairs, negative_pairs, data
    

# Report every split in turn. Validation also draws on the train images, and
# test draws on both train and val, when couples are formed.
for banner, split_name, with_train, with_val in (
    ("****Train****", 'train', False, False),
    ("****Val****", 'val', True, False),
    ("****Test****", 'test', True, True),
):
    print(banner)
    get_statistics(data, split_name, include_train_couples=with_train, include_val_couples=with_val)

****Train****
N. images: 6677
N. positives: 592694
N. negatives: 7429321
N. Total 8022015
****Val****
N. images: 6677
N. positives: 220540
N. negatives: 2673363
N. Total 2893903
****Test****
N. images: 6677
N. positives: 855307
N. negatives: 10515575
N. Total 11370882


In [5]:
# Accumulate the couples of all three splits and print the overall counts.
total_poss = []
total_negs = []
for split in ('train', 'val', 'test'):
    poss, negs, _ = get_statistics(
        data,
        split,
        # Train stands alone; val adds train, test adds train and val.
        include_train_couples=(split != 'train'),
        include_val_couples=(split == 'test'),
        verbose=False,
    )
    total_poss.extend(poss)
    total_negs.extend(negs)
print("****Total****")
print(f"N. positives {len(total_poss)}")
print(f"N. negatives {len(total_negs)}")

****Total****
N. positives 1668541
N. negatives 20618259
