### **Match the Copiale competition images to their decryptions**

This file does the following tasks:
- Map the new manuscript image names to their old names; the old names encode the page and line number in the original manuscript
- Find the decryption of each of these lines in the corresponding .txt file of the deciphered manuscript
- Add these decryptions to the manuscript JSON file, so that it unifies the transcription, the decryption and the Copiale font translations

In [8]:
import json
import csv
import re
from typing import Dict, List, Optional

def parse_manuscript_file(txt_file_path: str) -> Dict[int, List[str]]:
    """
    Read the deciphered manuscript and group its text lines by page.

    A line matching "## PAGE <n>" opens page <n>. Every later line that is
    neither blank nor a lone "#" is stored, whitespace-stripped, under the
    most recently opened page. Lines that appear before the first page
    marker are dropped, and a repeated marker restarts that page's list.

    Args:
        txt_file_path: Path to the manuscript .txt file

    Returns:
        Dictionary with page numbers as keys and lists of lines as values
    """
    page_marker = re.compile(r'##\s*PAGE\s+(\d+)')
    lines_by_page = {}
    active_page = None

    with open(txt_file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            marker = page_marker.match(text)
            if marker is not None:
                # New page header (e.g. "## PAGE 57"): open a fresh bucket
                active_page = int(marker.group(1))
                lines_by_page[active_page] = []
            elif text and text != '#' and active_page is not None:
                # Real content line belonging to the page currently open
                lines_by_page[active_page].append(text)

    return lines_by_page

def load_csv_mapping(csv_file_path: str) -> Dict[str, str]:
    """
    Load the CSV file that maps new image names to old names.

    The first 1024 characters are handed to ``csv.Sniffer`` to guess whether
    the file starts with a header row. If it does, the columns are located
    by looking for headers containing "new"/"name" and "old"/"name"
    (case-insensitive). Otherwise, or when those headers cannot be found,
    the first two columns are taken as (new_name, old_name).

    If the sniffer cannot analyse the sample at all (for example an empty
    file), the file is treated as header-less instead of raising.

    Args:
        csv_file_path: Path to the CSV mapping file

    Returns:
        Dictionary mapping new names (always ending in ".png") to old names
    """
    mapping = {}

    with open(csv_file_path, 'r', encoding='utf-8') as file:
        # Peek at the start of the file to detect the CSV layout, then rewind
        sample = file.read(1024)
        file.seek(0)

        try:
            has_header = csv.Sniffer().has_header(sample)
        except csv.Error:
            # Sniffer raises when it cannot determine a delimiter (e.g. an
            # empty file); fall back to the positional default below.
            has_header = False

        reader = csv.reader(file)

        # Default: first two columns are new_name, old_name
        new_name_idx, old_name_idx = 0, 1

        if has_header:
            headers = next(reader, [])
            found_new = None
            found_old = None

            for i, header in enumerate(headers):
                lowered = header.lower()
                if 'name' not in lowered:
                    continue
                if 'new' in lowered:
                    found_new = i
                elif 'old' in lowered:
                    found_old = i

            if found_new is None or found_old is None:
                print("Warning: Could not find 'new_name' and 'old_name' columns in headers")
                print(f"Headers found: {headers}")
            else:
                new_name_idx, old_name_idx = found_new, found_old

        needed_columns = max(new_name_idx, old_name_idx)
        for row in reader:
            # Ignore rows too short to contain both columns
            if len(row) <= needed_columns:
                continue

            new_name = row[new_name_idx].strip()
            old_name = row[old_name_idx].strip()

            # Ensure new_name has .png extension
            if not new_name.endswith('.png'):
                new_name += '.png'

            mapping[new_name] = old_name

    return mapping

def parse_old_name(old_name: str) -> Optional[tuple]:
    """
    Split an old image name such as "58_12" into its page and line numbers.

    Args:
        old_name: String in format "page_line"

    Returns:
        Tuple of (page_number, line_number), or None when the name does not
        consist of exactly two underscore-separated integers
    """
    pieces = old_name.split('_')
    if len(pieces) != 2:
        return None

    page_text, line_text = pieces
    try:
        return int(page_text), int(line_text)
    except ValueError:
        # One of the two parts is not an integer
        return None

def get_line_from_page(pages: Dict[int, List[str]], page_num: int, line_num: int) -> Optional[str]:
    """
    Look up one line of text by page number and position within the page.

    Args:
        pages: Dictionary of pages and their lines
        page_num: Page number
        line_num: Line number (0-indexed)

    Returns:
        The line text, or None if the page is unknown or the index is
        outside the page's list of lines
    """
    if page_num in pages:
        candidates = pages[page_num]
        # Negative indices are rejected rather than wrapping around
        if not (line_num < 0 or line_num >= len(candidates)):
            return candidates[line_num]

    return None

def add_plaintext_to_json(json_file_path: str, csv_file_path: str, txt_file_path: str, output_file_path: str):
    """
    Attach the deciphered text line to every image entry of the JSON dataset.

    For each image name in the JSON file, the CSV mapping gives its old
    "page_line" name, and that page/line pair is looked up in the parsed
    manuscript text. The result is stored under the entry's 'plaintext'
    key. Entries with no mapping, an unparsable old name or no matching
    line are reported on stdout and left unchanged. The dataset is then
    written to the output path and a summary of the counts is printed.

    Args:
        json_file_path: Path to the existing JSON file
        csv_file_path: Path to the CSV mapping file
        txt_file_path: Path to the manuscript text file
        output_file_path: Path where to save the updated JSON
    """
    print("Loading and parsing files...")

    # Inputs are read in a fixed order: JSON dataset, manuscript text, CSV mapping
    with open(json_file_path, 'r', encoding='utf-8') as source:
        data = json.load(source)

    pages = parse_manuscript_file(txt_file_path)
    print(f"Parsed {len(pages)} pages from manuscript")

    mapping = load_csv_mapping(csv_file_path)
    print(f"Loaded {len(mapping)} mappings from CSV")

    processed_count = 0
    missing_count = 0
    error_count = 0

    for image_name, entry in data.items():
        # No CSV row for this image
        if image_name not in mapping:
            print(f"Warning: No mapping found for {image_name}")
            missing_count += 1
            continue

        old_name = mapping[image_name]
        parsed = parse_old_name(old_name)
        if not parsed:
            print(f"Error: Could not parse old_name '{old_name}' for {image_name}")
            error_count += 1
            continue

        page_num, line_num = parsed
        plaintext = get_line_from_page(pages, page_num, line_num)
        if plaintext is None:
            print(f"Warning: Could not find line {line_num} on page {page_num} for {image_name}")
            missing_count += 1
            continue

        entry['plaintext'] = plaintext
        processed_count += 1

    # Keep non-ASCII characters readable in the output file
    with open(output_file_path, 'w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)

    print(f"\nProcessing complete!")
    print(f"Successfully processed: {processed_count}")
    print(f"Missing/not found: {missing_count}")
    print(f"Errors: {error_count}")
    print(f"Updated JSON saved to: {output_file_path}")

In [9]:
def main():
    """
    Run the decryption-matching pipeline on the example dataset locations.

    Update the file paths below to match your actual files. Any exception
    raised by the pipeline is printed rather than propagated.
    """
    # File paths - update these to match your actual file locations
    paths = {
        # Your existing JSON file
        "json_file_path": "/home/moliveros/Datasets/copialeManuscript/copiale_dataset.json",
        # Your CSV mapping file
        "csv_file_path": "/home/moliveros/Datasets/copiale-decryptedText/task2_names.csv",
        # Your manuscript text file
        "txt_file_path": "/home/moliveros/Datasets/copiale-deciphered.txt",
        # Output file
        "output_file_path": "/home/moliveros/Datasets/copialeManuscriptWithDecryption/copiale_dataset_with_decryption.json",
    }

    try:
        add_plaintext_to_json(**paths)
    except FileNotFoundError as missing:
        print(f"Error: File not found - {missing}")
        print("Please update the file paths in the main() function to match your actual files.")
    except Exception as failure:
        print(f"Error: {failure}")

# Run the example pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Loading and parsing files...
Parsed 105 pages from manuscript
Loaded 4985 mappings from CSV

Processing complete!
Successfully processed: 1493
Missing/not found: 9
Errors: 0
Updated JSON saved to: /home/moliveros/Datasets/copialeManuscriptWithDecryption/copiale_dataset_with_decryption.json
