### Package Importer

In [None]:
# Install the third-party packages imported below (torch and torchvision are not part of this install line).
!pip install requests numpy matplotlib tqdm beautifulsoup4 Pillow

# Standard library
import base64
import json
import os
import random
import shutil
import zipfile
from io import BytesIO

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
import torchvision.transforms.functional as TF
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw, ImageFont
from PIL import ImageEnhance
from torchvision import transforms
# Order matters: the plain tqdm import comes last, so it is the one bound to `tqdm`.
from tqdm.notebook import tqdm
from tqdm import tqdm

# Colab runtime
from google.colab import drive, files

# Run on the GPU when the runtime exposes one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# The dataset folders used throughout the notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Using device: cpu
Mounted at /content/drive


# **Data Loading Functions**

## 1. Indus Script Acquisition

Automates the retrieval and processing of Indus script sign data  
- Downloads an HTML dataset from a GitHub repository and parses it using BeautifulSoup.  
- Extracts sign codes and corresponding images from a structured table.  
- Keeps every sign that has an image in the table; no occurrence-frequency filter is applied (the earlier threshold of > 2 is not used by the code below).  
- Authenticates and downloads images from a restricted-access database.  
- Saves the processed dataset and logs key metadata for verification.  


In [None]:
def ensure_dir(directory):
    """Make sure ``directory`` exists and report whether it had to be created.

    Args:
        directory: Path of the folder that must exist after the call; missing
            parent folders are created as well.
    """
    if os.path.exists(directory):
        print(f"Directory already exists: {directory}")
        return

    os.makedirs(directory)
    print(f"Created directory: {directory}")

# Destination folder on Drive for the Indus sign table and images.
output_dir = "/content/drive/MyDrive/script_analysis/indus"
ensure_dir(output_dir)

repo_owner = "oohalakkadi"
repo_name = "ivc2tyc"
branch = "main"
file_path = "reference/indus/ICIT.html"

# Seconds to wait on a single HTTP request, so a stalled server cannot hang the cell.
INDUS_REQUEST_TIMEOUT = 30

raw_url = f"https://raw.githubusercontent.com/{repo_owner}/{repo_name}/{branch}/{file_path}"
print(f"Fetching HTML file from: {raw_url}")

response = requests.get(raw_url, timeout=INDUS_REQUEST_TIMEOUT)
if response.status_code != 200:
    print(f"Failed to fetch HTML file. Status code: {response.status_code}")
    print(f"Response: {response.text}")
    raise Exception("Could not download HTML file from GitHub")

# Keep a local copy of the sign table next to the images.
local_html_path = os.path.join(output_dir, "ICIT.html")
with open(local_html_path, "wb") as file:
    file.write(response.content)
print(f"Downloaded HTML file to: {local_html_path}")

base_url = "https://www.indus.epigraphica.de/"

# Authentication details for the Indus website.
# The defaults keep the notebook's previous behavior; set ICIT_USERNAME /
# ICIT_PASSWORD in the environment to override them without editing the notebook.
AUTH_USERNAME = os.environ.get("ICIT_USERNAME", "icit")
AUTH_PASSWORD = os.environ.get("ICIT_PASSWORD", "seal123")
AUTH_HEADER = {
    "Authorization": "Basic " + base64.b64encode(f"{AUTH_USERNAME}:{AUTH_PASSWORD}".encode()).decode()
}

print("Parsing HTML file...")
with open(local_html_path, "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, "html.parser")

# The first <tr> is skipped (treated as the header row).
rows = soup.find_all("tr")[1:]
print(f"Found {len(rows)} rows in the table")

sign_data = []
for row in rows:
    cols = row.find_all("td")

    # Rows with fewer than 12 cells are not sign entries.
    if len(cols) < 12:
        continue

    sign_code = cols[0].text.strip()

    # Column 6 holds the sign image; skip rows with no <img> or an <img> lacking src
    # (indexing img_tag["src"] directly would raise KeyError on the latter).
    img_tag = cols[6].find("img")
    img_src = img_tag.get("src") if img_tag else None
    if not img_src:
        continue

    # Include all signs regardless of frequency
    img_url = base_url + img_src
    sign_data.append((sign_code, img_url))

print(f"Found {len(sign_data)} total signs")

print(f"Downloading {len(sign_data)} images...")

# One session so the auth header is sent with every image request.
session = requests.Session()
session.headers.update(AUTH_HEADER)

failed_downloads = []
for sign_code, img_url in tqdm(sign_data, desc="Downloading"):
    img_filename = os.path.join(output_dir, f"{sign_code}.jpg")

    # A network error on one image is recorded and skipped instead of aborting the run.
    # stream=True is omitted because the whole body is read via .content anyway.
    try:
        response = session.get(img_url, timeout=INDUS_REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Failed to download: {img_url} - {exc}")
        failed_downloads.append(img_url)
        continue

    if response.status_code == 200:
        with open(img_filename, "wb") as img_file:
            img_file.write(response.content)
    else:
        print(f"Failed to download: {img_url} - Status {response.status_code}")
        failed_downloads.append(img_url)

# Only claim full success when nothing failed.
if failed_downloads:
    print(f"Download finished with {len(failed_downloads)} failure(s). Images are saved in: {output_dir}")
else:
    print("Download complete! All images are saved in:", output_dir)

# Verification: count the .jpg files now present in the output folder.
downloaded_files = [f for f in os.listdir(output_dir) if f.endswith('.jpg')]
print(f"Downloaded {len(downloaded_files)} image files")
if downloaded_files:
    print(f"Sample files: {', '.join(downloaded_files[:5])}" +
          (f"... and {len(downloaded_files)-5} more" if len(downloaded_files) > 5 else ""))

Created directory: /content/drive/MyDrive/script_analysis/indus
Fetching HTML file from: https://raw.githubusercontent.com/oohalakkadi/signaturework/main/data/indus/ICIT.html
Downloaded HTML file to: /content/drive/MyDrive/script_analysis/indus/ICIT.html
Parsing HTML file...
Found 715 rows in the table
Found 715 total signs
Downloading 715 images...


Downloading:   0%|          | 0/715 [00:00<?, ?it/s]

Download complete! All images are saved in: /content/drive/MyDrive/script_analysis/indus
Downloaded 715 image files
Sample files: 1.jpg, 2.jpg, 3.jpg, 4.jpg, 5.jpg... and 710 more


## 2. Proto-Cuneiform Script Acquisition



Retrieves the standardized Proto-Cuneiform sign corpus from the CDLI GitHub repository.

- **Repository**: cdli-gh/proto-cuneiform_signs
- **Operation**: Clone repository and extract JPG images from archsigns directory
- **Destination**: Google Drive at /content/drive/MyDrive/script_analysis/proto_cuneiform/

Establishes the Proto-Cuneiform dataset.

In [None]:
!git clone https://github.com/cdli-gh/proto-cuneiform_signs.git
!mkdir -p /content/drive/MyDrive/script_analysis/proto_cuneiform/
!cp proto-cuneiform_signs/archsigns/*.jpg /content/drive/MyDrive/script_analysis/proto_cuneiform/

## 3. Proto-Elamite Script Acquisition

Retrieves the Proto-Elamite sign corpus from the SFU Natural Language Lab repository.

- **Repository**: sfu-natlang/pe-decipher-toolkit
- **Operation**: Clone repository and extract PNG images from both main forms and numerical signs, sourced from CDLI
- **Sources**: PE_mainforms directory (standard signs) and PE_num directory (numerical notation)
- **Destination**: Google Drive at /content/drive/MyDrive/script_analysis/proto_elamite/

Establishes the Proto-Elamite dataset containing both standard and numerical signs.

In [None]:
!git clone https://github.com/sfu-natlang/pe-decipher-toolkit.git
!mkdir -p /content/drive/MyDrive/script_analysis/proto_elamite/
!cp pe-decipher-toolkit/pngs/PE_mainforms/*.png /content/drive/MyDrive/script_analysis/proto_elamite/
!cp pe-decipher-toolkit/pngs/PE_num/*.png /content/drive/MyDrive/script_analysis/proto_elamite/

## 4. Naxi Dongba Script Acquisition and Generation


Implementation of font-based character rendering pipeline for Naxi Dongba script dataset creation.

- **Source**: BabelStoneNaxiLLC.ttf font file containing standardized Dongba glyphs
- **Unicode Range**: U+E000 to U+E849 (2,122 Dongba characters in Private Use Area)
- **Rendering Process**:
  - Character-by-character rendering with centered positioning
  - Consistent 224×224px dimensions for neural network compatibility
  - PNG format with lossless compression
- **Visual Verification**: Sample character display with Unicode code point labeling
- **Naming Convention**: Unicode-based filenames (dongba_{hex_code}.png)

Creates programmatically generated Naxi Dongba glyph dataset with standardized rendering parameters for script comparison analysis.

In [None]:
def ensure_dir(directory):
    """Create ``directory`` (and any missing parents) unless it is already there.

    Prints one line saying whether the folder was created or already existed.

    Args:
        directory: Path of the folder that must exist after the call.
    """
    if os.path.exists(directory):
        print(f"Directory already exists: {directory}")
        return

    os.makedirs(directory)
    print(f"Created directory: {directory}")

def download_font_from_github(output_path):
    """Fetch BabelStoneNaxiLLC.ttf from the project's GitHub repo and save it locally.

    Args:
        output_path: Local file path the downloaded font bytes are written to.

    Returns:
        ``output_path``, once the file has been written.

    Raises:
        Exception: If the server answers with any status code other than 200.
    """
    owner, repo, ref = "oohalakkadi", "ivc2tyc", "main"
    font_path = "reference/tyc/naxi/BabelStoneNaxiLLC.ttf"

    raw_url = f"https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{font_path}"
    print(f"Downloading font from: {raw_url}")

    response = requests.get(raw_url)
    if response.status_code == 200:
        with open(output_path, "wb") as font_file:
            font_file.write(response.content)
        print(f"Downloaded font to: {output_path}")
        return output_path

    print(f"Failed to fetch font file. Status code: {response.status_code}")
    raise Exception("Could not download font file from GitHub")

def char_to_image(char, font, image_size=224, bg_color=(255, 255, 255), text_color=(0, 0, 0)):
    """Render a single character centered on a square RGB canvas.

    Args:
        char: The character (string) to draw.
        font: Font object providing ``getbbox`` and accepted by ``ImageDraw.text``.
        image_size: Width and height of the canvas in pixels.
        bg_color: RGB tuple used to fill the canvas.
        text_color: RGB tuple used to draw the character.

    Returns:
        The newly created image with the character drawn on it.
    """
    canvas = Image.new('RGB', (image_size, image_size), color=bg_color)

    # Center the glyph's bounding box; subtracting left/top compensates for the
    # box's offset from the drawing origin.
    left, top, right, bottom = font.getbbox(char)
    x = (image_size - (right - left)) // 2 - left
    y = (image_size - (bottom - top)) // 2 - top

    ImageDraw.Draw(canvas).text((x, y), char, fill=text_color, font=font)
    return canvas


def create_dongba_dataset(font_path, output_dir, start_code=0xE000, end_code=0xE849, image_size=224):
    """Render every code point in a range with the given font and save one PNG per character.

    Files are named ``dongba_{HEX}.png`` (upper-case hex code point). A preview
    of five evenly spaced characters is shown before the images are written.

    Args:
        font_path: Path to the TTF font used for rendering.
        output_dir: Folder the PNG files are written to (created if missing).
        start_code: First code point to render (inclusive).
        end_code: Last code point to render (inclusive).
        image_size: Width and height of each image in pixels; the font size is
            70% of this value.

    Returns:
        ``output_dir`` after generation, or ``None`` if the font could not be loaded.
    """
    ensure_dir(output_dir)

    font_size = int(image_size * 0.7)
    try:
        font = ImageFont.truetype(font_path, font_size)
        print(f"Font loaded successfully: {font_path}")
    except Exception as e:
        print(f"Error loading font {font_path}: {e}")
        return

    char_list = [chr(code) for code in range(start_code, end_code + 1)]
    print(f"Will generate images for {len(char_list)} characters (U+{start_code:04X} to U+{end_code:04X})")

    # An empty range would make the preview index into an empty list.
    if not char_list:
        print("No characters in the requested range; nothing to generate.")
        return output_dir

    # Preview: five characters spread evenly across the range.
    plt.figure(figsize=(15, 3))
    sample_indices = np.linspace(0, len(char_list)-1, 5, dtype=int)

    for i, idx in enumerate(sample_indices):
        char = char_list[idx]
        img = char_to_image(char, font, image_size=image_size)
        plt.subplot(1, 5, i+1)
        plt.imshow(np.array(img))
        plt.title(f"U+{ord(char):04X}")
        plt.axis('off')

    plt.tight_layout()
    plt.show()

    # Count only the images actually written, so the summary stays truthful
    # when individual characters fail.
    saved_count = 0
    for char in tqdm(char_list, desc="Generating images"):
        try:
            filename = f"dongba_{ord(char):04X}.png"
            img = char_to_image(char, font, image_size=image_size)
            img.save(os.path.join(output_dir, filename))
            saved_count += 1
        except Exception as e:
            print(f"Error processing character U+{ord(char):04X}: {e}")

    print(f"Created {saved_count} character images in {output_dir}")
    return output_dir


# Destination folder on Drive for the rendered Dongba glyphs.
output_dir = '/content/drive/MyDrive/script_analysis/naxi_dongba/'
ensure_dir(output_dir)

# The font is downloaded to the local runtime disk, not to Drive.
temp_font_path = '/content/BabelStoneNaxiLLC.ttf'
downloaded_font_path = download_font_from_github(temp_font_path)

create_dongba_dataset(
    font_path=downloaded_font_path,
    output_dir=output_dir,
    start_code=0xE000,
    end_code=0xE849,
    image_size=224
)

# Count generated files as verification
generated_files = [f for f in os.listdir(output_dir) if f.startswith('dongba_') and f.endswith('.png')]
print(f"\nGeneration complete! {len(generated_files)} images saved to: {output_dir}")

# Force sync to Google Drive by creating and removing a marker file.
# NOTE(review): whether touching a file actually triggers a Drive flush is not
# verifiable from this notebook - confirm.
print("Syncing files to Google Drive...")
sync_marker = '/content/drive/MyDrive/__force_sync__'
try:
    with open(sync_marker, 'w') as f:
        f.write('')
    os.remove(sync_marker)
except OSError as e:
    # Best effort: a failed marker write must not fail the cell, but it should
    # be visible rather than swallowed by a bare except.
    print(f"Warning: could not write sync marker: {e}")
print("Sync complete.")

## 5. Old Naxi (Dongba) Script Acquisition


Code is in **old_naxi.ipynb**.

## 6. TYC Scripts Acquisition

### Ba-Shu Dataset Acquisition (symbols transcribed from Sanxingdui Museum)

Fetches the Ba-Shu script dataset from GitHub and stores it on Google Drive.  

In [None]:
def ensure_dir(directory):
    """Create ``directory`` (with parents) when it is missing, then report it as ready.

    Args:
        directory: Path of the folder that must exist after the call.
    """
    already_present = os.path.exists(directory)
    if not already_present:
        os.makedirs(directory)

    print(f"Directory ready: {directory}")

# Destination folder on Drive for the Ba-Shu symbol images.
drive_bashu_dir = '/content/drive/MyDrive/script_analysis/ba-shu/'
ensure_dir(drive_bashu_dir)

repo_owner = "oohalakkadi"
repo_name = "ivc2tyc"
branch = "main"
folder_path = "reference/tyc/ba-shu"

# Seconds to wait on a single HTTP request, so a stalled server cannot hang the cell.
BASHU_REQUEST_TIMEOUT = 30

# The GitHub contents API returns one JSON entry per item in the folder.
api_url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/contents/{folder_path}?ref={branch}"
print(f"Fetching file list from: {api_url}")

response = requests.get(api_url, timeout=BASHU_REQUEST_TIMEOUT)
if response.status_code != 200:
    print(f"Failed to fetch repository contents. Status code: {response.status_code}")
    print(f"Response: {response.text}")
else:
    # Named `repo_entries` rather than `files`, which would overwrite the
    # `files` module imported from google.colab.
    repo_entries = response.json()
    print(f"Found {len(repo_entries)} files in the repository folder")

    for entry in repo_entries:
        # Skip sub-directories and other non-file entries.
        if entry['type'] != 'file':
            continue

        file_url = entry['download_url']
        file_name = entry['name']

        print(f"Downloading: {file_name}")

        file_response = requests.get(file_url, timeout=BASHU_REQUEST_TIMEOUT)
        if file_response.status_code == 200:
            destination_path = os.path.join(drive_bashu_dir, file_name)
            with open(destination_path, 'wb') as f:
                f.write(file_response.content)
            print(f"Saved: {file_name}")
        else:
            print(f"Failed to download {file_name}. Status code: {file_response.status_code}")

    # Create and remove a marker file, intended to nudge Drive into syncing.
    # NOTE(review): confirm this has any effect. Failures here are reported but
    # must not discard an otherwise successful download.
    print("Syncing files to Google Drive...")
    sync_marker = '/content/drive/MyDrive/__force_sync__'
    try:
        with open(sync_marker, 'w') as f:
            f.write('')
        os.remove(sync_marker)
    except OSError as e:
        print(f"Warning: could not write sync marker: {e}")

    # Verification: list whatever is now present in the destination folder.
    downloaded_files = os.listdir(drive_bashu_dir)
    print(f"\nDownload complete! {len(downloaded_files)} files saved in: {drive_bashu_dir}")
    print(f"Files: {', '.join(downloaded_files[:5])}" +
          (f"... and {len(downloaded_files)-5} more" if len(downloaded_files) > 5 else ""))

### Classical Yi Data Acquisition

Downloads, extracts, and stores Yi script images  
- Retrieves a ZIP dataset from source.
- Extracts one randomly chosen PNG file from each subfolder of the archive.

In [None]:
# Failures in this cell raise instead of calling exit(1): an exception stops the
# cell (and Run All) with a traceback, whereas exit() is meant for terminating
# the interpreter itself.
drive.mount('/content/drive')
if not os.path.exists('/content/drive/MyDrive'):
    raise RuntimeError("ERROR: Google Drive was not properly mounted!")
print("Google Drive successfully mounted")

yi_dir = "/content/drive/MyDrive/script_analysis/yi/"
os.makedirs(yi_dir, exist_ok=True)
print(f"Directory created or already exists at: {yi_dir}")

url = "https://download.scidb.cn/download?fileId=8f5feddd6987cf95a887c80e6de44c0a&path=/V2/data.zip&fileName=data.zip"

# Seconds to wait for the server to respond / send data before giving up.
YI_REQUEST_TIMEOUT = 60
# Fixed seed so the same PNG is picked from each subfolder on every run.
YI_SAMPLE_SEED = 42

print("Downloading ZIP file...")
# The whole archive is read into memory below, so streaming is not requested.
response = requests.get(url, timeout=YI_REQUEST_TIMEOUT)
response.raise_for_status()
print("Download successful")

yi_rng = random.Random(YI_SAMPLE_SEED)

# The context manager closes the archive once extraction is done.
with zipfile.ZipFile(BytesIO(response.content)) as zip_file:
    all_files = zip_file.namelist()
    print(f"ZIP contains {len(all_files)} files")
    print(f"First 5 files: {all_files[:5] if all_files else 'No files found'}")

    print("Extracting one PNG per subfolder...")
    # Group PNG members by the archive folder that contains them.
    subfolder_files = {}
    for member in all_files:
        if member.lower().endswith('.png'):
            subfolder_files.setdefault(os.path.dirname(member), []).append(member)

    print(f"Found {len(subfolder_files)} folders with PNG files")

    selected_count = 0
    for folder, members in subfolder_files.items():
        chosen_file = yi_rng.choice(members)
        # NOTE(review): only the basename is kept, so picks from different folders
        # that share a file name would overwrite each other - verify against the archive layout.
        file_path = os.path.join(yi_dir, os.path.basename(chosen_file))

        try:
            with zip_file.open(chosen_file) as source, open(file_path, "wb") as target:
                target.write(source.read())
            selected_count += 1
            print(f"Saved: {file_path}")
        except Exception as e:
            print(f"ERROR saving file {chosen_file}: {e}")

print(f"Extraction complete! {selected_count} images saved in: {yi_dir}")

# Verification: list whatever is now present in the destination folder.
saved_files = os.listdir(yi_dir)
print(f"Files in destination directory: {len(saved_files)}")
if saved_files:
    print(f"First few files: {saved_files[:5]}")
else:
    print("No files found in destination directory!")

# Create and remove a marker file, as the other acquisition cells do (the
# previous `cp /dev/null` left the marker behind in Drive).
# NOTE(review): confirm that touching a file actually triggers a Drive sync.
print("Syncing files to Google Drive...")
sync_marker = '/content/drive/MyDrive/__force_sync__'
try:
    with open(sync_marker, 'w') as f:
        f.write('')
    os.remove(sync_marker)
except OSError as e:
    print(f"Warning: could not write sync marker: {e}")
print("Sync complete. Files should now be visible in your Google Drive.")

### TYC Data Merging

Sets up TYC directories, loads image paths, and counts script dataset images.  

In [None]:
# Source folders holding the raw per-script images (Old Naxi is produced by old_naxi.ipynb).
YI_PATH = '/content/drive/MyDrive/script_analysis/yi/'
BA_SHU_PATH = '/content/drive/MyDrive/script_analysis/ba-shu/'
OLD_NAXI_PATH = '/content/drive/MyDrive/script_analysis/old_naxi/'
# Root of the merged TYC dataset; one subfolder per script is created beneath it.
TYC_PATH = '/content/drive/MyDrive/script_analysis/tyc/'

def setup_directories():
    """Create the TYC root folder and its ``yi``, ``ba-shu`` and ``old_naxi`` subfolders.

    Existing folders are left untouched.
    """
    os.makedirs(TYC_PATH, exist_ok=True)
    for script_folder in ('yi', 'ba-shu', 'old_naxi'):
        os.makedirs(os.path.join(TYC_PATH, script_folder), exist_ok=True)

    print(f"Created TYC directory structure at {TYC_PATH}")


def load_image_paths(directory):
    """Recursively collect image file paths under ``directory`` in a stable order.

    A file counts as an image when its name ends, case-insensitively, with one
    of: .jpg, .jpeg, .png, .bmp, .tif, .tiff.

    Args:
        directory: Root folder to search.

    Returns:
        List of matching file paths, sorted by name within each folder. Empty
        (after printing a warning) when ``directory`` does not exist.
    """
    valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

    if not os.path.exists(directory):
        print(f"Warning: Directory {directory} does not exist!")
        return []

    image_paths = []
    for root, dirs, files in os.walk(directory):
        # os.walk yields names in arbitrary order. Callers number their output
        # files by list position, so the order must be the same on every run.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(valid_extensions):
                image_paths.append(os.path.join(root, file))

    return image_paths

setup_directories()

# Inventory of the raw (pre-merge) image folders.
yi_files = load_image_paths(YI_PATH)
ba_shu_files = load_image_paths(BA_SHU_PATH)
old_naxi_files = load_image_paths(OLD_NAXI_PATH)

# The Yi count is reported as the per-script target size.
print(
    "Original dataset counts:",
    f"Yi: {len(yi_files)} images",
    f"Ba-shu: {len(ba_shu_files)} images",
    f"Old Naxi: {len(old_naxi_files)} images",
    f"Target count per script: ~{len(yi_files)} images",
    sep="\n",
)

Processing Classical Yi

In [None]:
def process_yi_dataset():
    """Copy every Yi image into the ``yi`` subfolder of the TYC dataset, unmodified.

    Each copy is named ``yi_{index:05d}_{original name}``, so files stay unique
    even when original base names repeat.

    Returns:
        Number of Yi image paths that were copied.
    """
    source_paths = load_image_paths(YI_PATH)
    target_dir = os.path.join(TYC_PATH, 'yi')

    print(f"Copying {len(source_paths)} Yi images to {target_dir}...")

    for index, source in enumerate(tqdm(source_paths)):
        unique_name = f"yi_{index:05d}_{os.path.basename(source)}"
        shutil.copy2(source, os.path.join(target_dir, unique_name))

    copied = len(source_paths)
    print(f"Yi processing complete. {copied} images copied.")
    return copied

Processing Old Naxi

In [None]:
def load_image_paths(directory):
    """Recursively collect image file paths under ``directory`` in a stable order.

    A file counts as an image when its name ends, case-insensitively, with one
    of: .jpg, .jpeg, .png, .bmp, .tif, .tiff.

    Args:
        directory: Root folder to search.

    Returns:
        List of matching file paths, sorted by name within each folder. Empty
        (after printing a warning) when ``directory`` does not exist.
    """
    valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

    if not os.path.exists(directory):
        print(f"Warning: Directory {directory} does not exist!")
        return []

    image_paths = []
    for root, dirs, files in os.walk(directory):
        # os.walk yields names in arbitrary order. Callers number their output
        # files by list position, so the order must be the same on every run.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(valid_extensions):
                image_paths.append(os.path.join(root, file))

    return image_paths

def natural_variation(image):
    """Return a slightly perturbed copy of ``image``; the input is not modified.

    The perturbations, applied in order, are:
      1. rotation by a random angle in [-2, 2] degrees (white fill);
      2. an affine step with a random shift of up to 1% of the width/height and
         a random scale in [0.98, 1.02] (white fill);
      3. with probability 0.3, a brightness change by a factor in [0.97, 1.03].

    Args:
        image: Source image; callers in this notebook pass RGB images.

    Returns:
        The transformed image.
    """
    white = (255, 255, 255)

    # rotation
    rotated = image.copy().rotate(
        random.uniform(-2, 2), resample=Image.BICUBIC, expand=False, fillcolor=white
    )

    # shift and scale, applied together in a single affine call
    width, height = rotated.size
    dx = random.uniform(-width * 0.01, width * 0.01)
    dy = random.uniform(-height * 0.01, height * 0.01)
    zoom = random.uniform(0.98, 1.02)
    result = TF.affine(rotated, angle=0, translate=(dx, dy), scale=zoom, shear=0, fill=white)

    # brightness
    if random.random() < 0.3:
        result = ImageEnhance.Brightness(result).enhance(random.uniform(0.97, 1.03))

    return result

def process_old_naxi_dataset(target_count=None):
    """Copy the Old Naxi images into the TYC dataset and top them up with variations.

    Each source image is re-saved as RGB under
    ``old_naxi_orig_{index:05d}_{original name}``. If fewer images were saved
    than ``target_count``, the shortfall is filled with generated variations.

    Args:
        target_count: Desired number of images; defaults to the number of Yi
            source images when ``None``.

    Returns:
        Number of image files found in the Old Naxi target folder afterwards.
    """
    source_paths = load_image_paths(OLD_NAXI_PATH)
    target_dir = os.path.join(TYC_PATH, 'old_naxi')
    os.makedirs(target_dir, exist_ok=True)

    # Default target: match the size of the Yi source set.
    if target_count is None:
        target_count = len(load_image_paths(YI_PATH))

    print(f"Processing {len(source_paths)} Old Naxi images...")

    saved_originals = []
    for index, source in enumerate(tqdm(source_paths)):
        try:
            destination = os.path.join(
                target_dir, f"old_naxi_orig_{index:05d}_{os.path.basename(source)}"
            )
            Image.open(source).convert('RGB').save(destination)
            saved_originals.append(destination)
        except Exception as e:
            # An unreadable image is reported and skipped; it does not count
            # toward the target.
            print(f"Error processing {source}: {e}")

    shortfall = max(0, target_count - len(saved_originals))
    if shortfall > 0:
        print(f"Need to generate {shortfall} additional Old Naxi variations to reach target count")
        create_variations(saved_originals, target_dir, shortfall, prefix="old_naxi_var")

    final_total = len(load_image_paths(target_dir))
    print(f"Old Naxi processing complete. {final_total} images processed/generated.")
    return final_total

def create_variations(image_paths, target_dir, num_to_generate, prefix="var"):
    """Generate perturbed copies of randomly chosen source images.

    Args:
        image_paths: Paths of the images to draw from; unreadable ones are
            reported and skipped.
        target_dir: Folder the variations are written to.
        num_to_generate: How many variations to create.
        prefix: File name prefix; outputs are named ``{prefix}_{index:05d}.png``.

    Returns:
        ``num_to_generate``, or 0 when none of the source images could be loaded.
    """
    pool = []
    for path in image_paths:
        try:
            pool.append(Image.open(path).convert('RGB'))
        except Exception as e:
            print(f"Error loading {path}: {e}")

    if len(pool) == 0:
        print("No images to create variations from!")
        return 0

    print(f"Generating {num_to_generate} natural variations...")
    for index in tqdm(range(num_to_generate)):
        variant = natural_variation(random.choice(pool).copy())
        variant.save(os.path.join(target_dir, f"{prefix}_{index:05d}.png"))

    return num_to_generate

Processing Ba-Shu

In [None]:
def load_image_paths(directory):
    """Recursively collect image file paths under ``directory`` in a stable order.

    A file counts as an image when its name ends, case-insensitively, with one
    of: .jpg, .jpeg, .png, .bmp, .tif, .tiff.

    Args:
        directory: Root folder to search.

    Returns:
        List of matching file paths, sorted by name within each folder. Empty
        (after printing a warning) when ``directory`` does not exist.
    """
    valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

    if not os.path.exists(directory):
        print(f"Warning: Directory {directory} does not exist!")
        return []

    image_paths = []
    for root, dirs, files in os.walk(directory):
        # os.walk yields names in arbitrary order. Callers number their output
        # files by list position, so the order must be the same on every run.
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith(valid_extensions):
                image_paths.append(os.path.join(root, file))

    return image_paths

def natural_variation(image):
    """Produce a subtly altered copy of ``image``, leaving the input untouched.

    Steps, in order: a rotation of at most 2 degrees either way, an affine
    shift of at most 1% of each dimension combined with a 0.98-1.02 scale, and
    (30% of the time) a brightness factor between 0.97 and 1.03. Exposed areas
    are filled with white.

    Args:
        image: Source image; callers in this notebook pass RGB images.

    Returns:
        The transformed image.
    """
    fill_white = (255, 255, 255)

    # rotation
    working = image.copy().rotate(
        random.uniform(-2, 2), resample=Image.BICUBIC, expand=False, fillcolor=fill_white
    )

    # shift + scale in one affine call
    width, height = working.size
    offset_x = random.uniform(-width * 0.01, width * 0.01)
    offset_y = random.uniform(-height * 0.01, height * 0.01)
    scale_factor = random.uniform(0.98, 1.02)
    working = TF.affine(
        working,
        angle=0,
        translate=(offset_x, offset_y),
        scale=scale_factor,
        shear=0,
        fill=fill_white,
    )

    # brightness
    if random.random() < 0.3:
        working = ImageEnhance.Brightness(working).enhance(random.uniform(0.97, 1.03))

    return working

def process_ba_shu_dataset(target_count=None):
    """Copy the Ba-shu images into the TYC dataset and top them up with variations.

    Originals are copied as ``ba_shu_orig_{index:05d}_{original name}``. When
    there are fewer originals than ``target_count``, the shortfall is generated
    as ``ba_shu_var_{index:05d}.png`` in rounds of at most 100 images.

    Args:
        target_count: Desired number of images; defaults to the number of Yi
            source images when ``None``.

    Returns:
        Number of originals plus the number of variations generated.
    """
    source_paths = load_image_paths(BA_SHU_PATH)
    target_dir = os.path.join(TYC_PATH, 'ba-shu')
    os.makedirs(target_dir, exist_ok=True)

    # Default target: match the size of the Yi source set.
    if target_count is None:
        target_count = len(load_image_paths(YI_PATH))

    original_total = len(source_paths)
    print(f"Copying {original_total} original Ba-shu images...")
    for index, source in enumerate(tqdm(source_paths)):
        copy_name = f"ba_shu_orig_{index:05d}_{os.path.basename(source)}"
        shutil.copy2(source, os.path.join(target_dir, copy_name))

    shortfall = max(0, target_count - original_total)
    if shortfall == 0:
        print("No need for additional variations, Ba-shu dataset has enough samples.")
        return original_total

    print(f"Generating {shortfall} additional Ba-shu variations...")

    # Load all originals up front; each variation starts from a random one.
    pool = []
    for source in source_paths:
        try:
            pool.append(Image.open(source).convert('RGB'))
        except Exception as e:
            print(f"Error loading {source}: {e}")

    if not pool:
        print("No images could be loaded for variations")
        return original_total

    print(f"Successfully loaded {len(pool)} images for creating variations")

    # generate variations in rounds of at most `batch_size` images
    batch_size = min(100, shortfall)
    generated = 0
    for round_number, first_index in enumerate(range(0, shortfall, batch_size), start=1):
        print(f"Starting variation round {round_number}...")

        last_index = min(first_index + batch_size, shortfall)
        for variation_index in tqdm(range(first_index, last_index)):
            variant = natural_variation(random.choice(pool).copy())
            variant.save(os.path.join(target_dir, f"ba_shu_var_{variation_index:05d}.png"))
            generated += 1

    total_images = original_total + generated
    print(f"Ba-shu processing complete. {total_images} total images created.")
    return total_images

def show_ba_shu_samples():
    """Plot up to 4 original and up to 8 generated Ba-shu images from the TYC folder.

    Files are classified by their file name: names containing ``orig`` are
    originals and names containing ``var`` are variations (falling back to
    ``aug`` when no ``var`` files exist). Prints a message and returns early
    when the folder or its images are missing.
    """
    ba_shu_dir = os.path.join(TYC_PATH, 'ba-shu')
    if not os.path.exists(ba_shu_dir):
        print("Ba-shu directory not found")
        return

    files = load_image_paths(ba_shu_dir)
    if not files:
        print("No Ba-shu files found")
        return

    # Match against the file name only: testing the full path would misclassify
    # every file whenever a parent directory name contains "orig", "var" or "aug".
    orig_files = [f for f in files if 'orig' in os.path.basename(f)]
    var_files = [f for f in files if 'var' in os.path.basename(f)]

    if not var_files:
        var_files = [f for f in files if 'aug' in os.path.basename(f)]

    plt.figure(figsize=(15, 10))

    # The 3x4 grid has 12 slots: at most 4 originals followed by at most 8 variations.
    num_samples = min(4, len(orig_files))
    num_var_samples = min(8, len(var_files))

    if orig_files:
        for i, path in enumerate(random.sample(orig_files, num_samples)):
            img = Image.open(path).convert('RGB')
            plt.subplot(3, 4, i+1)
            plt.imshow(img)
            plt.title(f"Original {i+1}")
            plt.axis('off')

    if var_files:
        for i, path in enumerate(random.sample(var_files, num_var_samples)):
            img = Image.open(path).convert('RGB')
            # Variations start in the slot right after the last original.
            plt.subplot(3, 4, i+num_samples+1)
            plt.imshow(img)
            plt.title(f"Variation {i+1}")
            plt.axis('off')

    plt.tight_layout()
    plt.show()

Processes and visualizes the combined TYC dataset  
- Copies the Yi images unchanged, then tops up the Ba-Shu and Old Naxi sets with subtle augmented variations until each matches the Yi image count.  
- Generates a summary of dataset sizes and saves processed images.  
- Displays sample images from each script category.  


In [None]:
def process_all_datasets():
    """Run the Yi, Old Naxi and Ba-shu processors, print a summary, and show samples.

    The summary counts are re-read from the TYC folders on disk rather than
    taken from the processors' return values.
    """
    print("Processing Yi dataset...")
    yi_count = process_yi_dataset()

    print("\nProcessing Old Naxi dataset...")
    process_old_naxi_dataset()

    print("\nProcessing Ba-shu dataset...")
    process_ba_shu_dataset(yi_count)

    # final counts, keyed by TYC subfolder
    totals = {
        subfolder: len(load_image_paths(os.path.join(TYC_PATH, subfolder)))
        for subfolder in ('yi', 'ba-shu', 'old_naxi')
    }

    print("\n=== TYC Dataset Summary ===")
    print(f"Yi: {totals['yi']} images")
    print(f"Ba-shu: {totals['ba-shu']} images")
    print(f"Old Naxi: {totals['old_naxi']} images")
    print(f"Total: {sum(totals.values())} images")
    print(f"Dataset saved to: {TYC_PATH}")

    show_samples()

def show_samples():
    """Plot up to three random images per script from the TYC folders on a 3x3 grid.

    Row 1 shows Yi, row 2 Ba-shu and row 3 Old Naxi; a script with no images
    leaves its row empty.
    """
    plt.figure(figsize=(15, 8))

    # (TYC subfolder, title label, first subplot slot of that script's row)
    grid_rows = (
        ('yi', 'Yi', 1),
        ('ba-shu', 'Ba-shu', 4),
        ('old_naxi', 'Old Naxi', 7),
    )
    for subfolder, label, first_slot in grid_rows:
        paths = load_image_paths(os.path.join(TYC_PATH, subfolder))
        if not paths:
            continue

        picks = random.sample(paths, min(3, len(paths)))
        for offset, path in enumerate(picks):
            img = Image.open(path).convert('RGB')
            plt.subplot(3, 3, first_slot + offset)
            plt.imshow(img)
            plt.title(f"{label} {offset + 1}")
            plt.axis('off')

    plt.tight_layout()
    plt.show()

# Build the merged TYC dataset: runs the Yi, Old Naxi and Ba-shu processors, then prints a summary and shows samples.
process_all_datasets()