In [1]:
import sys
import os
from pathlib import Path
from hydra import initialize, compose
initialize(config_path="../confs", job_name="notebook")

sys.path.append(os.path.abspath("../src"))
# We change the working directory to the root of the project
# Run this only once
root_path = Path.cwd().parent
os.chdir(root_path)
sys.path.append(root_path / "src")

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  initialize(config_path="../confs", job_name="notebook")


In [2]:
from src.utils import torch_to_pil
from src.character_linking.feature_matching import featureMatching
from src.character_linking.params import HOGParameters, featureMatchingParameters, fullHOGOutput, featureMatchingOutputs
from src.utils import connectedComponent
from src.patch_processing.patch_extraction import extract_patches

from notebook_utils.descriptor import compute_hog, visualize_hog

from einops import rearrange
import torch

import cv2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tqdm

from PIL import Image

In [3]:
from notebook_utils.parquet_utils import load_dataframe, save_dataframe

patches_df = load_dataframe('data/processed/book1_columnwise')

Loading column: bin_patch
Loading column: img_patch
Loading column: page
Loading column: file
Loading column: left
Loading column: top
Loading column: width
Loading column: height
Loading column: svg
Loading column: aspect_ratio
Loading column: predicted_char
Loading column: histogram
✓ Loaded from data/processed/book1_columnwise


In [6]:
import os
import json
import threading
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

from openai import OpenAI
from tqdm import tqdm

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------

CACHE_PATH = "deepseek_char_cache.json"
MAX_WORKERS = 6  # reduce if you hit rate limits

# ---------------------------------------------------------------------
# Load / initialize cache
# ---------------------------------------------------------------------

_CACHE_LOCK = threading.Lock()

if os.path.exists(CACHE_PATH):
    with open(CACHE_PATH, "r", encoding="utf-8") as f:
        _CHAR_CACHE = json.load(f)
else:
    _CHAR_CACHE = {}

def _save_cache():
    with open(CACHE_PATH, "w", encoding="utf-8") as f:
        json.dump(_CHAR_CACHE, f, ensure_ascii=False, indent=2)

# ---------------------------------------------------------------------
# DeepSeek client
# ---------------------------------------------------------------------

client = OpenAI(
    api_key="sk-bc70ca511bb84365b9907211a59280cf",
    base_url="https://api.deepseek.com"
)

# ---------------------------------------------------------------------
# Cached API call
# ---------------------------------------------------------------------

def get_traditional_chinese_character_meaning(char: str) -> str:
    if not isinstance(char, str) or len(char) != 1:
        raise ValueError("Input must be a single Chinese character.")

    # ---- Cache hit ----
    with _CACHE_LOCK:
        if char in _CHAR_CACHE:
            return _CHAR_CACHE[char]

    # ---- Cache miss → API call ----
    prompt = (
        "Explain the meaning of the following Traditional Chinese character.\n\n"
        "Requirements:\n"
        "- Assume the character is Traditional Chinese\n"
        "- Give the primary meaning\n"
        "- Mention common usages or contexts\n"
        "- Briefly note the etymology or radicals if relevant\n"
        "- Keep the explanation concise and precise\n\n"
        f"Character: {char}"
    )

    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": "You are a knowledgeable Chinese linguistics assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.2,
        max_tokens=300,
    )

    answer = response.choices[0].message.content.strip()

    with _CACHE_LOCK:
        _CHAR_CACHE[char] = answer
        _save_cache()

    return answer

# ---------------------------------------------------------------------
# Cache warm-up over unique characters
# ---------------------------------------------------------------------

unique_chars = np.unique(patches_df["predicted_char"])

def _safe_worker(char):
    try:
        get_traditional_chinese_character_meaning(char)
    except Exception:
        pass

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [
        executor.submit(_safe_worker, char)
        for char in unique_chars
        if char not in _CHAR_CACHE  # skip cached
    ]

    for _ in tqdm(as_completed(futures), total=len(futures)):
        pass


100%|██████████| 40/40 [00:00<00:00, 356962.04it/s]
