<a href="https://colab.research.google.com/github/mary-lev/manzoni_in_chinese/blob/main/claude_chinese_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install anthropic

Collecting anthropic
  Downloading anthropic-0.49.0-py3-none-any.whl.metadata (24 kB)
Downloading anthropic-0.49.0-py3-none-any.whl (243 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.4/243.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.49.0


In [4]:
import os
import base64
import argparse
from tqdm import tqdm
import anthropic
from PIL import Image
import io

def encode_image_to_base64(image_path):
    """Read the file at ``image_path`` in binary mode and return its contents as a base64 string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    # b64encode yields bytes; decode so the result can be embedded in a JSON payload.
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')

In [5]:
def ocr_with_claude(client, image_path, model="claude-3-7-sonnet-20250219", max_tokens=4000):
    """
    Send an image to Claude API for OCR processing using the Anthropic Python client

    Args:
        client: Anthropic API client
        image_path: Path to the image file. The media type sent to the API is
            chosen from the file extension (.jpg/.jpeg, .png, .gif, .webp);
            any other extension is declared as image/jpeg.
        model: Claude model to use
        max_tokens: Upper bound on the number of tokens in the response

    Returns:
        Text of the first text block in the response, or None if the
        response contains no text block

    Warns:
        UserWarning: if the response reports that it stopped because the
            max_tokens limit was reached, since the OCR text is then
            probably cut off.
    """
    import warnings

    # Declare the media type that matches the file extension instead of always
    # claiming JPEG; unknown extensions keep the previous image/jpeg default.
    media_types = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
    }
    extension = os.path.splitext(str(image_path))[1].lower()
    media_type = media_types.get(extension, "image/jpeg")

    base64_image = encode_image_to_base64(image_path)

    message = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Please OCR the Chinese text in this image. Return only the extracted text, with proper spacing and paragraph breaks. Do not include any explanations or comments."
                    },
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": base64_image
                        }
                    }
                ]
            }
        ]
    )

    # A response that ran into the token limit is still returned, but flag it
    # so a truncated page is not saved unnoticed.
    if getattr(message, "stop_reason", None) == "max_tokens":
        warnings.warn(
            f"OCR output for {image_path} reached the max_tokens limit "
            f"({max_tokens}) and may be truncated",
            stacklevel=2,
        )

    # The response content is a list of blocks; return the first text block
    for content_block in message.content or []:
        if content_block.type == "text":
            return content_block.text

    return None


In [8]:
from google.colab import userdata

# The API key is looked up through Colab's userdata helper rather than
# being written into the notebook.
client = anthropic.Anthropic(
    api_key=userdata.get('CLAUDE_API_KEY'),
)

# OCR a single page.
image_path = "page33.jpg"
extracted_text = ocr_with_claude(client=client, image_path=image_path)

In [9]:
# Display the raw OCR string returned for the page processed above.
extracted_text

'第二章\n\n据说,孔德王子在罗科洛伊①一战的前夜酣睡了一宿:其一，因为他感到疲惫不堪;其二,他事先早就作好了一切必要的部署，并拟定了翌晨的作战方案。然而,堂·安保迪却仅仅知道,次日将有一场激烈的战斗,故心中极度不安,几乎整夜都在考虑对策。显然,汉视警告和威胁,照样主持婚礼的做法,他甚至连想都没敢想。对兰佐讲明实情,然后同他一起商量对付的办法……天主不容！那两个打手中的一个曾对他说过："不许走漏半点风声……否则……嗯！"想至此,那个"嗯"字的菱鸣声再一次在堂·安保迪的耳畔作响,吓得他根本不敢考虑违抗命令,他甚至后悔同佩尔贝多亚谈起过此事。一走了事？可是去哪里呢？然后怎么办？困难重重,谈何容易！许许多多主意被可怜的教士一一否定,他躺在床上辗转反侧,难以入睡。他觉得,不管从哪一个方面分析,争取时间故意拖延兰佐的婚事万是上策。他突然想起,离开定下举办婚礼的日子已没剩下几天了,如果能将兰佐这小子稳住,他就有两个月得以喘息的时间,在两个月里,事情可能会出现重要的转机。他再三琢磨,尽力想找一些借口搪塞兰佐。尽管那些借口似乎都显得有一点苍白,但他转眼一想心中又觉坦然,因为他自身的名望无形中给他的借口增加了不少分量,他丰富的阅历也为他制服一个无\n\n① 今法国东北部一港口,一六四三年曾是法国彼劳芳王朝孔德家族同西班牙人作战的战场。\n\n18'

In [19]:
# Accumulates the OCR text of every processed page, in processing order.
# Re-running this cell discards whatever has been collected so far.
all_text = []

In [20]:
if extracted_text:
  # Save the raw OCR result for this page.
  # splitext removes only the final extension, so an image name that contains
  # extra dots keeps its full stem instead of being cut at the first dot.
  page_stem = os.path.splitext(os.path.basename(image_path))[0]
  page_filename = f"page_{page_stem}.txt"
  with open(page_filename, 'w', encoding='utf-8') as f:
    f.write(extracted_text)
  # Keep the text in memory as well so the pages can be combined later.
  all_text.append(extracted_text)
  print(f"Saved raw OCR to {page_filename}")

Saved raw OCR to page_page33.txt


In [36]:
# Display every page text collected so far.
all_text

['第二章\n\n据说,孔德王子在罗科洛伊①一战的前夜酣睡了一宿:其一，因为他感到疲惫不堪;其二,他事先早就作好了一切必要的部署，并拟定了翌晨的作战方案。然而,堂·安保迪却仅仅知道,次日将有一场激烈的战斗,故心中极度不安,几乎整夜都在考虑对策。显然,汉视警告和威胁,照样主持婚礼的做法,他甚至连想都没敢想。对兰佐讲明实情,然后同他一起商量对付的办法……天主不容！那两个打手中的一个曾对他说过："不许走漏半点风声……否则……嗯！"想至此,那个"嗯"字的菱鸣声再一次在堂·安保迪的耳畔作响,吓得他根本不敢考虑违抗命令,他甚至后悔同佩尔贝多亚谈起过此事。一走了事？可是去哪里呢？然后怎么办？困难重重,谈何容易！许许多多主意被可怜的教士一一否定,他躺在床上辗转反侧,难以入睡。他觉得,不管从哪一个方面分析,争取时间故意拖延兰佐的婚事万是上策。他突然想起,离开定下举办婚礼的日子已没剩下几天了,如果能将兰佐这小子稳住,他就有两个月得以喘息的时间,在两个月里,事情可能会出现重要的转机。他再三琢磨,尽力想找一些借口搪塞兰佐。尽管那些借口似乎都显得有一点苍白,但他转眼一想心中又觉坦然,因为他自身的名望无形中给他的借口增加了不少分量,他丰富的阅历也为他制服一个无\n\n① 今法国东北部一港口,一六四三年曾是法国彼劳芳王朝孔德家族同西班牙人作战的战场。\n\n18',
 '知的青年提供了有利的条件。他自言自语道："让我们来分析一下，他心中惦记着恋人，可是我考虑的是保住自己的性命。显然，那件事情同我的切身利益有更大的利害关系，再说我要比他精明得多。亲爱的孩子，如果说你迫不及待地想结婚，我不愿多加评论，但是我可不想为你火中取栗。"打定主意后，他的心情才稍稍平静下来，并终于合上了眼睛。在断断续续的睡眠中，做了一连串支离破碎的恶梦，看到了打手，棠·罗德里戈，兰佐，乡间小道，悬崖峭壁，逃跑，追踪，也听到了尖叫声和劈劈啪啪的枪声。\n\n对刚遭遇过灾难或不幸的人来说，梦醒时分是最令他伤感的时刻。当头脑一旦清醒过来，他的注意力总是回到往日宁静生活的回忆上。寰然间，迥异的现实如同不速之客一样鲁莽地闯入他的脑海，在瞬间形成的强烈反差让他深感痛心。在品尝了剧那间的苦涩之后，堂·安保迪立即在脑子里重新又过了一遍前天夜里定下的种种设想，并在进一步确定行动计划之前，就一些具体的做法作了补充。这时，他才走下床，忧心忡忡和煨躁

In [None]:
# Merge all collected pages into one string, separating pages with a blank line.
combined_text = "\n\n".join(all_text)

In [55]:
# The per-page text files are written into "text"; create it up front so
# open() does not fail on a fresh runtime where the directory is missing.
os.makedirs("text", exist_ok=True)

# range() excludes its upper bound, so this processes page175.jpg .. page215.jpg.
for x in range(175, 216):
  image_path = f"page{x}.jpg"
  extracted_text = ocr_with_claude(client, image_path)
  if extracted_text:
    # Save the raw OCR result for this page as text/<image stem>.txt
    page_stem = os.path.splitext(os.path.basename(image_path))[0]
    page_filename = f"{page_stem}.txt"
    page_path = os.path.join("text", page_filename)
    with open(page_path, 'w', encoding='utf-8') as f:
      f.write(extracted_text)
    all_text.append(extracted_text)
    print(f"Saved raw OCR to {page_filename}")
  else:
    # Report pages that produced no text instead of skipping them silently.
    print(f"No text extracted from {image_path}")


Saved raw OCR to page175.txt
Saved raw OCR to page176.txt
Saved raw OCR to page177.txt
Saved raw OCR to page178.txt
Saved raw OCR to page179.txt
Saved raw OCR to page180.txt
Saved raw OCR to page181.txt
Saved raw OCR to page182.txt
Saved raw OCR to page183.txt
Saved raw OCR to page184.txt
Saved raw OCR to page185.txt
Saved raw OCR to page186.txt
Saved raw OCR to page187.txt
Saved raw OCR to page188.txt
Saved raw OCR to page189.txt
Saved raw OCR to page190.txt
Saved raw OCR to page191.txt
Saved raw OCR to page192.txt
Saved raw OCR to page193.txt
Saved raw OCR to page194.txt
Saved raw OCR to page195.txt
Saved raw OCR to page196.txt
Saved raw OCR to page197.txt
Saved raw OCR to page198.txt
Saved raw OCR to page199.txt
Saved raw OCR to page200.txt
Saved raw OCR to page201.txt
Saved raw OCR to page202.txt
Saved raw OCR to page203.txt
Saved raw OCR to page204.txt
Saved raw OCR to page205.txt
Saved raw OCR to page206.txt
Saved raw OCR to page207.txt
Saved raw OCR to page208.txt
Saved raw OCR 

In [56]:
import zipfile

# Pack every file below the "text" directory into pages.zip.
source_dir = os.path.abspath("text")
# Entry names are computed relative to the parent of "text", so each entry
# keeps its leading "text/" component inside the archive.
archive_root = os.path.dirname(source_dir)

with zipfile.ZipFile("pages.zip", mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for current_dir, _subdirs, filenames in os.walk(source_dir):
        for filename in filenames:
            absolute_path = os.path.join(current_dir, filename)
            zipf.write(absolute_path, os.path.relpath(absolute_path, archive_root))