In [4]:
import os
from pdf2image import convert_from_path
from PIL import Image, ImageFilter
import pytesseract

# Path to your scanned textbook PDF
PDF_PATH = "/Users/minhajulhoque/work/github/mcps/genki_mcp/data/Genki Textbook 2nd Edition.pdf"

## Converting and Saving the Scanned PDF Images

In [None]:
# Convert PDF to images (you can batch this if memory is a concern)
# No page range is passed, so every page of PDF_PATH is rendered at 300 DPI
# and the whole result is kept in the `pages` list that later cells read.
print("Converting PDF pages to images...")
pages = convert_from_path(PDF_PATH, dpi=300)

# Report how many page images were produced.
print(f"Total pages: {len(pages)}")

In [None]:
# Save PIL Images under /data/pdf_images/
import os

def save_images_to_directory(images, output_dir="../data/pdf_images/"):
    """Write every image in ``images`` to ``output_dir`` as a PNG file.

    Files are named ``page_<n>.png`` where ``n`` is the 1-based position of
    the image in ``images``. The directory is created when it is missing,
    and one "Saved: ..." line is printed per file.

    Args:
        images: Iterable of objects exposing ``save(path, format=...)``.
        output_dir: Destination directory for the PNG files.
    """
    # Ensure the destination exists before the first write.
    os.makedirs(output_dir, exist_ok=True)

    for page_no, page_image in enumerate(images, start=1):
        target_file = os.path.join(output_dir, f"page_{page_no}.png")
        page_image.save(target_file, format="PNG")
        print(f"Saved: {target_file}")

# Write every converted page to the function's default directory (../data/pdf_images/).
save_images_to_directory(pages)

## Regular Parsing

In [None]:
# One entry per OCR'd page; each entry starts with a "--- Page N ---" marker.
all_text = []

for i, page in enumerate(pages[:37]):  # Limit to the first 37 pages
    print(f"OCR on page {i + 1}/{min(len(pages), 37)}")  # min() keeps the total correct when the PDF has fewer than 37 pages
    
    # Convert page to grayscale image
    gray_page = page.convert("L")

    # OCR to extract text (no lang argument is passed in this baseline run)
    text = pytesseract.image_to_string(gray_page)

    # N in the marker is 1-based, while the list index is 0-based.
    all_text.append(f"\n\n--- Page {i+1} ---\n{text}")

In [None]:
# Spot-check one page: index 34 is the entry labelled "--- Page 35 ---".
# all_text is reassigned by every OCR cell in this notebook, so this shows
# the result of whichever OCR cell ran last.
print(all_text[34])

## Language Specific Parsing

In [None]:
# One entry per OCR'd page; each entry starts with a "--- Page N ---" marker.
all_text = []

for i, page in enumerate(pages[:37]):  # Limit to the first 37 pages
    print(f"OCR on page {i + 1}/{min(len(pages), 37)}")  # min() keeps the total correct when the PDF has fewer than 37 pages

    # Convert to grayscale
    img = page.convert("L")

    # Optional image cleanup: only a sharpen filter is applied (no denoising step)
    img = img.filter(ImageFilter.SHARPEN)

    # OCR with English + Japanese
    text = pytesseract.image_to_string(img, lang='eng+jpn')

    # N in the marker is 1-based, while the list index is 0-based.
    all_text.append(f"\n\n--- Page {i+1} ---\n{text}")

In [None]:
# Spot-check one page: index 34 is the entry labelled "--- Page 35 ---".
# all_text is reassigned by every OCR cell in this notebook, so this shows
# the result of whichever OCR cell ran last.
print(all_text[34])

## Layout Parsing

In [None]:
# One entry per OCR'd page; each entry starts with a "--- Page N ---" marker.
all_text = []

for i, page in enumerate(pages[:37]):  # Limit to the first 37 pages
    print(f"OCR on page {i + 1}/{min(len(pages), 37)}")  # min() keeps the total correct when the PDF has fewer than 37 pages

    # Convert to grayscale
    img = page.convert("L")

    # Optional image cleanup: only a sharpen filter is applied
    img = img.filter(ImageFilter.SHARPEN)

    # English + Japanese OCR with Tesseract page segmentation mode 4, which the
    # Tesseract manual describes as "assume a single column of text of variable sizes".
    text = pytesseract.image_to_string(img, lang='eng+jpn', config='--psm 4')

    # N in the marker is 1-based, while the list index is 0-based.
    all_text.append(f"\n\n--- Page {i+1} ---\n{text}")

In [None]:
# Spot-check one page: index 30 is the entry labelled "--- Page 31 ---".
# all_text is reassigned by every OCR cell in this notebook, so this shows
# the result of whichever OCR cell ran last.
print(all_text[30])

## LLM Parsing

### Run on A Single Page

In [19]:
import base64
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Initialize OpenAI client with API key from .env
# os.getenv returns None when OPENAI_API_KEY is unset, so a missing key is not
# caught on this line. NOTE(review): confirm how the client treats api_key=None.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Function to encode the image
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file contents, decoded to a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

# Path to your image
image_path = "/Users/minhajulhoque/work/github/mcps/genki_mcp/data/pdf_images/page_223.png"

GENKI_PDF_EXTRACTION_PROMPT = """Extract both English and Japanese text accurately from the Genki textbook. The goal is to preserve the original content in a way that allows a student to understand the core lesson without seeing the textbook itself.

Instructions:
1.	Text Extraction:
    • Extract all English text in clear, correct English.
    • Extract all Japanese text in proper Japanese (use correct kana, kanji, and grammar).
    • Do not translate between the two languages. Keep each language exactly as it appears.
2.	Formatting and Layout:
    • Keep the original structure, spacing, and formatting as close to the textbook as possible.
    • Example: If a sentence is on its own line in the book, keep it on its own line in the output.
    • Maintain bullet points, dialogue structure, headings, and numbered items if present.
    • This helps students easily follow along and compare content if needed.
3.	Clarity and Comprehension:
    • The extracted content should be clear and easy to understand for a beginner-level student.
    • A student should be able to read your output and grasp the main ideas and structure of the Genki lesson without confusion.
4.	Handling Diagrams and Images:
    • If the textbook includes diagrams, illustrations, or pictures:
    • Briefly describe them in text only if the description helps the student understand the content better.
    • Do not include descriptions if they add confusion or are unnecessary.
        
Notes:
	• Avoid spelling or grammar mistakes.
	• Do not paraphrase or summarize the content—extract it as-is.
	• Do not add any other commentary beside the extracted content.
	• Use your best judgment to decide whether describing a visual element is helpful.
"""

# Encode the image to base64
base64_image = encode_image(image_path)

# Declare the MIME type that matches the file being sent. The data URL used to
# claim image/jpeg unconditionally even though image_path points at a PNG.
# Unknown suffixes keep the previous image/jpeg label.
image_suffix = os.path.splitext(image_path)[1].lower()
image_mime_type = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
}.get(image_suffix, "image/jpeg")

# Create the chat completion request: one user message carrying the extraction
# prompt followed by the page image as an inline base64 data URL.
completion = client.chat.completions.create(
    model="gpt-4.1-2025-04-14",  # gpt-4.1-mini-2025-04-14,  Use the correct full model name
    messages=[
        {
            "role": "user",
            "content": [
                { "type": "text", "text": GENKI_PDF_EXTRACTION_PROMPT },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{image_mime_type};base64,{base64_image}",
                        # NOTE(review): "low" asks for the low-detail image mode; the
                        # all-pages cell sends no "detail" value. Confirm which is intended.
                        "detail": "low"
                    },
                },
            ],
        }
    ],
    # temperature=0 to keep the extraction as repeatable as the API allows.
    temperature=0,
)

# Print the output
print(completion.choices[0].message.content)

B. Pair Work—Ask if your partner has done . . . yet. If the answer is no, ask your partner out, as in the example. (If yes, ask your partner how it was, as in (2).)

Examples: the name of a newly released movie　「ワンピース」を見ました

A：　「ワンピース」を見ましたか。
B：　いいえ、まだです。
A：　じゃあ、いっしょに見に行きませんか。

A：　「ワンピース」を見ましたか。
B：　はい、見ました。
A：　どうでしたか。
B：　とてもおもしろかったです。

the name of a newly released movie
the name of a new restaurant/café
the name of a newly released song/music

天気がいいから、海（うみ）に行きます

A. Match up the phrases to make sense.

1. 今日は天気がいいから、　　　　　a. うちに帰ります。
2. たくさん勉強したから、　　　　　b. たくさん食べました。
3. おなかがすいていたから、　　　　c. ねます。
4. つかれていたから、　　　　　　　d. 海に行きます。

1 ＿＿＿　2 ＿＿＿　3 ＿＿＿　4 ＿＿＿


### Run on All Pages

In [None]:
import base64
import os
import json
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
# Build the API client from OPENAI_API_KEY. os.getenv returns None when the
# variable is unset, so a missing key is not caught on this line.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Function to encode image to base64
def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read in binary mode.

    Returns:
        The base64 encoding of the file contents, decoded to a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


GENKI_PDF_EXTRACTION_PROMPT = """Extract both English and Japanese text accurately from the Genki textbook. The goal is to preserve the original content in a way that allows a student to understand the core lesson without seeing the textbook itself.

Instructions:
1.	Text Extraction:
    • Extract all English text in clear, correct English.
    • Extract all Japanese text in proper Japanese (use correct kana, kanji, and grammar).
    • Do not translate between the two languages. Keep each language exactly as it appears.
2.	Formatting and Layout:
    • Keep the original structure, spacing, and formatting as close to the textbook as possible.
    • Example: If a sentence is on its own line in the book, keep it on its own line in the output.
    • Maintain bullet points, dialogue structure, headings, and numbered items if present.
    • This helps students easily follow along and compare content if needed.
3.	Clarity and Comprehension:
    • The extracted content should be clear and easy to understand for a beginner-level student.
    • A student should be able to read your output and grasp the main ideas and structure of the Genki lesson without confusion.
4.	Handling Diagrams and Images:
    • If the textbook includes diagrams, illustrations, or pictures:
    • Briefly describe them in text only if the description helps the student understand the content better.
    • Do not include descriptions if they add confusion or are unnecessary.
        
Notes:
	• Avoid spelling or grammar mistakes.
	• Do not paraphrase or summarize the content—extract it as-is.
	• Do not add any other commentary beside the extracted content.
	• Use your best judgment to decide whether describing a visual element is helpful.
"""

# Paths
image_dir = "/Users/minhajulhoque/work/github/mcps/genki_mcp/data/pdf_images/"
output_path = "/Users/minhajulhoque/work/github/mcps/genki_mcp/output/extracted_text.json"

# Supported image suffixes mapped to the MIME type declared in the data URL.
# Deriving it per file keeps the declared type in step with the actual bytes
# (PNG pages are no longer labelled as JPEG).
MIME_BY_SUFFIX = {".png": "image/png", ".jpg": "image/jpeg"}


def _page_sort_key(filename):
    """Sort ``page_<n>.<ext>`` files by page number; any other name sorts last.

    Plain string sorting puts page_10 before page_2, which makes ``start_from``
    and the key order of the output JSON hard to reason about.
    """
    stem = os.path.splitext(filename)[0].replace("page_", "")
    if stem.isdecimal():
        return (0, int(stem), filename)
    return (1, 0, filename)


# Create the output directory up front. Otherwise every save would fail only
# after its API call had already been made.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Load existing extractions if they exist (this is what makes the cell resumable)
if os.path.exists(output_path):
    with open(output_path, "r", encoding="utf-8") as f:
        extracted_pages = json.load(f)
else:
    extracted_pages = {}

# Keep only supported image files, order them by page number, then apply the
# start offset. Filtering first means stray files in the directory (anything
# that is not .png/.jpg) no longer shift the offset.
start_from = 0
image_filenames = sorted(
    (name for name in os.listdir(image_dir) if name.endswith(tuple(MIME_BY_SUFFIX))),
    key=_page_sort_key,
)
filtered_filenames = image_filenames[start_from:]

if filtered_filenames:
    print("Starting from", filtered_filenames[0])
else:
    # Previously an empty listing raised IndexError on filtered_filenames[0].
    print(f"No image files to process in {image_dir}")

# Files whose request or save raised, reported once the loop is done.
failed_filenames = []

# Process all image files in directory
for filename in filtered_filenames:
    # "page_12.png" -> "12"; this string is the key used in the output JSON.
    page_number = os.path.splitext(filename)[0].replace("page_", "")

    # Skip if already processed
    if page_number in extracted_pages:
        print(f"Skipping page {page_number} (already processed).")
        continue

    image_path = os.path.join(image_dir, filename)
    print(f"Processing {filename}...")

    try:
        base64_image = encode_image(image_path)
        # The filter above guarantees that one of the suffixes matches.
        mime_type = next(
            mime for suffix, mime in MIME_BY_SUFFIX.items() if filename.endswith(suffix)
        )

        completion = client.chat.completions.create(
            model="gpt-4.1-2025-04-14",
            messages=[
                {
                    "role": "user",
                    "content": [
                        { "type": "text", "text": GENKI_PDF_EXTRACTION_PROMPT },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{base64_image}",
                            },
                        },
                    ],
                }
            ],
            temperature=0,
        )

        extracted_text = completion.choices[0].message.content
        extracted_pages[page_number] = {
            "filename": filename,
            "text": extracted_text
        }

        # Save after each successful page. Write to a temporary file and swap it
        # in with os.replace so an interruption mid-write cannot leave a
        # truncated JSON file that would break the next resume.
        tmp_output_path = output_path + ".tmp"
        with open(tmp_output_path, "w", encoding="utf-8") as f:
            json.dump(extracted_pages, f, ensure_ascii=False, indent=2)
        os.replace(tmp_output_path, output_path)

    except Exception as e:
        # Best effort: report the failure, remember the file, and continue with
        # the next page. Re-running the cell retries anything not yet saved.
        failed_filenames.append(filename)
        print(f"❌ Error processing {filename}: {e}")

if failed_filenames:
    print(f"\n{len(failed_filenames)} file(s) failed and can be retried by re-running this cell: {failed_filenames}")

print(f"\n✅ Extraction finished. All progress saved to: {output_path}")

### Check which Pages Failed

In [18]:
import json

# Location of the JSON written by the extraction step, and the number of pages
# the scan is expected to contain. Both used to be repeated inline (383 / 384).
EXTRACTED_JSON_PATH = '/Users/minhajulhoque/work/github/mcps/genki_mcp/output/extracted_text.json'
EXPECTED_PAGE_COUNT = 383

with open(EXTRACTED_JSON_PATH, 'r', encoding='utf-8') as file:
    data = json.load(file)

# The number of pages is the number of top-level keys
number_of_pages = len(data)

if number_of_pages == EXPECTED_PAGE_COUNT:
    print(f"The number of pages is {EXPECTED_PAGE_COUNT}.")
else:
    print(f"The number of pages is {number_of_pages}, not {EXPECTED_PAGE_COUNT}.")

# Find missing pages: expected keys are the strings "1" .. str(EXPECTED_PAGE_COUNT)
all_pages = set(str(i) for i in range(1, EXPECTED_PAGE_COUNT + 1))
found_pages = set(data.keys())
missing_pages = sorted(int(p) for p in all_pages - found_pages)

if missing_pages:
    print(f"Missing pages: {missing_pages}")
else:
    print("No pages are missing.")

# Keys outside the expected range would make the count above misleading
# (the total can match while real pages are absent), so report them too.
unexpected_pages = sorted(found_pages - all_pages)
if unexpected_pages:
    print(f"Unexpected page keys: {unexpected_pages}")

The number of pages is 383.
No pages are missing.
