In [1]:
# # (Disabled) Split the source PDF into separate single-page PDF files

# import PyPDF2
# from src.document_generation import setup_logger
# import logging

# logger = logging.getLogger('logger_name')
# if logger.hasHandlers():
#     logger.handlers.clear()  # Clear existing handlers to avoid duplicates
# logger.setLevel(logging.DEBUG)
# handler = logging.StreamHandler()
# formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# handler.setFormatter(formatter)
# logger.addHandler(handler)
# logger.propagate = False

# # Input PDF file path
# input_pdf_path = "../input_data/Der Weltkrieg v7 East Front.pdf"
# output_folder = "../input_data/Der Weltkrieg v7"

# # Ensure the output folder exists
# os.makedirs(output_folder, exist_ok=True)

# # Open the PDF and split pages
# try:
#     with open(input_pdf_path, 'rb') as file:
#         pdf_reader = PyPDF2.PdfReader(file)
#         num_pages = len(pdf_reader.pages)
#         logger.info(f"Extracted {num_pages} pages from PDF")

#         for i, page in enumerate(pdf_reader.pages):
#             # Create a new PDF writer for each page
#             pdf_writer = PyPDF2.PdfWriter()
#             pdf_writer.add_page(page)

#             # Save the current page to a new file
#             output_file_path = os.path.join(output_folder, f"page_{i+1:03d}.pdf")
#             with open(output_file_path, 'wb') as output_file:
#                 pdf_writer.write(output_file)
            
#             logger.debug(f"Saved page {i+1} to {output_file_path}")
#             if i == 10:
#                 break

#     logger.info(f"All pages have been split and saved to {output_folder}")

# except Exception as e:
#     logger.error(f"An error occurred: {e}")


In [1]:
%%time
%load_ext autoreload
%autoreload 2

# Standard Python modules
import asyncio
import functools
import glob
import json
import logging
import os
import re
import sys
import time
from pathlib import Path
from typing import Dict, Tuple, List, Callable

import aiohttp
import numpy as np
import openai
from dotenv import load_dotenv

# Decorator to log wall time
def log_execution_time(func: Callable):
    """Decorate an async callable so that its wall-clock duration is logged.

    The returned coroutine function awaits ``func`` with the same arguments,
    logs the elapsed time at INFO level through the notebook-level ``logger``
    and returns ``func``'s result unchanged. Nothing is logged when ``func``
    raises; the exception propagates to the caller.

    Args:
        func: The coroutine function to time.

    Returns:
        An async wrapper that carries ``func``'s name, docstring and other metadata.
    """
    @functools.wraps(func)  # keep __name__/__doc__ so introspection shows the real function
    async def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is immune to system clock adjustments
        start = time.perf_counter()
        result = await func(*args, **kwargs)
        logger.info(f"Finished {func.__name__} in {time.perf_counter() - start:.2f} seconds.")
        return result
    return wrapper

def get_missing_keys(raw_german_texts, pagenos=None, min_length=10):
    """Return the page numbers whose text is absent or too short to be usable.

    Args:
        raw_german_texts: Mapping of page number -> text. Any of the notebook's
            text dicts can be passed, not only the raw German one.
        pagenos: Page numbers to check. Defaults to the notebook-level
            ``all_pagenos`` when omitted.
        min_length: Texts shorter than this many characters count as missing.

    Returns:
        Sorted list of the page numbers that are missing, ``None`` or too short.
    """
    if pagenos is None:
        pagenos = all_pagenos

    missing_keys = []
    for key in sorted(pagenos):
        text = raw_german_texts.get(key)
        # A None entry is treated like an absent one instead of crashing on len(None)
        if text is None or len(text) < min_length:
            missing_keys.append(key)
    return missing_keys

# Load environment variables from .env file
env_path = Path('../.env')  # Adjust path if needed
load_dotenv(dotenv_path=env_path)

# Make the project root importable so the `src.*` imports below resolve.
# Guarded against duplicates: this cell may be re-run many times in one kernel
# session and an unconditional append would grow sys.path on every run.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Display and plotting
from IPython.display import display, HTML, clear_output

# Project imports
from src.utils import timeit, encode_image, plt, pylab
from src.processing import compute_log_spectrum_1d, extract_image_bbox, save_images
from src.api_requests import construct_payload, process_single_page
from src.document_generation import save_document

# Stretch the notebook container to 90% width via an injected CSS rule
notebook_css = "<style>.container { width:90% !important; }</style>"
display(HTML(notebook_css))

# Report which interpreter and Python version the kernel is running on
print('sys.executable:', sys.executable)
print('sys.version:', sys.version, '\n')

# Setup for PDF processing
foldername = "Der Weltkrieg v8"

# OpenAI API setup
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Without this check the header below silently becomes "Bearer None"
    print("WARNING: OPENAI_API_KEY is not set; OpenAI requests will fail to authenticate.")
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai.api_key}"
}

# --------------------
# Initialize variables
# --------------------
plotter = False
image_path = f"../input_data/{foldername}/*pdf"
page_pattern = re.compile(r'page_(.*?)\.pdf', re.DOTALL)

# Keep only files whose name carries a page number. A stray PDF in the folder
# would otherwise make the page-number extraction fail with AttributeError.
fnames = [fname for fname in sorted(glob.glob(image_path)) if page_pattern.search(fname)]
all_pagenos = [page_pattern.search(fname).group(1) for fname in fnames]
if not fnames:
    print(f"WARNING: no page PDFs found for pattern {image_path!r}")

# Storage for processed texts, keyed by page number.
# NOTE: re-running this cell resets all three dicts to empty.
raw_german_texts: Dict[str, str] = {}
german_texts: Dict[str, str] = {}
english_texts: Dict[str, str] = {}

# ------------------
# Configure logging
# ------------------
# Dedicated logger used by log_execution_time and main
logger = logging.getLogger("time_logger")

# Start from a clean slate: re-running this cell must not stack handlers,
# otherwise every message would be emitted once per run of the cell.
logger.handlers.clear()
logger.setLevel(logging.INFO)
logger.propagate = False  # Don't propagate messages to parent loggers

# Single stream handler with a "<time> - <level> - <message>" layout
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)


sys.executable: /Users/ozkansafak/code/fraktur/.venv/bin/python3
sys.version: 3.10.9 (main, Mar  1 2023, 12:20:14) [Clang 14.0.6 ] 

CPU times: user 754 ms, sys: 2.58 s, total: 3.33 s
Wall time: 624 ms


---
## Fraktur Translator

In [2]:
@log_execution_time
async def main(fnames, model_name="", semaphore_count=5, extract=True):
    """Process every not-yet-processed page in ``fnames`` concurrently.

    Pages whose number is already a key of the notebook-level
    ``raw_german_texts`` are skipped. The remaining pages are passed to
    ``process_single_page`` with at most ``semaphore_count`` of them in flight,
    and ``raw_german_texts``, ``german_texts`` and ``english_texts`` are filled
    in as results arrive. A failing page is logged together with its page
    number and does not stop the other pages.

    Args:
        fnames: Paths of page files named like ``page_<pageno>.pdf``.
        model_name: Model identifier forwarded to ``process_single_page``.
        semaphore_count: Maximum number of pages processed at the same time (>= 1).
        extract: Forwarded to ``process_single_page``; the notebook passes
            ``False`` to re-run pages without the FFT-based extraction.

    Raises:
        ValueError: If ``semaphore_count`` is smaller than 1.
    """
    if semaphore_count < 1:
        # Semaphore(0) would make every task wait forever
        raise ValueError(f"semaphore_count must be >= 1, got {semaphore_count}")
    semaphore = asyncio.Semaphore(semaphore_count)  # Adjust number based on API limits
    page_pattern = re.compile(r'page_(.*?)\.pdf', re.DOTALL)

    async def _process_page(fname: str, pageno: str) -> Tuple[str, object, object]:
        """Process one page and return (pageno, result, error) so failures stay attributable."""
        async with semaphore:
            try:
                result = await process_single_page(fname, model_name, headers, plotter, pageno, extract)
            except Exception as exc:
                return pageno, None, exc
            return pageno, result, None

    # A list of coroutine objects. Include only the unprocessed pages.
    tasks = []
    for fname in fnames:
        match = page_pattern.search(fname)
        if match is None:
            logger.warning(f"main: skipping {fname!r} -- no page number in file name")
            continue
        pageno = match.group(1)
        if pageno not in raw_german_texts:
            tasks.append(_process_page(fname, pageno))
    logger.info(f"main: len(tasks): {len(tasks)} -- Process tasks as they complete")

    # Process tasks as they complete
    for i, completed_task in enumerate(asyncio.as_completed(tasks)):
        pageno, result, error = await completed_task
        if error is None:
            try:
                _content, raw_german_text, german_text, english_text = result
            except (TypeError, ValueError) as exc:
                # The result did not have the expected four-part shape
                error = exc
        if error is not None:
            logger.error(f"{i:3d} of {len(tasks)} -- Error processing page:{pageno}: {error}")
            continue
        raw_german_texts[pageno] = raw_german_text
        german_texts[pageno] = german_text
        english_texts[pageno] = english_text
        logger.info(f" {i:3d} of {len(tasks)} -- Successfully processed page:{pageno}")

# Run the async code
await main(fnames, "gpt-4o-2024-08-06")


2024-12-20 22:59:08,922 - INFO - len(tasks): 487 -- Process tasks as they complete
2024-12-20 22:59:41,465 - INFO -    0 of 487 -- Successfully processed page:248
2024-12-20 22:59:43,202 - INFO -    1 of 487 -- Successfully processed page:247
2024-12-20 22:59:46,701 - INFO -    2 of 487 -- Successfully processed page:102
2024-12-20 22:59:48,211 - INFO -    3 of 487 -- Successfully processed page:103
2024-12-20 22:59:54,618 - INFO -    4 of 487 -- Successfully processed page:392
2024-12-20 23:00:07,250 - INFO -    5 of 487 -- Successfully processed page:104
2024-12-20 23:00:08,656 - INFO -    6 of 487 -- Successfully processed page:393
2024-12-20 23:00:19,530 - INFO -    7 of 487 -- Successfully processed page:249
2024-12-20 23:00:21,117 - INFO -    8 of 487 -- Successfully processed page:105
2024-12-20 23:00:23,338 - INFO -    9 of 487 -- Successfully processed page:394
2024-12-20 23:00:33,243 - INFO -   10 of 487 -- Successfully processed page:250
2024-12-20 23:00:41,234 - INFO -   11

##  Handle missing keys


In [18]:
# ---------------------
# 1. Rerun the missing pages on Claude
# ---------------------
missing_keys = set(get_missing_keys(raw_german_texts))
for key in missing_keys:
    del raw_german_texts[key]
print(f"missing_keys: {missing_keys}")
missed_fnames = []
for fname in fnames:
    pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
    if pageno in missing_keys:
        print(f"processing pageno: {pageno}")
        missed_fnames.append(fname)
print(f"missed_fnames: ")
[print(item) for item in missed_fnames]

await main(missed_fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=True)

# ---------------------
# 2. If there still are missing pages, run them without performing FFT based extraction. 
#    This time compute missing_keys based on 'english_texts'.
# ---------------------
missing_keys = set(get_missing_keys(english_texts))
for key in missing_keys:
    del raw_german_texts[key]
print(f"missing_keys: {missing_keys}")
missed_fnames = []
for fname in fnames:
    pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
    if pageno in missing_keys:
        print(f"processing pageno: {pageno}")
        missed_fnames.append(fname)
print(f"missed_fnames: ")
[print(item) for item in missed_fnames]

await main(missed_fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=False)


2024-12-21 00:01:09,804 - INFO - len(tasks): 7 -- Process tasks as they complete


missing_keys: {'476', '008', '004', '006', '228', '316', '007'}
processing pageno: 004
processing pageno: 006
processing pageno: 007
processing pageno: 008
processing pageno: 228
processing pageno: 316
processing pageno: 476
missed_fnames: 
../input_data/Der Weltkrieg v8/page_004.pdf
../input_data/Der Weltkrieg v8/page_006.pdf
../input_data/Der Weltkrieg v8/page_007.pdf
../input_data/Der Weltkrieg v8/page_008.pdf
../input_data/Der Weltkrieg v8/page_228.pdf
../input_data/Der Weltkrieg v8/page_316.pdf
../input_data/Der Weltkrieg v8/page_476.pdf


2024-12-21 00:01:39,832 - INFO -    0 of 7 -- Successfully processed page:316
2024-12-21 00:01:48,913 - INFO -    1 of 7 -- Successfully processed page:476
2024-12-21 00:02:17,192 - INFO -    2 of 7 -- Successfully processed page:004
2024-12-21 00:02:33,103 - INFO -    3 of 7 -- Successfully processed page:008
2024-12-21 00:02:46,944 - INFO -    4 of 7 -- Successfully processed page:228
2024-12-21 00:03:01,580 - INFO -    5 of 7 -- Successfully processed page:006
2024-12-21 00:03:22,571 - INFO -    6 of 7 -- Successfully processed page:007
2024-12-21 00:03:22,574 - INFO - Finished main in 132.77 seconds.
2024-12-21 00:03:22,579 - INFO - len(tasks): 0 -- Process tasks as they complete
2024-12-21 00:03:22,581 - INFO - Finished main in 0.00 seconds.


missing_keys: set()
missed_fnames: 


In [39]:

missing_keys = ['018', '466', '480']
for key in missing_keys:
    del raw_german_texts[key]
print(f"missing_keys: {missing_keys}")

missed_fnames = []
for fname in fnames:
    pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
    if pageno in missing_keys:
        print(f"processing pageno: {pageno}")
        missed_fnames.append(fname)
print(f"missed_fnames: ")
[print(item) for item in missed_fnames]

await main(missed_fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=False)


2024-12-21 00:20:52,739 - INFO - len(tasks): 3 -- Process tasks as they complete


missing_keys: ['018', '466', '480']
processing pageno: 018
processing pageno: 466
processing pageno: 480
missed_fnames: 
../input_data/Der Weltkrieg v8/page_018.pdf
../input_data/Der Weltkrieg v8/page_466.pdf
../input_data/Der Weltkrieg v8/page_480.pdf


2024-12-21 00:21:06,753 - INFO -    0 of 3 -- Successfully processed page:480
2024-12-21 00:21:23,108 - INFO -    1 of 3 -- Successfully processed page:466
2024-12-21 00:21:49,929 - INFO -    2 of 3 -- Successfully processed page:018
2024-12-21 00:21:49,932 - INFO - Finished main in 57.19 seconds.


In [42]:
# Save json files and .docx files.
# (save_document is already imported in the setup cell at the top of the notebook.)

# save json outputs
output_dir = f'../output_data/{foldername}'
# exist_ok=True replaces the exists()/makedirs() pair and keeps the cell safe to re-run
os.makedirs(output_dir, exist_ok=True)

# One JSON file per text dict, named after the dict
for stem, texts in (
    ('english_texts', english_texts),
    ('german_texts', german_texts),
    ('raw_german_texts', raw_german_texts),
):
    with open(f'{output_dir}/{stem}.json', 'w', encoding='utf-8') as f:
        json.dump(texts, f)

doc1, fname1 = save_document(german_texts, foldername, language='German')
doc2, fname2 = save_document(english_texts, foldername, language='English')


``` 
1.  Upload the input folder of PDFs to blob storage.
2.  Read the file from S3.
3.  FFT in y -> (x_hi, x_lo); write half_cropped_image to S3.
4.  FFT in x -> (y_hi, y_lo); write cropped_image to S3.
5.  Read the cropped image from S3 -> encode_image -> transcribe and translate -> JSON output.

```

### Available models and pricing:
```
"gpt-4o-2024-08-06":
    "price_txt": "$2.50 / 1M input tokens"
    "price_img": "$0.001913 / 1500px^2"
    
"gpt-4o-mini-2024-07-18":
    "price_txt": "$0.150 / 1M input tokens"
    "price_img": "$0.003825 / 1500px^2"
    
```

---
## Experiments

In [55]:
# Model used by the main run above; defined here so this cell does not depend
# on a variable left over in the kernel (model_name is otherwise only a parameter of main).
model_name = "gpt-4o-2024-08-06"

# Placeholder: no image is attached in this experiment, so the data URL below ends in "base64,None"
base64_image = None

# Chat request body: a system message describing the three roles, then one user
# message holding the step-by-step instructions followed by the page image.
payload = {
    "model": model_name,
    "messages": [
      {"role": "system", "content": "You have three roles. First of all you are a professional OCR assistant. "
       "Secondly, you identify the parts of your transcriptions to belong to header, body and footer sections. "
       "Lastly, you are a GERMAN to ENGLISH translator that stays loyal to the style and "
       "character of the original German text."
      },
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": 
"""Instructions:

You are to perform three steps on the provided image of a document.

**Step 1: OCR Transcription**

Task: Transcribe the entire text from the image into German, including all Fraktur characters.

Attention: Pay close attention to accurately capturing all text elements.

Attention 2: Make sure you're reading each line only once. 

Formatting: Wrap the entire transcription in <raw_german></raw_german> tags.

Caution: Pay attention to identify the paragraphs as a whole and not erroneously place a carriage return at the end of each line.

Separator: When you are done with Step 1, print the separator line:

--------------------------------------------------------------------
**Step 2: Header-Body-Footer Analysis**

Review: Look at the image and your transcription from Step 1.

Verification: Ensure you haven't missed any parts; if you did, transcribe and include them now.

Caution: Pay attention to identify the paragraphs as a whole and not erroneously place a carriage return at the end of each line.

Categorization:

Header: If you detect a header (e.g., chapter title or section heading), wrap it inside <header></header> tags. If there's no header, omit the <header></header> tags.
Body: Wrap the main body of the text inside <body></body> tags.
Footer: If you detect any footnotes, wrap them inside <footer></footer> tags. If there are no footnotes, omit the <footer></footer> tags.
Formatting: Wrap this structured transcription inside <german></german> tags.

Separator: When you are done with Step 2, print the separator line again:

--------------------------------------------------------------------
**Step 3: Translation (German to English)**

Task: Translate the structured German text from Step 2 into English.
Structure: Maintain the same <header>, <body>, and <footer> sections in your translation.
Formatting: Wrap the translated text inside <english></english> tags.

Example Output Format:

<raw_german>
... (transcribed German text) ...
</raw_german>
--------------------------------------------------------------------
<german>
<header> ... </header>
<body> ... </body>
<footer> ... </footer>
</german>
--------------------------------------------------------------------
<english>
<header> ... </header>
<body> ... </body>
<footer> ... </footer>
</english>"""

          },
          {
            "type": "image_url",
            "image_url": {
              "url": f"data:image/jpeg;base64,{base64_image}"
            }
          }
        ]
      }
    ],
    "max_tokens": 6000,
    "temperature": 0.1
}


In [54]:
# Inspect the top-level fields of the request payload
payload.keys()


dict_keys(['model', 'messages', 'max_tokens', 'temperature'])

In [78]:
# Show the image part of the user message (second content item of the second message)
payload['messages'][1]['content'][1]

{'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,None'}}

In [102]:
# Same inspection as the previous cell: image part of the user message
payload['messages'][1]['content'][1]

{'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,None'}}