In [1]:
# # split the pdf into separate pages

# import PyPDF2
# from src.document_generation import setup_logger
# import logging

# logger = logging.getLogger('logger_name')
# if logger.hasHandlers():
#     logger.handlers.clear()  # Clear existing handlers to avoid duplicates
# logger.setLevel(logging.DEBUG)
# handler = logging.StreamHandler()
# formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# handler.setFormatter(formatter)
# logger.addHandler(handler)
# logger.propagate = False

# # Input PDF file path
# input_pdf_path = "../input_data/Der Weltkrieg v7 East Front.pdf"
# output_folder = "../input_data/Der Weltkrieg v7"

# # Ensure the output folder exists
# os.makedirs(output_folder, exist_ok=True)

# # Open the PDF and split pages
# try:
#     with open(input_pdf_path, 'rb') as file:
#         pdf_reader = PyPDF2.PdfReader(file)
#         num_pages = len(pdf_reader.pages)
#         logger.info(f"Extracted {num_pages} pages from PDF")

#         for i, page in enumerate(pdf_reader.pages):
#             # Create a new PDF writer for each page
#             pdf_writer = PyPDF2.PdfWriter()
#             pdf_writer.add_page(page)

#             # Save the current page to a new file
#             output_file_path = os.path.join(output_folder, f"page_{i+1:03d}.pdf")
#             with open(output_file_path, 'wb') as output_file:
#                 pdf_writer.write(output_file)
            
#             logger.debug(f"Saved page {i+1} to {output_file_path}")
#             if i == 10:
#                 break

#     logger.info(f"All pages have been split and saved to {output_folder}")

# except Exception as e:
#     logger.error(f"An error occurred: {e}")


In [2]:
%%time
%load_ext autoreload
%autoreload 2

# Standard Python modules
import time 
import numpy as np
import os
import sys
import json
import re
import glob
import asyncio
import aiohttp
import openai
import logging
from typing import Dict, Tuple, List, Callable
from dotenv import load_dotenv
from pathlib import Path

# Decorator to log wall time
def log_execution_time(func: Callable):
    async def wrapper(*args, **kwargs):
        start = time.time()
        result = await func(*args, **kwargs)
        logger.info(f"Finished {func.__name__} in {time.time() - start:.2f} seconds.")
        return result
    return wrapper

def get_missing_keys(raw_german_texts):
    # print missing keys
    missing_keys = [ key for key in sorted(all_pagenos) if key not in raw_german_texts or len(raw_german_texts[key]) < 10 ]
    return missing_keys

# Load environment variables from .env file
env_path = Path('../.env')  # Adjust path if needed
load_dotenv(dotenv_path=env_path)

# # Get the root path of the project
sys.path.append(os.path.abspath(".."))

# Display and plotting
from IPython.display import display, HTML, clear_output

# Project imports
from src.utils import timeit, encode_image, plt, pylab
from src.processing import compute_log_spectrum_1d, extract_image_bbox, save_images
from src.api_requests import construct_payload, process_single_page
from src.document_generation import save_document

# Set notebook display width
display(HTML("<style>.container { width:90% !important; }</style>"))

# Print Python environment info
print('sys.executable:', sys.executable)
print('sys.version:', sys.version, '\n')

# Setup for PDF processing
foldername = "Der Weltkrieg v7"

# OpenAI API setup
openai.api_key = os.getenv("OPENAI_API_KEY")
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai.api_key}"
}

# --------------------
# Initialize variables
# --------------------
plotter = False
image_path = f"../input_data/{foldername}/*pdf"
fnames = sorted(glob.glob(image_path))
all_pagenos = [re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1) for fname in fnames]

# Storage for processed texts
raw_german_texts: Dict[str, str] = {}
german_texts: Dict[str, str] = {}
english_texts: Dict[str, str] = {}

# ------------------
# Configure logging
# ------------------
logger = logging.getLogger("time_logger")
if logger.hasHandlers():
    logger.handlers.clear()  # Clear existing handlers to avoid duplicates

logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

# Don't propagate message to parent loggers
logger.propagate = False 


sys.executable: /Users/ozkansafak/code/fraktur/.venv/bin/python3
sys.version: 3.10.9 (main, Mar  1 2023, 12:20:14) [Clang 14.0.6 ] 

CPU times: user 898 ms, sys: 2.52 s, total: 3.42 s
Wall time: 712 ms


---
## Fraktur Translator (GPT-4o)

In [8]:
@log_execution_time
async def main(fnames, model_name="", semaphore_count=5, extract=True):
    semaphore = asyncio.Semaphore(semaphore_count)  # Adjust number based on API limits
    async def _process_page(fname: str) -> Tuple[str, Dict]:
        global raw_german_texts, german_texts, english_texts  # Explicitly declare globals
        async with semaphore:
            pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
            result = await process_single_page(fname, model_name, headers, plotter, pageno, extract) 
            return pageno, result
    
    # A list of coroutine objects. include only the unprocessed pages.
    keys = set(raw_german_texts.keys())
    tasks = []
    for fname in fnames:
        pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
        if pageno not in keys:
            tasks.append(_process_page(fname))
    logger.info(f"main: len(tasks): {len(tasks)} -- Processing tasks as they complete")
    
    # Process tasks as they complete
    for i, completed_task in enumerate(asyncio.as_completed(tasks)):
        try:
            pageno, (content, raw_german_text, german_text, english_text) = await completed_task
            raw_german_texts[pageno] = raw_german_text
            german_texts[pageno] = german_text
            english_texts[pageno] = english_text
            logger.info(f" {i:3d} of {len(tasks)} -- Successfully processed page:{pageno}")
        except Exception as e:
            logger.error(f"{i:3d} of {len(tasks)} -- Error processing a task: {e}")
    return 

# Run the async code
await main(fnames[5:8], model_name="gpt-4o-2024-08-06")


2024-12-21 19:06:53,617 - INFO - main: len(tasks): 2 -- Processing tasks as they complete
2024-12-21 19:07:23,058 - INFO -    0 of 2 -- Successfully processed page:005
2024-12-21 19:08:25,485 - INFO - Pageno: 006, "raw_german" section was not found
2024-12-21 19:08:25,487 - INFO - Pageno: 006, "german" section was not found
2024-12-21 19:08:25,488 - INFO - Pageno: 006, "english" section was not found
2024-12-21 19:08:25,494 - INFO -    1 of 2 -- Successfully processed page:006
2024-12-21 19:08:25,495 - INFO - Finished main in 91.88 seconds.


##  Handle missing keys (Claude Sonnet)


In [9]:
# ---------------------
# 1. Rerun the missing pages on Claude
# ---------------------
missing_keys = set(get_missing_keys(raw_german_texts))
for key in missing_keys:
    del raw_german_texts[key]
print(f"missing_keys: {missing_keys}")
missed_fnames = []
for fname in fnames:
    pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
    if pageno in missing_keys:
        print(f"processing pageno: {pageno}")
        missed_fnames.append(fname)
print(f"missed_fnames: ")
[print(item) for item in missed_fnames]

await main(missed_fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=True)

# ---------------------
# 2. If there still are missing pages, run them without performing FFT based extraction. 
#    This time compute missing_keys based on 'english_texts'.
# ---------------------
missing_keys = set(get_missing_keys(english_texts))
for key in missing_keys:
    del raw_german_texts[key]
print(f"missing_keys: {missing_keys}")
missed_fnames = []
for fname in fnames:
    pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
    if pageno in missing_keys:
        print(f"processing pageno: {pageno}")
        missed_fnames.append(fname)
print(f"missed_fnames: ")
for item in missed_fnames:
    print(item) 

await main(missed_fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=False)


2024-12-21 19:08:28,415 - INFO - main: len(tasks): 2 -- Processing tasks as they complete


missing_keys: {'006', '405'}
processing pageno: 006
processing pageno: 405
missed_fnames: 
../input_data/Der Weltkrieg v7/page_006.pdf
../input_data/Der Weltkrieg v7/page_405.pdf


2024-12-21 19:08:33,906 - INFO - Pageno: 405, "raw_german" section was not found
2024-12-21 19:08:33,908 - INFO - Pageno: 405, "german" section was not found
2024-12-21 19:08:33,909 - INFO - Pageno: 405, "english" section was not found
2024-12-21 19:08:35,270 - INFO -    0 of 2 -- Successfully processed page:405
2024-12-21 19:08:59,328 - INFO -    1 of 2 -- Successfully processed page:006
2024-12-21 19:08:59,329 - INFO - Finished main in 30.91 seconds.
2024-12-21 19:08:59,332 - INFO - main: len(tasks): 1 -- Processing tasks as they complete


missing_keys: {'405'}
processing pageno: 405
missed_fnames: 
../input_data/Der Weltkrieg v7/page_405.pdf


2024-12-21 19:09:03,858 - INFO - Pageno: 405, "raw_german" section was not found
2024-12-21 19:09:03,860 - INFO - Pageno: 405, "german" section was not found
2024-12-21 19:09:03,861 - INFO - Pageno: 405, "english" section was not found
2024-12-21 19:09:03,866 - INFO -    0 of 1 -- Successfully processed page:405
2024-12-21 19:09:03,867 - INFO - Finished main in 4.53 seconds.


In [10]:
# Save json files and .docx files.

from src.document_generation import save_document

# save json outputs
# if not os.path.exists(f'../output_data/{foldername}'):
#     os.makedirs(f'../output_data/{foldername}')
# with open(f'../output_data/{foldername}/english_texts.json', 'w') as f:
#     json.dump(english_texts, f)
# with open(f'../output_data/{foldername}/german_texts.json', 'w') as f:
#     json.dump(german_texts, f)
# with open(f'../output_data/{foldername}/raw_german_texts.json', 'w') as f:
#     json.dump(raw_german_texts, f)

doc1, fname1 = save_document(german_texts, foldername, language='German')
doc2, fname2 = save_document(english_texts, foldername, language='English')


``` 
1.  Upload Input folder of pdfs to blob storage.
2.  Read file from s3.
3.  FFT in y -> (x_hi, x_lo), write half_cropped_image to s3
4.  FFT in x -> (y_hi, y_lo), write cropped_image to s3
5.  Read cropped image from s3 -> encode_image -> translate and transcribe -> JSON output

```

### Available models and pricing:
```
"gpt-4o-2024-08-06":
    "price_txt": "$2.50 / 1M input tokens"
    "price_img": "$0.001913 / 1500px^2"
    
"gpt-4o-mini-2024-07-18":
    "price_txt": "$0.150 / 1M input tokens"
    "price_img": "$0.003825 / 1500px^2"
    
```

---
## Load the German text and translate broken sentences.

In [159]:
with open(f'../output_data/{foldername}/english_texts.json', 'r') as f:
    english_texts = json.load(f)
with open(f'../output_data/{foldername}/german_texts.json', 'r') as f:
    german_texts = json.load(f)
with open(f'../output_data/{foldername}/raw_german_texts.json', 'r') as f:
    raw_german_texts = json.load(f)


In [221]:
def construct_payload_2(german_page_contents, english_page_contents, model_name, i):
    payload = {
        "model": model_name,
        "messages": [
          {
            "role": "system", 
            "content": "You are a World War II historian and a professional translator from GERMAN to ENGLISH who "
              "stays loyal to both the style and the character of the original German text."
          },
          {
            "role": "user",
            "content": [
              {
                "type": "text",
                "text": f"<german_page_1>{german_page_contents[i]}</german_page_1>\n"
                  f"<german_page_2>{german_page_contents[i+1]}</german_page_2>\n" 
                  f"<english_page_1>{english_page_contents[i]}</english_page_1>\n" 
                  "---------------------------------------------------------------\n"
                  "I provided two succesive german pages from a World War II history book. "
                  "I had an AI assistant translate the german pages for me. "
                  "(I included the English translation of 'german_page_1' inside the 'english_page_1' tags. "
                  "Unfortunately, the translation was made in isolation from one another "
                  "--i.e. the AI assistant I used was only able to see one page at a time. "
                  "This is a problem because "
                  "if there's a sentence that spans over two successive german pages, the sentences are fragmented. "
                  "This means the AI assistant couldnt translate the german sentence coherently as a whole, and "
                  "it translated the fragmented parts separately. \n"
                  "\n"
                  "I want you to re-translate the content of 'german_page_1' from German to English.\n"
                  
                  "**Rule 1:** If the value inside the '<first_sentence_was_fragmented>' tags above is a 'True' value "
                  "then omit the first sentence at the beginning of the <body> section because it's not a whole sentence. "
                  "(cut off from the previous page), exclude it from the translation.\n"
                  
                  "**Rule 2:** If the last sentence of 'german_page_1' extends into 'german_page_2', "
                  "include this full sentence in the translation as if it were fully a part of 'german_page_1'. "
                  "And in this case, pay attention to only include this one sentence that spans over the end of 'german_page_1' "
                  "and beginning of 'german_page_2'. And don't engulf the next whole sentence from the 'german_page_2' into 'german_page_1'.\n"
                  
                  "**Rule 3:** Do not change the translation of the rest of the 'german_page_1'. For this purpose, I included its translation "
                  "in the tags <english_page_1></english_page_1> "
                  "which I want you to use as guidance, so you stay loyal to the style and linguistic characteristics and "
                  "you don't re-interprete or re-translate the intact sentences of 'german_page_1' \n"
                  
                  "**Rule 4:** Copy the '<header>' and '<footer>' sections from 'english_page_1' to 'german_page_1' exactly."
                  
                  "**Formatting:** "
                  "As mentioned above, if the last sentence of 'german_page_1' is fragmented (the end of the sentence extends into second page), "
                  "then start your output with these tags:\n"
                  "<last_sentence_is_fragmented>True</last_sentence_is_fragmented> "
                  "otherwise start your output with these tags:\n"
                  "<last_sentence_is_fragmented>False</last_sentence_is_fragmented>. "
                  "Then continue on with your task of translation and wrap the output in <english></english> tags.\n"
                  "Maintain the 'header' and 'footer' sections as is in the 'english_page_1'. \n"
              },
            ]
          }
        ],
        "max_tokens": 6000,
        "temperature": 0.1
    }
    return payload

In [222]:
german_page_contents = [german_texts[pageno] for pageno in all_pagenos]
english_page_contents = [english_texts[pageno] for pageno in all_pagenos]
payloads = []
for i in range(len(all_pagenos)-1):
    payloads.append(construct_payload_2(german_page_contents, english_page_contents, "gpt-4o-2024-08-06", i))


In [229]:
async def make_gpt_request_for_broken_sentences(model_name: str, headers: dict, payload: dict, pageno: str) -> dict:
    async with aiohttp.ClientSession() as session:
        global english_texts_2
        async with session.post(
            "https://api.openai.com/v1/chat/completions",
            json=payload,
            headers=headers
        ) as response:
            result = await response.json()
            content = result['choices'][0]['message']['content']
            english_texts_2[pageno] = re.search(f'<english>(.*?)</english>', content, re.DOTALL).group(1)
            return result

@log_execution_time
async def main_broken_sentences(model_name, headers, payloads):
    semaphore = asyncio.Semaphore(3)  # Adjust number based on API limits
    async def _process_page(model_name, headers, payload, pageno) -> dict:
        async with semaphore:
            await make_gpt_request_for_broken_sentences(model_name, headers, payload, pageno) 
            return pageno

    # A list of coroutine objects. include only the unprocessed pages.
    tasks = []
    for i in range(200, 215):
        tasks.append(_process_page(model_name, headers, payloads[i], all_pagenos[i])) 

    logger.info(f"main: len(tasks): {len(tasks)} -- Processing tasks as they complete") 

    # Process tasks as they complete
    for i, task in enumerate(tasks):
        try:
            pageno = await task
            logger.info(f" {i:3d} of {len(tasks)} -- Successfully processed pageno:{pageno}")
        except Exception as e:
            logger.error(f"{i:3d} of {len(tasks)} -- Error processing a task: {e}")

# Run the async code
english_texts_2 = {}
await main_broken_sentences("gpt-4o-2024-08-06", headers, payloads)


2024-12-27 03:56:28,596 - INFO - main: len(tasks): 15 -- Processing tasks as they complete
2024-12-27 03:56:39,015 - INFO -    0 of 15 -- Successfully processed pageno:201
2024-12-27 03:56:47,411 - INFO -    1 of 15 -- Successfully processed pageno:202
2024-12-27 03:56:57,857 - INFO -    2 of 15 -- Successfully processed pageno:203
2024-12-27 03:57:07,891 - INFO -    3 of 15 -- Successfully processed pageno:204
2024-12-27 03:57:13,309 - INFO -    4 of 15 -- Successfully processed pageno:205
2024-12-27 03:57:23,046 - INFO -    5 of 15 -- Successfully processed pageno:206
2024-12-27 03:57:32,578 - INFO -    6 of 15 -- Successfully processed pageno:207
2024-12-27 03:57:41,072 - INFO -    7 of 15 -- Successfully processed pageno:208
2024-12-27 03:57:53,267 - INFO -    8 of 15 -- Successfully processed pageno:209
2024-12-27 03:58:12,378 - INFO -    9 of 15 -- Successfully processed pageno:210
2024-12-27 03:58:20,276 - INFO -   10 of 15 -- Successfully processed pageno:211
2024-12-27 03:58:2

In [228]:
pageno = sorted(english_texts_2.keys())[0]
print('pageno:', pageno)

print('--'*22)
print('ORIGINAL:')
print(english_texts[pageno])
print('--'*22)
print('FIXED:')
print(english_texts_2[pageno]) 

pageno: 201
--------------------------------------------
ORIGINAL:

<header>The Winter Masurian Battle.</header>
<body>February 25th departed to Lyk for the 8th Army. At Tauroggen, only the weak Hoffmann detachment secured¹).
In this grouping, the 10th Army had to repel the fierce Russian attacks of the coming days. Constant reinforcement on the enemy side gradually led to a balance of the situation. There were days of bitter local battles, in which the previous victor and attacker were completely pushed into defense.
The intention of the 10th Army High Command, on February 24th, to attack from the Augustowfski Canal to Rygalowka with the XXXVIII and XXXX Reserve Corps as well as the 2nd Infantry Division, did not come to fruition, as in the early morning hours of February 23rd, the Russians succeeded in crossing the lowlands before the 4th Cavalry Division and advancing further towards Lissow and Wrotnik. General Litzmann therefore had to request the support of the 75th Reserve Divisi

In [225]:
pageno = sorted(english_texts_2.keys())[1]
print('pageno:', pageno, ' | ', english_texts_2[pageno] == english_texts[pageno])
print('--'*22)
print('ORIGINAL:')
print(english_texts[pageno])
print('--'*22)
print('FIXED:')
print(english_texts_2[pageno]) 

pageno: 202  |  False
--------------------------------------------
ORIGINAL:

<header>Successful Battles for the Bobr Section.</header>
<body>The 75th Reserve Division was stuck. The enemy fire was so intense that any movement on the 2 to 3 km long dam, which led through the swamp depression, had to be avoided. An attack was only to be launched after thorough artillery preparation.
In the middle section, the 4th Cavalry and 79th Reserve Division succeeded on February 25 in taking Stabbin after fierce local fighting and pushing the Russians back to the south bank. The enemy had cleared the north bank in front of the 80th Reserve Division. General Litzmann intended to advance over Ostrow at dawn on February 27 after consolidating stronger forces.
However, the prospects for success continued to deteriorate. On February 25, the weak front parts of the 75th Reserve Division fell into captivity at Dwugly after bravely enduring enemy fire all day in the snow water of the Bobr swamp without as

---
## Experiments