In [1]:
# # split the pdf into separate pages

# import PyPDF2
# from src.document_generation import setup_logger
# import logging

# logger = logging.getLogger('logger_name')
# if logger.hasHandlers():
#     logger.handlers.clear()  # Clear existing handlers to avoid duplicates
# logger.setLevel(logging.DEBUG)
# handler = logging.StreamHandler()
# formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# handler.setFormatter(formatter)
# logger.addHandler(handler)
# logger.propagate = False

# # Input PDF file path
# input_pdf_path = "../input_data/Der Weltkrieg v7 East Front.pdf"
# output_folder = "../input_data/Der Weltkrieg v7"

# # Ensure the output folder exists
# os.makedirs(output_folder, exist_ok=True)

# # Open the PDF and split pages
# try:
#     with open(input_pdf_path, 'rb') as file:
#         pdf_reader = PyPDF2.PdfReader(file)
#         num_pages = len(pdf_reader.pages)
#         logger.info(f"Extracted {num_pages} pages from PDF")

#         for i, page in enumerate(pdf_reader.pages):
#             # Create a new PDF writer for each page
#             pdf_writer = PyPDF2.PdfWriter()
#             pdf_writer.add_page(page)

#             # Save the current page to a new file
#             output_file_path = os.path.join(output_folder, f"page_{i+1:03d}.pdf")
#             with open(output_file_path, 'wb') as output_file:
#                 pdf_writer.write(output_file)
            
#             logger.debug(f"Saved page {i+1} to {output_file_path}")
#             if i == 10:
#                 break

#     logger.info(f"All pages have been split and saved to {output_folder}")

# except Exception as e:
#     logger.error(f"An error occurred: {e}")


In [2]:
%%time
%load_ext autoreload
%autoreload 2

# Standard Python modules
import time 
import numpy as np
import os
import sys
import json
import re
import glob
import asyncio
import aiohttp
import openai
import logging
from typing import Dict, Tuple, List, Callable

from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from the .env file one directory above the
# notebook (presumably this is where OPENAI_API_KEY, read below via os.getenv,
# is expected to live — confirm).
env_path = Path('../.env')  # Adjust path if needed
load_dotenv(dotenv_path=env_path)

# Put the parent directory on sys.path so the `src.*` project imports below
# resolve. Note: the path is appended again every time this cell is re-run.
sys.path.append(os.path.abspath(".."))

# Display and plotting
from IPython.display import display, HTML, clear_output

# Project imports
from src.utils import timeit, encode_image, plt, pylab
from src.processing import compute_log_spectrum_1d, extract_image_bbox, save_images
from src.api_requests import construct_payload, process_single_page
from src.document_generation import save_document

# Set notebook display width
display(HTML("<style>.container { width:90% !important; }</style>"))

# Print Python environment info
print('sys.executable:', sys.executable)
print('sys.version:', sys.version, '\n')

# Setup for PDF processing: sub-folder name used below to build the input glob
# (../input_data/<foldername>/) and later for the output directory.
foldername = "Der Weltkrieg v7"

# OpenAI API setup.
# os.getenv returns None when OPENAI_API_KEY is unset; the header would then
# read "Bearer None". Never display `headers` in a cell output: it embeds the key.
# NOTE(review): the saved run output prints "In make_claude_request_async", so
# confirm that process_single_page really uses the OpenAI model/headers below.
openai.api_key = os.getenv("OPENAI_API_KEY")
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai.api_key}"
}

# Configuration
model_name = "gpt-4o-2024-08-06"
plotter = False
# Glob for the single-page PDFs. "*.pdf" (with the dot) only matches files that
# carry the .pdf extension; the former "*pdf" also matched any name that merely
# ended in the letters "pdf".
image_path = f"../input_data/{foldername}/*.pdf"
fnames = sorted(glob.glob(image_path))

# Storage for processed texts, keyed by page number string (e.g. "001").
# Re-running this cell resets all three dicts to empty.
raw_german_texts: Dict[str, str] = {}
german_texts: Dict[str, str] = {}
english_texts: Dict[str, str] = {}


sys.executable: /Users/ozkansafak/code/fraktur/.venv/bin/python3
sys.version: 3.10.9 (main, Mar  1 2023, 12:20:14) [Clang 14.0.6 ] 

CPU times: user 1.08 s, sys: 2.28 s, total: 3.36 s
Wall time: 507 ms


---
## Fraktur Translator

In [3]:
# Decorator to log wall time
def log_execution_time(func: Callable):
    """Wrap a coroutine function so that its wall-clock duration is logged.

    Args:
        func: The coroutine function to time.

    Returns:
        An async wrapper that awaits ``func`` with the same arguments, logs
        ``Finished <func name> in <seconds> seconds.`` at INFO level on the
        module-level ``logger`` and returns ``func``'s result. If ``func``
        raises, the exception propagates and nothing is logged.
    """
    # Local import so this cell does not depend on another cell importing it.
    from functools import wraps

    @wraps(func)  # keep __name__/__doc__ of the wrapped coroutine function
    async def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        start = time.perf_counter()
        result = await func(*args, **kwargs)
        logger.info(f"Finished {func.__name__} in {time.perf_counter() - start:.2f} seconds.")
        return result
    return wrapper

# Configure logging
logger = logging.getLogger("time_logger")
if logger.hasHandlers():
    logger.handlers.clear()  # Clear existing handlers to avoid duplicates

logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)

# Don't propagate message to parent loggers
logger.propagate = False 


In [14]:
@log_execution_time
async def main(fnames, extract=True, max_concurrency=1, request_delay=2):
    """Run ``process_single_page`` on every file and collect the returned texts.

    The three text results of each page are stored in the module-level dicts
    ``raw_german_texts``, ``german_texts`` and ``english_texts``, keyed by the
    page number parsed from the file name. A page that fails is logged and
    skipped; it does not abort the remaining pages.

    Args:
        fnames: Paths of the page files; each name must contain
            ``page_<number>.pdf``.
        extract: Forwarded unchanged to ``process_single_page``.
        max_concurrency: Maximum number of pages processed at the same time
            (default 1, the previously hard-coded value).
        request_delay: Seconds to wait before each request, applied while the
            concurrency slot is held (default 2, the previously hard-coded value).

    Returns:
        None. Results are written into the module-level dicts.
    """
    # Create semaphore to limit concurrent API calls
    semaphore = asyncio.Semaphore(max_concurrency)

    async def _process_page(fname: str) -> Tuple[str, tuple]:
        """Process one file and return ``(pageno, result)``."""
        match = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL)
        if match is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError(f"cannot parse a page number from {fname!r}")
        pageno = match.group(1)

        async with semaphore:
            # Add delay between requests even with semaphore
            await asyncio.sleep(request_delay)

            try:
                result = await process_single_page(fname, model_name, headers, plotter, pageno, extract)
            except Exception as e:
                # Attach the page so the caller's error log says which page failed.
                raise RuntimeError(f"page:{pageno}: {e}") from e
            logger.info(f"Successfully processed page:{pageno}")
            return pageno, result

    # Create list of coroutine objects
    tasks = [_process_page(fname) for fname in fnames]

    # Process tasks as they complete (completion order, not input order)
    for completed_task in asyncio.as_completed(tasks):
        try:
            # The first element of the result is not used here.
            pageno, (_content, raw_german_text, german_text, english_text) = await completed_task
            raw_german_texts[pageno] = raw_german_text
            german_texts[pageno] = german_text
            english_texts[pageno] = english_text
            logger.info(f"Completed processing page:{pageno}")
        except Exception as e:
            logger.error(f"Error processing a task: {e}")

# Run the async code on the first two files only (fnames[:2]).
# Top-level `await` is used directly in the cell, as the notebook runs it.
await main(fnames[:2])


In make_claude_request_async


2024-12-20 12:56:41,823 - INFO - Successfully processed page:002
2024-12-20 12:56:41,825 - INFO - Completed processing page:002


In make_claude_request_async


2024-12-20 12:56:51,150 - INFO - Successfully processed page:001
2024-12-20 12:56:51,151 - INFO - Completed processing page:001
2024-12-20 12:56:51,152 - INFO - Finished main in 18.24 seconds.


In [15]:
# Display the raw transcriptions collected so far (dict keyed by page number).
# NOTE(review): the saved output also lists page '005' although the cell above
# only processes fnames[:2] — presumably left over from an earlier run; confirm.
raw_german_texts

{'001': '\nDer Weltkrieg\n1914 bis 1918\nBearbeitet im\nReichsarchiv\n*\nDie militärischen Operationen zu Land\nSiebenter Band\nVerlegt bei E. S. Mittler & Sohn\nBerlin im Jahre 1931\n',
 '002': '\nDie Operationen des Jahres 1915\nDie Ereignisse im Winter und Frühjahr\nMit vierzig Karten und Skizzen\nVerlegt bei E. S. Mittler & Sohn\nBerlin im Jahre 1931\n',
 '005': '\nInhaltsverzeichnis.\nDie Operationen des Jahres 1915.\nDie Ereignisse im Winter und Frühjahr.\nDie Frage des Schwerpunktes der Kriegführung im Januar 1915 . . . . . . . . . . . . . . . . . . . . . 1\nDer Feldzug im Westen bis Mitte April 1915 . . . . 16\n1. Erwägungen und Maßnahmen der deutschen Obersten Heeresleitung . 16\n2. Die Kampfvorgänge an der Westfront bis Mitte Februar 1915 . . . 21\n3. Die französische und englische Heerführung Anfang 1915 . . . . . 35\n4. Der Höhepunkt der Winterschlacht in der Champagne von Mitte Februar bis Mitte März 1915 . . . . . . . . . . . . . . . 41\n[Content continues as in image...]

In [16]:
def get_missing_keys(raw_german_texts, min_length=10):
    """Return the sorted page keys whose raw transcription is missing or too short.

    Args:
        raw_german_texts: Mapping of page key -> raw transcription text.
        min_length: A text shorter than this many characters counts as missing.
            Empty or None values always count as missing.

    Returns:
        List of the affected keys in sorted order. Each key is also printed
        together with its text.
    """
    missing = [
        key for key in sorted(raw_german_texts)
        if not raw_german_texts[key] or len(raw_german_texts[key]) < min_length
    ]
    for key in missing:
        print('key:', key, 'text:', raw_german_texts[key])
    return missing


# Hard-coded set of pages to re-process (it equals the set shown by the
# re-check cell further down). To recompute it from the current results, use:
# missing_keys = set(get_missing_keys(raw_german_texts))
missing_keys = {"333", "405", "417", "419", "421", "423", "431"}
# ---------------------

# Collect the files whose page number is listed in missing_keys. The page
# number is the text between an underscore and ".pdf" in the file's base name.
missed_fnames = []
for fname in fnames:
    # os.path.basename handles the platform's path separator, unlike split('/').
    match = re.search(r'_(.*?)\.pdf', os.path.basename(fname))
    if match is None:
        # Name does not follow the <prefix>_<number>.pdf pattern; skip it
        # instead of crashing on match.group().
        continue
    page_number = match.group(1)
    if page_number in missing_keys:
        print(f"page_number {page_number}")
        missed_fnames.append(fname)
print("missed_fnames:")
print(missed_fnames)

# Re-run only the missed pages, this time with extract=False, i.e. without
# performing the FFT-based extraction.
await main(missed_fnames, extract=False)


page_number 333
page_number 405
page_number 417
page_number 419
page_number 421
page_number 423
page_number 431
missed_fnames:
['../input_data/Der Weltkrieg v7/page_333.pdf', '../input_data/Der Weltkrieg v7/page_405.pdf', '../input_data/Der Weltkrieg v7/page_417.pdf', '../input_data/Der Weltkrieg v7/page_419.pdf', '../input_data/Der Weltkrieg v7/page_421.pdf', '../input_data/Der Weltkrieg v7/page_423.pdf', '../input_data/Der Weltkrieg v7/page_431.pdf']
In make_claude_request_async


2024-12-20 12:57:18,101 - INFO - Pageno: 405, "raw_german" section was not found
2024-12-20 12:57:18,103 - INFO - Pageno: 405, "german" section was not found
2024-12-20 12:57:18,104 - INFO - Pageno: 405, "english" section was not found
2024-12-20 12:57:18,108 - INFO - Successfully processed page:405
2024-12-20 12:57:18,111 - INFO - Completed processing page:405


In make_claude_request_async


2024-12-20 12:57:31,621 - INFO - Successfully processed page:417
2024-12-20 12:57:31,624 - INFO - Completed processing page:417


In make_claude_request_async


2024-12-20 12:57:52,508 - INFO - Successfully processed page:421
2024-12-20 12:57:52,511 - INFO - Completed processing page:421


In make_claude_request_async


2024-12-20 12:58:07,357 - INFO - Successfully processed page:431
2024-12-20 12:58:07,360 - INFO - Completed processing page:431


In make_claude_request_async


2024-12-20 12:58:42,481 - INFO - Successfully processed page:333
2024-12-20 12:58:42,484 - INFO - Completed processing page:333


In make_claude_request_async


2024-12-20 12:59:01,937 - INFO - Successfully processed page:419
2024-12-20 12:59:01,939 - INFO - Completed processing page:419


In make_claude_request_async


2024-12-20 12:59:22,111 - INFO - Successfully processed page:423
2024-12-20 12:59:22,114 - INFO - Completed processing page:423
2024-12-20 12:59:22,115 - INFO - Finished main in 130.42 seconds.


In [49]:
# Re-check which pages get_missing_keys still reports for the current results.
# Requires get_missing_keys to be defined in the running kernel.
set(get_missing_keys(raw_german_texts)) 

333 
405 

417 
419 
421 
423 
431 


{'333', '405', '417', '419', '421', '423', '431'}

In [8]:
# Save json files and .docx files.

from src.document_generation import save_document

# save json outputs
output_dir = f'../output_data/{foldername}'
# exist_ok=True makes the cell safely re-runnable without a separate
# exists() check (which could race with another process creating the folder).
os.makedirs(output_dir, exist_ok=True)

# One JSON file per text collection, named after the variable it stores.
for json_stem, page_texts in (
    ('english_texts', english_texts),
    ('german_texts', german_texts),
    ('raw_german_texts', raw_german_texts),
):
    # json.dump escapes non-ASCII characters by default, so the file content
    # is unchanged; the explicit encoding just removes the platform dependency.
    with open(f'{output_dir}/{json_stem}.json', 'w', encoding='utf-8') as f:
        json.dump(page_texts, f)

# Build the German and English documents from the structured texts.
doc1, fname1 = save_document(german_texts, foldername, language='German')
doc2, fname2 = save_document(english_texts, foldername, language='English')


2024-11-30 20:41:21,161 - INFO - Page 001
2024-11-30 20:41:21,162 - INFO - section_type: header
2024-11-30 20:41:21,163 - INFO - section_type: body
2024-11-30 20:41:21,163 - INFO - section_type: footer
2024-11-30 20:41:21,164 - INFO - Page 002
2024-11-30 20:41:21,165 - INFO - section_type: header
2024-11-30 20:41:21,166 - INFO - section_type: body
2024-11-30 20:41:21,167 - INFO - section_type: footer
2024-11-30 20:41:21,168 - INFO - Page 003
2024-11-30 20:41:21,169 - INFO - section_type: header
2024-11-30 20:41:21,170 - INFO - section_type: body
2024-11-30 20:41:21,171 - INFO - Page 004
2024-11-30 20:41:21,172 - INFO - section_type: header
2024-11-30 20:41:21,173 - INFO - section_type: body
2024-11-30 20:41:21,174 - INFO - Page 005
2024-11-30 20:41:21,175 - INFO - section_type: header
2024-11-30 20:41:21,176 - INFO - section_type: body
2024-11-30 20:41:21,179 - INFO - Page 006
2024-11-30 20:41:21,180 - INFO - section_type: header
2024-11-30 20:41:21,180 - INFO - section_type: body
2024

``` 
1.  Upload the input folder of PDFs to blob storage (S3).
2.  Read a file from S3.
3.  FFT in y -> (x_hi, x_lo); write half_cropped_image to S3.
4.  FFT in x -> (y_hi, y_lo); write cropped_image to S3.
5.  Read the cropped image from S3 -> encode_image -> transcribe and translate -> JSON output.

```

### Available models and pricing:
```
"gpt-4o-2024-08-06":
    "price_txt": "$2.50 / 1M input tokens"
    "price_img": "$0.001913 / 1500px^2"
    
"gpt-4o-mini-2024-07-18":
    "price_txt": "$0.150 / 1M input tokens"
    "price_img": "$0.003825 / 1500px^2"
    
```

---
## Experiments

In [55]:
# Hand-written request payload for experimenting with the prompt.
# NOTE(review): base64_image is None here, so the f-string in the image part
# below produces the literal URL 'data:image/jpeg;base64,None' (see the
# inspection cells' output). Assign a real base64-encoded JPEG string before
# sending this payload anywhere.
base64_image = None
payload = {
    "model": model_name,
    "messages": [
      # System message: assigns the three roles (OCR, header/body/footer
      # identification, German -> English translation).
      {"role": "system", "content": "You have three roles. First of all you are a professional OCR assistant. "
       "Secondly, you identify the parts of your transcriptions to belong to header, body and footer sections. "
       "Lastly, you are a GERMAN to ENGLISH translator that stays loyal to the style and "
       "character of the original German text."
      },
      # User message: the three-step instructions followed by the page image.
      {
        "role": "user",
        "content": [
          {
            "type": "text",
            "text": 
"""Instructions:

You are to perform three steps on the provided image of a document.

**Step 1: OCR Transcription**

Task: Transcribe the entire text from the image into German, including all Fraktur characters.

Attention: Pay close attention to accurately capturing all text elements.

Attention 2: Make sure you're reading each line only once. 

Formatting: Wrap the entire transcription in <raw_german></raw_german> tags.

Caution: Pay attention to identify the paragraphs as a whole and not erroneously place a carriage return at the end of each line.

Separator: When you are done with Step 1, print the separator line:

--------------------------------------------------------------------
**Step 2: Header-Body-Footer Analysis**

Review: Look at the image and your transcription from Step 1.

Verification: Ensure you haven't missed any parts; if you did, transcribe and include them now.

Caution: Pay attention to identify the paragraphs as a whole and not erroneously place a carriage return at the end of each line.

Categorization:

Header: If you detect a header (e.g., chapter title or section heading), wrap it inside <header></header> tags. If there's no header, omit the <header></header> tags.
Body: Wrap the main body of the text inside <body></body> tags.
Footer: If you detect any footnotes, wrap them inside <footer></footer> tags. If there are no footnotes, omit the <footer></footer> tags.
Formatting: Wrap this structured transcription inside <german></german> tags.

Separator: When you are done with Step 2, print the separator line again:

--------------------------------------------------------------------
**Step 3: Translation (German to English)**

Task: Translate the structured German text from Step 2 into English.
Structure: Maintain the same <header>, <body>, and <footer> sections in your translation.
Formatting: Wrap the translated text inside <english></english> tags.

Example Output Format:

<raw_german>
... (transcribed German text) ...
</raw_german>
--------------------------------------------------------------------
<german>
<header> ... </header>
<body> ... </body>
<footer> ... </footer>
</german>
--------------------------------------------------------------------
<english>
<header> ... </header>
<body> ... </body>
<footer> ... </footer>
</english>"""                

          },
          {
            "type": "image_url",
            "image_url": {
              "url": f"data:image/jpeg;base64,{base64_image}"
            }
          }
        ]
      }
    ],
    "max_tokens": 6000,
    "temperature": 0.1
}


In [54]:
# List the top-level keys of the request payload.
payload.keys()


dict_keys(['model', 'messages', 'max_tokens', 'temperature'])

In [78]:
# Show the second content item of the user message (the image_url part).
payload['messages'][1]['content'][1]

{'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,None'}}

In [102]:
# Show the image_url part of the user message again.
payload['messages'][1]['content'][1]

{'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,None'}}