In [1]:
# # Disabled helper cell (kept for reference): split the source PDF into one PDF file per page

# import PyPDF2
# from src.document_generation import setup_logger
# import logging

# logger = logging.getLogger('logger_name')
# if logger.hasHandlers():
#     logger.handlers.clear()  # Clear existing handlers to avoid duplicates
# logger.setLevel(logging.DEBUG)
# handler = logging.StreamHandler()
# formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# handler.setFormatter(formatter)
# logger.addHandler(handler)
# logger.propagate = False

# # Input PDF file path
# input_pdf_path = "../input_data/Der Weltkrieg v7 East Front.pdf"
# output_folder = "../input_data/Der Weltkrieg v7"

# # Ensure the output folder exists
# os.makedirs(output_folder, exist_ok=True)

# # Open the PDF and split pages
# try:
#     with open(input_pdf_path, 'rb') as file:
#         pdf_reader = PyPDF2.PdfReader(file)
#         num_pages = len(pdf_reader.pages)
#         logger.info(f"Extracted {num_pages} pages from PDF")

#         for i, page in enumerate(pdf_reader.pages):
#             # Create a new PDF writer for each page
#             pdf_writer = PyPDF2.PdfWriter()
#             pdf_writer.add_page(page)

#             # Save the current page to a new file
#             output_file_path = os.path.join(output_folder, f"page_{i+1:03d}.pdf")
#             with open(output_file_path, 'wb') as output_file:
#                 pdf_writer.write(output_file)
            
#             logger.debug(f"Saved page {i+1} to {output_file_path}")
#             if i == 10:
#                 break

#     logger.info(f"All pages have been split and saved to {output_folder}")

# except Exception as e:
#     logger.error(f"An error occurred: {e}")


In [1]:
%%time
%load_ext autoreload
%autoreload 2

# Standard Python modules
import asyncio
import functools
import glob
import json
import logging
import os
import re
import sys
import time
from pathlib import Path
from typing import Dict, Tuple, List, Callable

# Third-party modules
import aiohttp
import numpy as np
import openai
from dotenv import load_dotenv

# Load environment variables (e.g. OPENAI_API_KEY) from the .env file
env_path = Path('../.env')  # Adjust path if needed
load_dotenv(dotenv_path=env_path)

# Make the project root importable (required by the `src.*` imports below).
# Guarded so that re-running this cell does not keep appending duplicates.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

# Display and plotting
from IPython.display import display, HTML, clear_output

# Project imports
from src.utils import timeit, encode_image, plt, pylab
from src.processing import compute_log_spectrum_1d, extract_image_bbox, save_images
from src.api_requests import construct_payload, process_single_page
from src.document_generation import save_document

# Set notebook display width
display(HTML("<style>.container { width:90% !important; }</style>"))

# Print Python environment info
print('sys.executable:', sys.executable)
print('sys.version:', sys.version, '\n')

# Setup for PDF processing: name of the sub-folder (under ../input_data and
# ../output_data) that holds the single-page PDFs and receives the results.
foldername = "Der Weltkrieg v8"

# OpenAI API setup
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail fast: otherwise every request would be sent with
    # "Authorization: Bearer None" and fail one by one much later.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to the .env file or the environment."
    )
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai.api_key}"
}

# Configuration
model_name = "gpt-4o-2024-08-06"
plotter = False
# "*.pdf" (with the dot) so that only real .pdf files are picked up.
image_path = f"../input_data/{foldername}/*.pdf"
fnames = sorted(glob.glob(image_path))
if not fnames:
    print(f"WARNING: no PDF files matched {image_path!r}")

# Storage for processed texts, keyed by page number string.
# NOTE: re-running this cell resets all three dictionaries.
raw_german_texts: Dict[str, str] = {}
german_texts: Dict[str, str] = {}
english_texts: Dict[str, str] = {}


sys.executable: /Users/ozkansafak/code/fraktur/.venv/bin/python3
sys.version: 3.10.9 (main, Mar  1 2023, 12:20:14) [Clang 14.0.6 ] 

CPU times: user 1.02 s, sys: 2.4 s, total: 3.42 s
Wall time: 691 ms


---
## Fraktur Translator

In [2]:
# Decorator to log execution time
def log_execution_time(func: Callable):
    """Wrap a coroutine function so that each awaited call logs its duration.

    The duration is written at INFO level to the module-level ``logger`` once
    the wrapped coroutine has returned. If the coroutine raises, the exception
    propagates and nothing is logged.

    Args:
        func: The coroutine function (``async def``) to time.

    Returns:
        A coroutine function with the same call signature and metadata
        (``__name__``, ``__doc__``) as ``func``.
    """
    @functools.wraps(func)  # keep the wrapped function's name and docstring
    async def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measurement is immune to
        # system clock adjustments.
        start = time.perf_counter()
        result = await func(*args, **kwargs)
        logger.info(f"Finished {func.__name__} in {time.perf_counter() - start:.2f} seconds.")
        return result
    return wrapper

# Configure logging: a dedicated logger that writes timestamped lines to a stream handler
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

logger = logging.getLogger("time_logger")
# Drop handlers left over from a previous run of this cell; otherwise every
# message would be emitted once per accumulated handler.
logger.handlers.clear()
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Keep records from reaching ancestor loggers
logger.propagate = False


In [5]:
@log_execution_time
async def main(fnames, extract=True):
    """Process every page file in ``fnames`` concurrently and store the texts.

    Each file is passed to ``process_single_page``; a semaphore keeps at most
    five pages in flight at once. For every page that succeeds, its texts are
    written into the notebook-level dictionaries ``raw_german_texts``,
    ``german_texts`` and ``english_texts`` under the page number parsed from
    the file name.

    Args:
        fnames: Paths of page files whose name contains ``page_<number>.pdf``.
        extract: Forwarded unchanged to ``process_single_page`` (the retry
            cell passes ``False`` to skip the FFT based extraction).

    Returns:
        None. A failing page is logged together with its file name and does
        not stop the remaining pages.
    """
    # Create semaphore to limit concurrent API calls
    semaphore = asyncio.Semaphore(5)  # Adjust number based on API limits

    async def _process_page(fname: str) -> Tuple:
        """Process one file; return (pageno, raw_german, german, english)."""
        match = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL)
        if match is None:
            raise ValueError(f"cannot parse a page number from file name {fname!r}")
        pageno = match.group(1)

        async with semaphore:
            logger.info(f"Processing page:{pageno}")
            try:
                # Unpack here so that a malformed result is also reported
                # with the page it belongs to.
                _content, raw_german_text, german_text, english_text = await process_single_page(
                    fname, model_name, headers, plotter, pageno, extract
                )
            except Exception as e:
                # Attach page/file context; the consumer loop below cannot
                # otherwise tell which task an exception came from.
                raise RuntimeError(f"page:{pageno} ({fname}): {e!r}") from e
            return pageno, raw_german_text, german_text, english_text

    # Create list of coroutine objects
    tasks = [_process_page(fname) for fname in fnames]

    # Process tasks as they complete
    for completed_task in asyncio.as_completed(tasks):
        try:
            pageno, raw_german_text, german_text, english_text = await completed_task
        except Exception as e:
            logger.error(f"Error processing a task: {e}")
            continue

        raw_german_texts[pageno] = raw_german_text
        german_texts[pageno] = german_text
        english_texts[pageno] = english_text
        logger.info(f"Completed processing page:{pageno}")

# Run the async code on every page file. Top-level `await` works in a notebook
# cell because IPython executes cells with autoawait enabled.
await main(fnames)


2024-11-29 18:08:23,390 - INFO - Processing page:226
2024-11-29 18:08:24,816 - INFO - Processing page:371
2024-11-29 18:08:26,112 - INFO - Processing page:227
2024-11-29 18:08:27,389 - INFO - Processing page:372
2024-11-29 18:08:28,701 - INFO - Processing page:228
2024-11-29 18:08:46,142 - INFO - Processing page:373
2024-11-29 18:08:47,547 - INFO - Completed processing page:371
2024-11-29 18:08:47,550 - INFO - Processing page:229
2024-11-29 18:08:48,873 - INFO - Completed processing page:372
2024-11-29 18:08:49,512 - INFO - Processing page:374
2024-11-29 18:08:50,876 - INFO - Completed processing page:226
2024-11-29 18:08:50,886 - INFO - Processing page:230
2024-11-29 18:08:52,354 - INFO - Completed processing page:227
2024-11-29 18:08:52,602 - INFO - Processing page:375
2024-11-29 18:08:54,071 - INFO - Completed processing page:228
2024-11-29 18:09:10,636 - INFO - Processing page:231
2024-11-29 18:09:12,022 - INFO - Completed processing page:373
2024-11-29 18:09:12,027 - INFO - Proces

In [6]:
def get_missing_keys(raw_german_texts, min_length=10):
    """Return the keys whose stored text is missing or suspiciously short.

    A key counts as missing when its value is ``None`` or has fewer than
    ``min_length`` characters. Every such key is printed together with its
    value.

    Args:
        raw_german_texts: Mapping from page key to text.
        min_length: Minimum number of characters for a text to be accepted.

    Returns:
        The missing keys as a list, in sorted order.
    """
    missing_keys = []
    for key in sorted(raw_german_texts):
        text = raw_german_texts[key]
        if text is None or len(text) < min_length:
            print(key, text)
            missing_keys.append(key)
    return missing_keys

missing_keys = set(get_missing_keys(raw_german_texts))
# ---------------------

missed_fnames = []
for fname in fnames:
    pageno = fname.split('/')[-1]
    match = re.search(r'_(.*?)\.pdf', pageno)
    page_number = match.group(1)
    if page_number in missing_keys:
        print(f"page_number {page_number}")
        
        missed_fnames.append(fname)
print("missed_fnames:")
print(missed_fnames)

# Rerun the missing pages without performing FFT based extraction. 
await main(missed_fnames, extract=False)


2024-11-29 18:57:49,377 - INFO - Processing page:008


004 
006 
007 
008 
116 
240 
476 
page_number 004
page_number 006
page_number 007
page_number 008
page_number 116
page_number 240
page_number 476
missed_fnames:
['../input_data/Der Weltkrieg v8/page_004.pdf', '../input_data/Der Weltkrieg v8/page_006.pdf', '../input_data/Der Weltkrieg v8/page_007.pdf', '../input_data/Der Weltkrieg v8/page_008.pdf', '../input_data/Der Weltkrieg v8/page_116.pdf', '../input_data/Der Weltkrieg v8/page_240.pdf', '../input_data/Der Weltkrieg v8/page_476.pdf']


2024-11-29 18:57:50,971 - INFO - Processing page:240
2024-11-29 18:57:52,458 - INFO - Processing page:116
2024-11-29 18:57:53,956 - INFO - Processing page:476
2024-11-29 18:57:55,431 - INFO - Processing page:006
2024-11-29 18:58:06,572 - INFO - Processing page:004
2024-11-29 18:58:08,174 - INFO - Completed processing page:008
2024-11-29 18:58:20,707 - INFO - Processing page:007
2024-11-29 18:58:22,510 - INFO - Completed processing page:006
2024-11-29 18:58:22,513 - INFO - Completed processing page:116
2024-11-29 18:58:24,286 - INFO - Completed processing page:240
2024-11-29 18:58:26,230 - INFO - Completed processing page:004
2024-11-29 18:58:56,643 - INFO - Completed processing page:476
2024-11-29 18:59:26,270 - INFO - Pageno: 007, "raw_german" section was not found
2024-11-29 18:59:26,272 - INFO - Pageno: 007, "german" section was not found
2024-11-29 18:59:26,273 - INFO - Pageno: 007, "english" section was not found
2024-11-29 18:59:26,277 - INFO - Completed processing page:007
2024-

In [8]:
# Save json files and .docx files.

# Build the output location once; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
output_dir = f'../output_data/{foldername}'
os.makedirs(output_dir, exist_ok=True)

# save json outputs
with open(f'{output_dir}/english_texts.json', 'w', encoding='utf-8') as f:
    json.dump(english_texts, f)
with open(f'{output_dir}/german_texts.json', 'w', encoding='utf-8') as f:
    json.dump(german_texts, f)

# save_document is imported in the setup cell at the top of the notebook.
doc1, fname1 = save_document(german_texts, foldername, language='German')
doc2, fname2 = save_document(english_texts, foldername, language='English')


2024-11-30 20:41:21,161 - INFO - Page 001
2024-11-30 20:41:21,162 - INFO - section_type: header
2024-11-30 20:41:21,163 - INFO - section_type: body
2024-11-30 20:41:21,163 - INFO - section_type: footer
2024-11-30 20:41:21,164 - INFO - Page 002
2024-11-30 20:41:21,165 - INFO - section_type: header
2024-11-30 20:41:21,166 - INFO - section_type: body
2024-11-30 20:41:21,167 - INFO - section_type: footer
2024-11-30 20:41:21,168 - INFO - Page 003
2024-11-30 20:41:21,169 - INFO - section_type: header
2024-11-30 20:41:21,170 - INFO - section_type: body
2024-11-30 20:41:21,171 - INFO - Page 004
2024-11-30 20:41:21,172 - INFO - section_type: header
2024-11-30 20:41:21,173 - INFO - section_type: body
2024-11-30 20:41:21,174 - INFO - Page 005
2024-11-30 20:41:21,175 - INFO - section_type: header
2024-11-30 20:41:21,176 - INFO - section_type: body
2024-11-30 20:41:21,179 - INFO - Page 006
2024-11-30 20:41:21,180 - INFO - section_type: header
2024-11-30 20:41:21,180 - INFO - section_type: body
2024

In [10]:
# Spot-check: display the German text stored under page key '106'.
german_texts['106']

'\n<header>Der Sommerfeldzug der Verbündeten in Galizien.</header>\n<body>noch bedenklicher ist, als sie es schon bisher war. Dennoch scheint mir die unmittelbare Verstärkung unserer Truppen dort nicht angezeigt zu sein. Nur durch Einsatz von außerordentlich viel Blut und Munition würde an einen Erfolg zu denken sein. Wir haben aber allen Anlaß, mit beidem sparsam umzugehen. Näher liegt es, eine Operation gegen die durch Heraussiehen von Verbänden geschwächten Teile der feindlichen Front in Rechnung zu ziehen, um ihrerseits Erleichterung zu bringen. Da die Narewfront in dieser Verbindung keine Rolle spielt, daß jede Offensive dort selbst durch schwächere Kräfte am Sumpfabschnitt leicht zu unabhängbarem Aufenthalt gebracht werden kann, kommt, wie der General Ludendorff bei seiner Besprechung mit mir ja auch erwähnte, nur die Weichsel-Front südlich der Pilica in Frage. Nach meiner Ansicht unterliegt es keinem Zweifel, daß ein Einsatz von drei bis vier frischen Divisionen die dortige fein

``` 
1.  Upload the input folder of PDFs to blob storage (S3).
2.  Read each file from S3.
3.  FFT in y -> (x_hi, x_lo); write half_cropped_image to S3.
4.  FFT in x -> (y_hi, y_lo); write cropped_image to S3.
5.  Read the cropped image from S3 -> encode_image -> transcribe and translate -> JSON output.

```

### Available models and pricing:
```
"gpt-4o-2024-08-06":
    "price_txt": "$2.50 / 1M input tokens"
    "price_img": "$0.001913 / 1500px^2"
    
"gpt-4o-mini-2024-07-18":
    "price_txt": "$0.150 / 1M input tokens"
    "price_img": "$0.003825 / 1500px^2"
    
```