In [13]:
# Show the kernel's working directory: every relative path used in this
# notebook (e.g. ../.env, ../input_data) is resolved against it.
# An explicit call is used instead of the bare `pwd` automagic, which only
# works in a single-line cell and when no variable named `pwd` exists.
import os
os.getcwd()

'/Users/ozkansafak/code/fraktur/notebooks'

In [15]:
%%time
# Time the whole setup cell and enable autoreload so that edits to imported
# modules are picked up without restarting the kernel (mode 2 reloads all
# modules before each execution). Re-running this cell only prints an
# "already loaded" notice from %load_ext, which is harmless.
%load_ext autoreload
%autoreload 2

# Standard Python modules
import time 
import numpy as np
import os
import sys
import json
import re
import glob
import asyncio
import aiohttp
import openai
from typing import Dict, Tuple, List

from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from the .env file; the path is relative to the
# current working directory. OPENAI_API_KEY is read from the environment
# further down in this cell.
env_path = Path('../.env')  # Adjust path if needed
load_dotenv(dotenv_path=env_path)

# Put the parent directory on sys.path so the `src.*` project imports below resolve
sys.path.append(os.path.abspath(".."))

# Display and plotting
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display is a deprecated path.
from IPython.display import HTML, clear_output, display

# Project imports
from src.utils import timeit, encode_image, plt, pylab
from src.processing import compute_log_spectrum_1d, extract_image_bbox, save_images
from src.api_requests import construct_payload, process_single_page
from src.document_generation import save_document

# Set notebook display width (injects a CSS rule for the `.container` element)
display(HTML("<style>.container { width:90% !important; }</style>"))

# Print Python environment info so the interpreter behind the kernel is visible
print('sys.executable:', sys.executable)
print('sys.version:', sys.version, '\n')

# Setup for PDF processing
foldername = "Der Weltkrieg v8"
page_0 = 0  # page number given to the first PDF in sorted order

# Rename the split PDFs to page_NNNN.pdf, numbered in sorted filename order.
pdfpaths = sorted(glob.glob(f"../input_data/{foldername}/*.pdf"))

# Plan first, rename afterwards. os.rename silently replaces an existing
# target, so renaming in a single pass can destroy pages when the folder
# already contains some page_NNNN.pdf files or when page_0 changed between runs.
rename_plan = []
for i, pdfpath in enumerate(pdfpaths):
    dirname, pdfname = os.path.split(pdfpath)
    new_path = os.path.join(dirname, f"page_{i + page_0:04d}.pdf")
    if new_path != pdfpath:  # already correctly named: nothing to do
        rename_plan.append((pdfpath, pdfpath + ".renaming", new_path))

# Phase 1: move every file that has to change out of the way ...
for pdfpath, tmp_path, _ in rename_plan:
    os.rename(pdfpath, tmp_path)

# Phase 2: ... then move each one to its final name; no target can be occupied now.
for pdfpath, tmp_path, new_path in rename_plan:
    print(f"Renaming {os.path.basename(pdfpath)} to {os.path.basename(new_path)}")
    os.rename(tmp_path, new_path)

# OpenAI API setup
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Without this check the header below would read "Bearer None" and every
    # request would be sent with an invalid credential.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to the .env file loaded above "
        "or export it in the environment before running this cell."
    )

# HTTP headers sent with each API request
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai.api_key}"
}

# Configuration
model_name = "gpt-4o-2024-08-06"
plotter = False  # forwarded to process_single_page as its `plotter` argument

# Glob pattern for the per-page PDFs. The dot is required so that only real
# ".pdf" files are selected, matching both the rename step and the
# `page_(...)\.pdf` pattern used to extract page numbers.
image_path = f"../input_data/{foldername}/*.pdf"
fnames = sorted(glob.glob(image_path))

# Storage for processed texts, keyed by page number string.
# Note: re-running this cell resets all three dictionaries.
raw_german_texts: Dict[str, str] = {}
german_texts: Dict[str, str] = {}
english_texts: Dict[str, str] = {}


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




sys.executable: /Users/ozkansafak/code/fraktur/.venv/bin/python3
sys.version: 3.10.9 (main, Mar  1 2023, 12:20:14) [Clang 14.0.6 ] 

Renaming 001.pdf to page_0000.pdf
Renaming 002.pdf to page_0001.pdf
Renaming 003.pdf to page_0002.pdf
Renaming 004.pdf to page_0003.pdf
Renaming 005.pdf to page_0004.pdf
Renaming 006.pdf to page_0005.pdf
Renaming 007.pdf to page_0006.pdf
Renaming 008.pdf to page_0007.pdf
Renaming 009.pdf to page_0008.pdf
Renaming 010.pdf to page_0009.pdf
Renaming 011.pdf to page_0010.pdf
Renaming 012.pdf to page_0011.pdf
Renaming 013.pdf to page_0012.pdf
Renaming 014.pdf to page_0013.pdf
Renaming 015.pdf to page_0014.pdf
Renaming 016.pdf to page_0015.pdf
Renaming 017.pdf to page_0016.pdf
Renaming 018.pdf to page_0017.pdf
Renaming 019.pdf to page_0018.pdf
Renaming 020.pdf to page_0019.pdf
Renaming 021.pdf to page_0020.pdf
Renaming 022.pdf to page_0021.pdf
Renaming 023.pdf to page_0022.pdf
Renaming 024.pdf to page_0023.pdf
Renaming 025.pdf to page_0024.pdf
Renaming 026.pdf 

---
## Fraktur Translator

In [16]:
async def main():
    """Process every file in ``fnames`` concurrently and store the texts.

    Each file is handed to ``process_single_page``; a semaphore keeps at most
    five of those calls in flight at a time. For every page that succeeds, the
    raw German, German and English texts are written to the notebook-level
    dictionaries ``raw_german_texts``, ``german_texts`` and ``english_texts``,
    keyed by the page number captured from the file name and inserted in the
    order of ``fnames``.

    A page that raises (unparseable file name, failed request, unexpected
    result shape) is reported at the end and skipped, so a single bad page no
    longer discards the results of all the others.

    Returns:
        None. Results are delivered through the three dictionaries.
    """
    # Create semaphore to limit concurrent API calls
    semaphore = asyncio.Semaphore(5)  # Adjust number based on API limits
    start_time = time.time()

    async def process_page(fname: str) -> Tuple[str, str, str, str]:
        """Process one file and return (pageno, raw_german, german, english)."""
        match = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL)
        if match is None:
            raise ValueError(f"Cannot extract a page number from {fname!r}")
        pageno = match.group(1)

        async with semaphore:
            print(f"\n{'>'*10} Processing page:{pageno}", end="")
            result = await process_single_page(fname, model_name, headers, plotter, pageno)

        # The first element (the full response content) is not stored.
        _content, raw_german_text, german_text, english_text = result
        return pageno, raw_german_text, german_text, english_text

    # Run all pages; return_exceptions=True turns a failure into a per-page
    # result instead of propagating the first exception out of gather().
    outcomes = await asyncio.gather(
        *(process_page(fname) for fname in fnames),
        return_exceptions=True,
    )

    # gather() preserves input order, so the dictionaries are filled in the
    # order of `fnames` regardless of which request finished first.
    failures = []
    for fname, outcome in zip(fnames, outcomes):
        if isinstance(outcome, BaseException):
            failures.append((fname, outcome))
            continue
        pageno, raw_german_text, german_text, english_text = outcome
        raw_german_texts[pageno] = raw_german_text
        german_texts[pageno] = german_text
        english_texts[pageno] = english_text

    if failures:
        print(f"\n{len(failures)} page(s) failed and were skipped:")
        for fname, exc in failures:
            print(f"  {fname}: {exc!r}")

    total_time = time.time() - start_time
    print(f"\nTotal Runtime: {int(total_time//60)} mins {(total_time%60):.2f} secs")

# Run the async code. The bare top-level `await` relies on IPython/Jupyter
# executing cells with an event loop available (autoawait); in a plain Python
# script this would have to be asyncio.run(main()) instead.
await main()



>>>>>>>>>> Processing page:0000
>>>>>>>>>> Processing page:0001
>>>>>>>>>> Processing page:0002
>>>>>>>>>> Processing page:0003
>>>>>>>>>> Processing page:0004
>>>>>>>>>> Processing page:0005
>>>>>>>>>> Processing page:0006
>>>>>>>>>> Processing page:0007
>>>>>>>>>> Processing page:0008
>>>>>>>>>> Processing page:0009
>>>>>>>>>> Processing page:0010

2024-11-17 21:22:04,733 - INFO - "raw_german" section was not found
2024-11-17 21:22:04,735 - INFO - "german" section was not found
2024-11-17 21:22:04,737 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0011

2024-11-17 21:22:09,361 - INFO - "raw_german" section was not found
2024-11-17 21:22:09,363 - INFO - "german" section was not found
2024-11-17 21:22:09,363 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0012
>>>>>>>>>> Processing page:0013

2024-11-17 21:22:12,952 - INFO - "raw_german" section was not found
2024-11-17 21:22:12,952 - INFO - "german" section was not found
2024-11-17 21:22:12,953 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0014

2024-11-17 21:22:19,602 - INFO - "raw_german" section was not found
2024-11-17 21:22:19,603 - INFO - "german" section was not found
2024-11-17 21:22:19,604 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0015
>>>>>>>>>> Processing page:0016
>>>>>>>>>> Processing page:0017
>>>>>>>>>> Processing page:0018
>>>>>>>>>> Processing page:0019
>>>>>>>>>> Processing page:0020
>>>>>>>>>> Processing page:0021
>>>>>>>>>> Processing page:0022
>>>>>>>>>> Processing page:0023
>>>>>>>>>> Processing page:0024
>>>>>>>>>> Processing page:0025
>>>>>>>>>> Processing page:0026

2024-11-17 21:23:34,660 - INFO - "raw_german" section was not found
2024-11-17 21:23:34,661 - INFO - "german" section was not found
2024-11-17 21:23:34,662 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0027
>>>>>>>>>> Processing page:0028
>>>>>>>>>> Processing page:0029
>>>>>>>>>> Processing page:0030
>>>>>>>>>> Processing page:0031
>>>>>>>>>> Processing page:0032
>>>>>>>>>> Processing page:0033
>>>>>>>>>> Processing page:0034
>>>>>>>>>> Processing page:0035
>>>>>>>>>> Processing page:0036
>>>>>>>>>> Processing page:0037
>>>>>>>>>> Processing page:0038
>>>>>>>>>> Processing page:0039
>>>>>>>>>> Processing page:0040
>>>>>>>>>> Processing page:0041
>>>>>>>>>> Processing page:0042
>>>>>>>>>> Processing page:0043
>>>>>>>>>> Processing page:0044
>>>>>>>>>> Processing page:0045
>>>>>>>>>> Processing page:0046
>>>>>>>>>> Processing page:0047
>>>>>>>>>> Processing page:0048
>>>>>>>>>> Processing page:0049
>>>>>>>>>> Processing page:0050
>>>>>>>>>> Processing page:0051
>>>>>>>>>> Processing page:0052
>>>>>>>>>> Processing page:0053
>>>>>>>>>> Processing page:0054
>>>>>>>>>> Processing page:0055
>>>>>>>>>> Processing page:0056
>>>>>>>>>> Processing page:0057
>>>>>>>

2024-11-17 21:42:23,031 - INFO - "raw_german" section was not found
2024-11-17 21:42:23,034 - INFO - "german" section was not found
2024-11-17 21:42:23,035 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0216
>>>>>>>>>> Processing page:0217
>>>>>>>>>> Processing page:0218
>>>>>>>>>> Processing page:0219
>>>>>>>>>> Processing page:0220
>>>>>>>>>> Processing page:0221
>>>>>>>>>> Processing page:0222
>>>>>>>>>> Processing page:0223
>>>>>>>>>> Processing page:0224
>>>>>>>>>> Processing page:0225
>>>>>>>>>> Processing page:0226
>>>>>>>>>> Processing page:0227
>>>>>>>>>> Processing page:0228
>>>>>>>>>> Processing page:0229
>>>>>>>>>> Processing page:0230
>>>>>>>>>> Processing page:0231
>>>>>>>>>> Processing page:0232
>>>>>>>>>> Processing page:0233
>>>>>>>>>> Processing page:0234
>>>>>>>>>> Processing page:0235
>>>>>>>>>> Processing page:0236
>>>>>>>>>> Processing page:0237
>>>>>>>>>> Processing page:0238
>>>>>>>>>> Processing page:0239
>>>>>>>>>> Processing page:0240
>>>>>>>>>> Processing page:0241
>>>>>>>>>> Processing page:0242
>>>>>>>>>> Processing page:0243
>>>>>>>>>> Processing page:0244
>>>>>>>>>> Processing page:0245
>>>>>>>>>> Processing page:0246
>>>>>>>

2024-11-17 22:02:10,574 - INFO - "raw_german" section was not found
2024-11-17 22:02:10,576 - INFO - "german" section was not found
2024-11-17 22:02:10,576 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0425
>>>>>>>>>> Processing page:0426
>>>>>>>>>> Processing page:0427
>>>>>>>>>> Processing page:0428
>>>>>>>>>> Processing page:0429
>>>>>>>>>> Processing page:0430
>>>>>>>>>> Processing page:0431
>>>>>>>>>> Processing page:0432
>>>>>>>>>> Processing page:0433
>>>>>>>>>> Processing page:0434
>>>>>>>>>> Processing page:0435
>>>>>>>>>> Processing page:0436
>>>>>>>>>> Processing page:0437
>>>>>>>>>> Processing page:0438
>>>>>>>>>> Processing page:0439
>>>>>>>>>> Processing page:0440
>>>>>>>>>> Processing page:0441
>>>>>>>>>> Processing page:0442
>>>>>>>>>> Processing page:0443
>>>>>>>>>> Processing page:0444
>>>>>>>>>> Processing page:0445
>>>>>>>>>> Processing page:0446
>>>>>>>>>> Processing page:0447
>>>>>>>>>> Processing page:0448
>>>>>>>>>> Processing page:0449
>>>>>>>>>> Processing page:0450
>>>>>>>>>> Processing page:0451
>>>>>>>>>> Processing page:0452
>>>>>>>>>> Processing page:0453
>>>>>>>>>> Processing page:0454
>>>>>>>>>> Processing page:0455
>>>>>>>

2024-11-17 22:07:05,286 - INFO - "raw_german" section was not found
2024-11-17 22:07:05,288 - INFO - "german" section was not found
2024-11-17 22:07:05,289 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0468
>>>>>>>>>> Processing page:0469
>>>>>>>>>> Processing page:0470
>>>>>>>>>> Processing page:0471
>>>>>>>>>> Processing page:0472
>>>>>>>>>> Processing page:0473
>>>>>>>>>> Processing page:0474

2024-11-17 22:08:03,656 - INFO - "raw_german" section was not found
2024-11-17 22:08:03,658 - INFO - "german" section was not found
2024-11-17 22:08:03,659 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0475
>>>>>>>>>> Processing page:0476
>>>>>>>>>> Processing page:0477
>>>>>>>>>> Processing page:0478

2024-11-17 22:08:59,258 - INFO - "raw_german" section was not found
2024-11-17 22:08:59,261 - INFO - "german" section was not found
2024-11-17 22:08:59,262 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0479
>>>>>>>>>> Processing page:0480
>>>>>>>>>> Processing page:0481
>>>>>>>>>> Processing page:0482
>>>>>>>>>> Processing page:0483
>>>>>>>>>> Processing page:0484
>>>>>>>>>> Processing page:0485

2024-11-17 22:10:11,141 - INFO - "raw_german" section was not found
2024-11-17 22:10:11,141 - INFO - "german" section was not found
2024-11-17 22:10:11,141 - INFO - "english" section was not found



>>>>>>>>>> Processing page:0486
Total Runtime: 50 mins 28.61 secs


In [17]:
from src.document_generation import save_document

# Save JSON outputs
output_dir = f'../output_data/{foldername}'
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# without the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): raw_german_texts is collected above but not written here —
# confirm that is intentional.
with open(f'{output_dir}/english_texts.json', 'w') as f:
    json.dump(english_texts, f)
with open(f'{output_dir}/german_texts.json', 'w') as f:
    json.dump(german_texts, f)

# Build one document per language from the collected page texts
doc1, fname1 = save_document(german_texts, foldername, language='German')
doc2, fname2 = save_document(english_texts, foldername, language='English')


2024-11-17 22:18:05,906 - INFO - Page 0000
2024-11-17 22:18:05,908 - INFO - section_type: header
2024-11-17 22:18:05,909 - INFO - section_type: body
2024-11-17 22:18:05,910 - INFO - section_type: footer
2024-11-17 22:18:05,912 - INFO - Page 0001
2024-11-17 22:18:05,913 - INFO - section_type: header
2024-11-17 22:18:05,914 - INFO - section_type: body
2024-11-17 22:18:05,915 - INFO - section_type: footer
2024-11-17 22:18:05,916 - INFO - Page 0002
2024-11-17 22:18:05,917 - INFO - section_type: header
2024-11-17 22:18:05,918 - INFO - section_type: body
2024-11-17 22:18:05,919 - INFO - Page 0003
2024-11-17 22:18:05,921 - INFO - Page 0004
2024-11-17 22:18:05,922 - INFO - section_type: header
2024-11-17 22:18:05,923 - INFO - section_type: body
2024-11-17 22:18:05,925 - INFO - Page 0005
2024-11-17 22:18:05,926 - INFO - Page 0006
2024-11-17 22:18:05,927 - INFO - Page 0007
2024-11-17 22:18:05,928 - INFO - Page 0008
2024-11-17 22:18:05,929 - INFO - section_type: header
2024-11-17 22:18:05,930 - I

In [20]:
# Spot-check the German text stored for one page; keys are the page-number
# strings taken from the file names (here "0106").
german_texts['0106']

'\n<header>Generaloberst v. Mackensen erbittet Einsatz d. Verstärkungen bei 11. Armee.</header>\n<body>dem Einsatz von Verstärkungen an jeder anderen Stelle sah er zudem einen „nicht wieder gut zu machenden Zeitverlust“).\nEine dritte Möglichkeit schwebte dem Chef des ö.u. Generalstabes vor. Dieser hatte zwar am 21. Mai in einer Aussprache in Teschen General von Falkenhayn gegenüber die Notwendigkeit anerkannt, die Angriffskraft der Russen durch weitere Schläge in Galizien zu lähmen, sich aber am 23. Mai schriftlich dahin geäußert, daß der Wunsch, „mit den russischen Kräften möglichst weitgehend abzurechnen“ seine Grenze an der dringendsten Forderung finden müsse, „die Italiener nicht bis in jene Gebiete vordringen zu lassen, bei deren Verlust die Monarchie tief getroffen und die Führung des Krieges überhaupt unmöglich gemacht, der Krieg somit zugunsten unserer Gegner entschieden wäre“. Bei dieser Grundstellung war General von Conrad nicht abgeneigt, von weitreichenden Operationen auf 

``` 
1.  Upload the input folder of PDFs to blob storage.
2.  Read each file from S3.
3.  FFT in y -> (x_hi, x_lo); write half_cropped_image to S3.
4.  FFT in x -> (y_hi, y_lo); write cropped_image to S3.
5.  Read the cropped image from S3 -> encode_image -> transcribe and translate -> JSON output.

```

### Available models and pricing:
```
"gpt-4o-2024-08-06":
    "price_txt": "$2.50 / 1M input tokens"
    "price_img": "$0.001913 / 1500px^2"
    
"gpt-4o-mini-2024-07-18":
    "price_txt": "$0.150 / 1M input tokens"
    "price_img": "$0.003825 / 1500px^2"
    
```