In [1]:
pwd

'/Users/ozkansafak/code/fraktur/notebooks'

In [2]:
%%time
%load_ext autoreload
%autoreload 2

# Standard Python modules
import time 
import numpy as np
import os
import sys
import json
import re
import glob
import asyncio
import aiohttp
import openai
from typing import Dict, Tuple, List

from dotenv import load_dotenv
from pathlib import Path

# Load environment variables from .env file
env_path = Path('../.env')  # Adjust path if needed
load_dotenv(dotenv_path=env_path)

# # Get the root path of the project
sys.path.append(os.path.abspath(".."))

# Display and plotting
from IPython.core.display import display, HTML
from IPython.display import clear_output

# Project imports
from src.utils import timeit, encode_image, plt, pylab
from src.processing import compute_log_spectrum_1d, extract_image_bbox, save_images
from src.api_requests import construct_payload, process_single_page
from src.document_generation import save_document

# Set notebook display width
display(HTML("<style>.container { width:90% !important; }</style>"))

# Print Python environment info
print('sys.executable:', sys.executable)
print('sys.version:', sys.version, '\n')

# Setup for PDF processing
foldername = "Pages from Der Weltkrieg v8 chap 4A"
pageno = 103

# Rename Split PDFs if needed
pdfpaths = glob.glob(f"../input_data/{foldername}/*.pdf")
for i, pdfpath in enumerate(sorted(pdfpaths)):
    pdfname = pdfpath.split("/")[-1]
    newname = "/".join(pdfpath.split("/")[:-1])
    new_path = f"{newname}/page_{str(i+pageno)}.pdf"
    print(f"Renaming {pdfname} to {os.path.basename(new_path)}")
    os.rename(pdfpath, new_path)

# OpenAI API setup
openai.api_key = os.getenv("OPENAI_API_KEY")
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai.api_key}"
}

# Configuration
model_name = "gpt-4o-2024-08-06"
plotter = False
image_path = f"../input_data/{foldername}/*pdf"
fnames = sorted(glob.glob(image_path))

# Storage for processed texts
raw_german_texts: Dict[str, str] = {}
german_texts: Dict[str, str] = {}
english_texts: Dict[str, str] = {}




sys.executable: /Users/ozkansafak/code/fraktur/.venv/bin/python3
sys.version: 3.10.9 (main, Mar  1 2023, 12:20:14) [Clang 14.0.6 ] 

Renaming page_103.pdf to page_103.pdf
Renaming page_104.pdf to page_104.pdf
Renaming page_105.pdf to page_105.pdf
Renaming page_106.pdf to page_106.pdf
Renaming page_107.pdf to page_107.pdf
Renaming page_108.pdf to page_108.pdf
Renaming page_109.pdf to page_109.pdf
Renaming page_110.pdf to page_110.pdf
Renaming page_111.pdf to page_111.pdf
Renaming page_112.pdf to page_112.pdf
Renaming page_113.pdf to page_113.pdf
Renaming page_114.pdf to page_114.pdf
Renaming page_115.pdf to page_115.pdf
Renaming page_116.pdf to page_116.pdf
Renaming page_117.pdf to page_117.pdf
Renaming page_118.pdf to page_118.pdf
Renaming page_119.pdf to page_119.pdf
Renaming page_120.pdf to page_120.pdf
Renaming page_121.pdf to page_121.pdf
Renaming page_122.pdf to page_122.pdf
Renaming page_123.pdf to page_123.pdf
Renaming page_124.pdf to page_124.pdf
Renaming page_125.pdf to page_1

---
## Fraktur Translator

In [5]:
async def main():
    # Create semaphore to limit concurrent API calls
    semaphore = asyncio.Semaphore(5)  # Adjust number based on API limits
    start_time = time.time()
    
    async def process_page(fname: str) -> Tuple[str, Dict]:
        async with semaphore:
            pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
            print(f"\n{'>'*10} Processing page:{pageno}", end="")
            
            page_start = time.time()
            result = await process_single_page(fname, model_name, headers, plotter, pageno)
            delta = time.time() - page_start
            
            return pageno, result
    
    # Create tasks for all pages
    tasks = [process_page(fname) for fname in fnames]
    
    # Wait for all tasks to complete
    results = await asyncio.gather(*tasks)
    
    # Process results
    for pageno, (content, raw_german_text, german_text, english_text) in results:
        raw_german_texts[pageno] = raw_german_text
        german_texts[pageno] = german_text
        english_texts[pageno] = english_text
        
    total_time = time.time() - start_time
    print(f"\nTotal Runtime: {int(total_time//60)} mins {(total_time%60):.2f} secs")

# Run the async code
await main()



>>>>>>>>>> Processing page:103
>>>>>>>>>> Processing page:104
>>>>>>>>>> Processing page:105
>>>>>>>>>> Processing page:106
>>>>>>>>>> Processing page:107
>>>>>>>>>> Processing page:108
>>>>>>>>>> Processing page:109
>>>>>>>>>> Processing page:110
>>>>>>>>>> Processing page:111
>>>>>>>>>> Processing page:112
>>>>>>>>>> Processing page:113
>>>>>>>>>> Processing page:114
>>>>>>>>>> Processing page:115
>>>>>>>>>> Processing page:116
>>>>>>>>>> Processing page:117
>>>>>>>>>> Processing page:118
>>>>>>>>>> Processing page:119
>>>>>>>>>> Processing page:120
>>>>>>>>>> Processing page:121
>>>>>>>>>> Processing page:122
>>>>>>>>>> Processing page:123
>>>>>>>>>> Processing page:124
>>>>>>>>>> Processing page:125
>>>>>>>>>> Processing page:126
>>>>>>>>>> Processing page:127
>>>>>>>>>> Processing page:128
>>>>>>>>>> Processing page:129
>>>>>>>>>> Processing page:130
>>>>>>>>>> Processing page:131
>>>>>>>>>> Processing page:132
>>>>>>>>>> Processing page:133
>>>>>>>>>> Processing page:134
>>>>>>>

In [6]:
from src.api_requests import extract_text_section
for key in sorted(raw_german_texts.keys()):
    print('--'*2, key, '--'*20)
    print(':raw_german_texts')
    print(raw_german_texts[key])
    print(':german_texts')
    print(extract_text_section(german_texts[key], section='body'))
    if key == '106':
        break
    

---- 103 ----------------------------------------
:raw_german_texts

IV. Der Krieg gegen Rußland im Sommer und Herbst 1915.
A. Die Front des Oberbefehlshabers Ost bis zum 2. Juli.
I. Die Weisung der Obersten Heeresleitung vom 16. April.
Karte 18 Band VII.
In der Mitteilung, die die Oberste Heeresleitung dem Oberbefehlshaber Ost am 16. April über die in Galizien beabsichtigte Operation zugeben ließ, hatte es geheißen, seine Mitwirkung durch möglichst lange Täuschung und Bindung des Feindes nördlich der Pilica sei Vorbedingung für das Gelingen der Operation.
Als diese Weisung erteilt wurde, verfügte der Oberbefehlshaber Ost an seiner etwa 750 Kilometer messenden Front von der Pilica bis zur Düna bei Memel insgesamt über 38 Divisionen Infanterie, die überall auf russischem Boden standen, am dichtesten auf dem rechten Flügel, während der äußerste Nordflügel auf einer etwa 200 Kilometer langen Strecke so gut wie unbesetzt war. Die Kämpfe, die dem Abschluß der Winterschlacht in Masuren gefol

In [9]:
from src.document_generation import save_document
# save json outputs
folder_name = 'Chapter_4A'

if not os.path.exists(f'../output_data/{folder_name}'):
    os.makedirs(f'../output_data/{folder_name}')

with open(f'../output_data/{folder_name}/english_texts.json', 'w') as f:
    json.dump(english_texts, f)
with open(f'../output_data/{folder_name}/german_texts.json', 'w') as f:
    json.dump(german_texts, f)

doc1, fname1 = save_document(german_texts, folder_name, language='German')
doc2, fname2 = save_document(english_texts, folder_name, language='English')


2024-11-16 18:23:07,184 - INFO - Page 103
2024-11-16 18:23:07,186 - INFO - section_type: header
2024-11-16 18:23:07,188 - INFO - section_type: body
2024-11-16 18:23:07,190 - INFO - Page 104
2024-11-16 18:23:07,192 - INFO - section_type: header
2024-11-16 18:23:07,193 - INFO - section_type: body
2024-11-16 18:23:07,194 - INFO - section_type: footer
2024-11-16 18:23:07,195 - INFO - Page 105
2024-11-16 18:23:07,197 - INFO - section_type: header
2024-11-16 18:23:07,199 - INFO - section_type: body
2024-11-16 18:23:07,200 - INFO - section_type: footer
2024-11-16 18:23:07,201 - INFO - Page 106
2024-11-16 18:23:07,203 - INFO - section_type: body
2024-11-16 18:23:07,203 - INFO - section_type: header
2024-11-16 18:23:07,204 - INFO - section_type: body
2024-11-16 18:23:07,205 - INFO - section_type: footer
2024-11-16 18:23:07,207 - INFO - Page 107
2024-11-16 18:23:07,208 - INFO - section_type: header
2024-11-16 18:23:07,209 - INFO - section_type: body
2024-11-16 18:23:07,210 - INFO - section_type:

In [10]:
german_texts['106']

'\n<body>Der 9. Armee befahl der Oberbefehlshaber Ost, vom 27. April ab „durch lebhaftes Feuer von an geeigneten Stellen zusammengezogenen Gruppen schwerer Artillerie, durch vermehrte Erkundungstätigkeit sowie Vortreiben von Sappen den Eindruck zu erwecken, daß ein allgemeiner deutscher Angriff bevorsteht“. Die Ausführung dieses Befehls brachte vorübergehend erhöhte Gefechtstätigkeit, die dann am 3. Mai, dem Tage nach dem Angriffsbeginn in Galizien, wieder eingestellt wurde. Bei der Armee-Gruppe Gallwitz waren die Ablenkungsunternehmungen in ähnlichem Rahmen wie bei der 9. Armee gehalten. Sie brachten dem I. Reservekorps am 27. April bei Jednorozec nordöstlich von Przasnysz gegen 500 Gefangene und drei Maschinengewehre als Beute ein. Die 8. Armee wollte die Aufgabe durch einen Vorstoß ihres rechten Flügels lösen. Der dazu für den 29. April zwischen Szwa und Pissa angesetzte, von General der Kavallerie Burggräf und Graf zu Dohna-Schlodien geleitete Angriff der 75. Reserve- und 10. Landw

``` 
1.  Upload Input folder of pdfs to blob storage.
2.  Read file from s3.
3.  FFT in y -> (x_hi, x_lo), write half_cropped_image to s3
4.  FFT in x -> (y_hi, y_lo), write cropped_image to s3
5.  Read cropped image from s3 -> encode_image -> translate and transcribe -> JSON output

```

### Available models and pricing:
```
"gpt-4o-2024-08-06":
    "price_txt": "$2.50 / 1M input tokens"
    "price_img": "$0.001913 / 1500px^2"
    
"gpt-4o-mini-2024-07-18":
    "price_txt": "$0.150 / 1M input tokens"
    "price_img": "$0.003825 / 1500px^2"
    
```