In [1]:
# # split the pdf into separate pages

# import PyPDF2
# from src.document_generation import setup_logger
# import logging

# logger = logging.getLogger('logger_name')
# if logger.hasHandlers():
#     logger.handlers.clear()  # Clear existing handlers to avoid duplicates
# logger.setLevel(logging.DEBUG)
# handler = logging.StreamHandler()
# formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# handler.setFormatter(formatter)
# logger.addHandler(handler)
# logger.propagate = False

# # Input PDF file path
# input_pdf_path = "../input_data/Der Weltkrieg v7 East Front.pdf"
# output_folder = "../input_data/Der Weltkrieg v7"

# # Ensure the output folder exists
# os.makedirs(output_folder, exist_ok=True)

# # Open the PDF and split pages
# try:
#     with open(input_pdf_path, 'rb') as file:
#         pdf_reader = PyPDF2.PdfReader(file)
#         num_pages = len(pdf_reader.pages)
#         logger.info(f"Extracted {num_pages} pages from PDF")

#         for i, page in enumerate(pdf_reader.pages):
#             # Create a new PDF writer for each page
#             pdf_writer = PyPDF2.PdfWriter()
#             pdf_writer.add_page(page)

#             # Save the current page to a new file
#             output_file_path = os.path.join(output_folder, f"page_{i+1:03d}.pdf")
#             with open(output_file_path, 'wb') as output_file:
#                 pdf_writer.write(output_file)
            
#             logger.debug(f"Saved page {i+1} to {output_file_path}")
#             if i == 10:
#                 break

#     logger.info(f"All pages have been split and saved to {output_folder}")

# except Exception as e:
#     logger.error(f"An error occurred: {e}")


In [1]:
%%time
%load_ext autoreload
%autoreload 2

# Standard Python modules
import time 
import numpy as np
import os
import sys
import json
import re
import glob
import asyncio
import aiohttp
import openai
import logging
from typing import Dict, Tuple, List, Callable
from dotenv import load_dotenv
from pathlib import Path

# Decorator to log wall time
def log_execution_time(func: Callable):
    """Decorate a coroutine function so that its run time is logged.

    Args:
        func: The coroutine function to wrap.

    Returns:
        A coroutine function that awaits ``func`` with the same arguments,
        logs the elapsed seconds at INFO level through the module-level
        ``logger``, and returns the result of ``func`` unchanged.
    """
    # Local import: the notebook's import cell does not bring in functools.
    from functools import wraps

    @wraps(func)  # keep __name__ / __doc__ of the wrapped coroutine function
    async def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by a system clock adjustment during a long run.
        start = time.perf_counter()
        result = await func(*args, **kwargs)
        logger.info(f"Finished {func.__name__} in {time.perf_counter() - start:.2f} seconds.")
        return result
    return wrapper

def get_missing_keys(raw_german_texts, pagenos=None, min_length=10):
    """Return the sorted page numbers whose text is absent or too short.

    Args:
        raw_german_texts: Mapping of page number -> text. Despite the name,
            the notebook also passes ``english_texts`` here.
        pagenos: Page numbers to check. Defaults to the module-level
            ``all_pagenos`` when omitted.
        min_length: A page counts as missing when its text has fewer than
            this many characters.

    Returns:
        Sorted list of the page numbers that have no entry, a ``None`` entry,
        or an entry shorter than ``min_length``.
    """
    if pagenos is None:
        pagenos = all_pagenos
    missing_keys = []
    for key in sorted(pagenos):
        text = raw_german_texts.get(key)
        # A None value is treated like an absent page instead of raising
        # TypeError from len(None).
        if text is None or len(text) < min_length:
            missing_keys.append(key)
    return missing_keys

# Load environment variables from .env file
env_path = Path('../.env')  # Adjust path if needed
load_dotenv(dotenv_path=env_path)

# # Get the root path of the project
sys.path.append(os.path.abspath(".."))

# Display and plotting
from IPython.display import display, HTML, clear_output

# Project imports
from src.utils import timeit, encode_image, plt, pylab
from src.processing import compute_log_spectrum_1d, extract_image_bbox, save_images
from src.api_requests import construct_payload_for_gpt, process_single_page
from src.document_generation import save_document

# Set notebook display width
display(HTML("<style>.container { width:90% !important; }</style>"))

# Print Python environment info
print('sys.executable:', sys.executable)
print('sys.version:', sys.version, '\n')

# Setup for PDF processing
foldername = "Der Weltkrieg v7"

# OpenAI API setup
openai.api_key = os.getenv("OPENAI_API_KEY")
headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {openai.api_key}"
}

# --------------------
# Initialize variables
# --------------------
plotter = False
image_path = f"../input_data/{foldername}/*pdf"
# Keep only files that follow the page_<pageno>.pdf naming scheme. The cells
# below re-parse the page number from every entry of `fnames` with this same
# pattern and call .group(1) on the result, so a stray PDF in the folder would
# otherwise raise AttributeError. Filtering both lists from one pass keeps
# `fnames` and `all_pagenos` aligned index by index.
_page_matches = [
    (fname, re.search(r'page_(.*?)\.pdf', fname, re.DOTALL))
    for fname in sorted(glob.glob(image_path))
]
fnames = [fname for fname, match in _page_matches if match]
all_pagenos = [match.group(1) for _, match in _page_matches if match]

# Storage for processed texts
raw_german_texts: Dict[str, str] = {}
german_texts: Dict[str, str] = {}
english_texts: Dict[str, str] = {}

# ------------------
# Configure logging
# ------------------
# Build the stream handler first, then attach it to a freshly reset logger.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

logger = logging.getLogger("time_logger")
# Re-running this cell must not stack a second handler on the same logger.
if logger.hasHandlers():
    logger.handlers.clear()
logger.addHandler(handler)
logger.setLevel(logging.INFO)

# Records stop here instead of also being passed to ancestor loggers.
logger.propagate = False


sys.executable: /Users/ozkansafak/code/fraktur/.venv/bin/python3
sys.version: 3.10.9 (main, Mar  1 2023, 12:20:14) [Clang 14.0.6 ] 

CPU times: user 982 ms, sys: 2.41 s, total: 3.39 s
Wall time: 516 ms


---
## Fraktur Translator (GPT-4o)

In [8]:
@log_execution_time
async def main(fnames, model_name="", semaphore_count=5, extract=True):
    """Process every not-yet-processed page concurrently and store the results.

    A page is skipped when its page number is already a key of the
    module-level ``raw_german_texts``. For every page that completes, the
    results are written into the module-level ``raw_german_texts``,
    ``german_texts`` and ``english_texts`` dicts, keyed by page number.
    A page that fails is logged (with its page number) and left out.

    Args:
        fnames: Paths of single-page PDFs named ``page_<pageno>.pdf``.
        model_name: Model identifier forwarded to ``process_single_page``.
        semaphore_count: Maximum number of pages processed at the same time.
        extract: Forwarded unchanged to ``process_single_page``.

    Returns:
        None.
    """
    semaphore = asyncio.Semaphore(semaphore_count)  # Adjust number based on API limits

    async def _process_page(fname: str, pageno: str):
        """Run one page under the semaphore and return ``(pageno, result)``."""
        async with semaphore:
            try:
                result = await process_single_page(fname, model_name, headers, plotter, pageno, extract)
            except Exception as e:
                # as_completed() hands back anonymous awaitables, so the page
                # number has to travel inside the exception for the error log
                # below to say which page failed.
                raise RuntimeError(f"page:{pageno} -- {e}") from e
            return pageno, result

    # A list of coroutine objects. Include only the unprocessed pages.
    already_processed = set(raw_german_texts.keys())
    tasks = []
    for fname in fnames:
        match = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL)
        if match is None:
            logger.warning(f"main: skipping {fname} -- no page number in the file name")
            continue
        pageno = match.group(1)
        if pageno not in already_processed:
            tasks.append(_process_page(fname, pageno))
    logger.info(f"main: len(tasks): {len(tasks)} -- Processing tasks as they complete")

    # Process tasks as they complete
    for i, completed_task in enumerate(asyncio.as_completed(tasks)):
        try:
            pageno, (_content, raw_german_text, german_text, english_text) = await completed_task
            raw_german_texts[pageno] = raw_german_text
            german_texts[pageno] = german_text
            english_texts[pageno] = english_text
            logger.info(f" {i:3d} of {len(tasks)} -- Successfully processed page:{pageno}")
        except Exception as e:
            logger.error(f"{i:3d} of {len(tasks)} -- Error processing a task: {e}")
    return

# Run the async code
await main(fnames[5:8], model_name="gpt-4o-2024-08-06")


2024-12-21 19:06:53,617 - INFO - main: len(tasks): 2 -- Processing tasks as they complete
2024-12-21 19:07:23,058 - INFO -    0 of 2 -- Successfully processed page:005
2024-12-21 19:08:25,485 - INFO - Pageno: 006, "raw_german" section was not found
2024-12-21 19:08:25,487 - INFO - Pageno: 006, "german" section was not found
2024-12-21 19:08:25,488 - INFO - Pageno: 006, "english" section was not found
2024-12-21 19:08:25,494 - INFO -    1 of 2 -- Successfully processed page:006
2024-12-21 19:08:25,495 - INFO - Finished main in 91.88 seconds.


##  Handle missing keys (Claude Sonnet)


In [9]:
# ---------------------
# 1. Rerun the missing pages on Claude
# ---------------------
missing_keys = set(get_missing_keys(raw_german_texts))
for key in missing_keys:
    del raw_german_texts[key]
print(f"missing_keys: {missing_keys}")
missed_fnames = []
for fname in fnames:
    pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
    if pageno in missing_keys:
        print(f"processing pageno: {pageno}")
        missed_fnames.append(fname)
print(f"missed_fnames: ")
[print(item) for item in missed_fnames]

await main(missed_fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=True)

# ---------------------
# 2. If there still are missing pages, run them without performing FFT based extraction. 
#    This time compute missing_keys based on 'english_texts'.
# ---------------------
missing_keys = set(get_missing_keys(english_texts))
for key in missing_keys:
    del raw_german_texts[key]
print(f"missing_keys: {missing_keys}")
missed_fnames = []
for fname in fnames:
    pageno = re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1)
    if pageno in missing_keys:
        print(f"processing pageno: {pageno}")
        missed_fnames.append(fname)
print(f"missed_fnames: ")
for item in missed_fnames:
    print(item) 

await main(missed_fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=False)


2024-12-21 19:08:28,415 - INFO - main: len(tasks): 2 -- Processing tasks as they complete


missing_keys: {'006', '405'}
processing pageno: 006
processing pageno: 405
missed_fnames: 
../input_data/Der Weltkrieg v7/page_006.pdf
../input_data/Der Weltkrieg v7/page_405.pdf


2024-12-21 19:08:33,906 - INFO - Pageno: 405, "raw_german" section was not found
2024-12-21 19:08:33,908 - INFO - Pageno: 405, "german" section was not found
2024-12-21 19:08:33,909 - INFO - Pageno: 405, "english" section was not found
2024-12-21 19:08:35,270 - INFO -    0 of 2 -- Successfully processed page:405
2024-12-21 19:08:59,328 - INFO -    1 of 2 -- Successfully processed page:006
2024-12-21 19:08:59,329 - INFO - Finished main in 30.91 seconds.
2024-12-21 19:08:59,332 - INFO - main: len(tasks): 1 -- Processing tasks as they complete


missing_keys: {'405'}
processing pageno: 405
missed_fnames: 
../input_data/Der Weltkrieg v7/page_405.pdf


2024-12-21 19:09:03,858 - INFO - Pageno: 405, "raw_german" section was not found
2024-12-21 19:09:03,860 - INFO - Pageno: 405, "german" section was not found
2024-12-21 19:09:03,861 - INFO - Pageno: 405, "english" section was not found
2024-12-21 19:09:03,866 - INFO -    0 of 1 -- Successfully processed page:405
2024-12-21 19:09:03,867 - INFO - Finished main in 4.53 seconds.


In [10]:
# Save json files and .docx files.

# NOTE(review): save_document is already imported in the setup cell at the top
# of the notebook; this repeated import is redundant.
from src.document_generation import save_document

# save json outputs
# NOTE(review): the JSON dump below is commented out, yet a later cell reads
# english_texts.json, german_texts.json and raw_german_texts.json back from
# ../output_data/{foldername}/ -- that load fails unless these files already exist.
# if not os.path.exists(f'../output_data/{foldername}'):
#     os.makedirs(f'../output_data/{foldername}')
# with open(f'../output_data/{foldername}/english_texts.json', 'w') as f:
#     json.dump(english_texts, f)
# with open(f'../output_data/{foldername}/german_texts.json', 'w') as f:
#     json.dump(german_texts, f)
# with open(f'../output_data/{foldername}/raw_german_texts.json', 'w') as f:
#     json.dump(raw_german_texts, f)

# One document per language. Presumably save_document writes the .docx file and
# returns (document, file name) -- TODO confirm in src.document_generation.
doc1, fname1 = save_document(german_texts, foldername, language='German')
doc2, fname2 = save_document(english_texts, foldername, language='English')


``` 
1.  Upload the input folder of PDFs to blob storage.
2.  Read file from s3.
3.  FFT in y -> (x_hi, x_lo), write half_cropped_image to s3
4.  FFT in x -> (y_hi, y_lo), write cropped_image to s3
5.  Read cropped image from s3 -> encode_image -> translate and transcribe -> JSON output

```

### Available models and pricing:
```
"gpt-4o-2024-08-06":
    "price_txt": "$2.50 / 1M input tokens"
    "price_img": "$0.001913 / 1500px^2"
    
"gpt-4o-mini-2024-07-18":
    "price_txt": "$0.150 / 1M input tokens"
    "price_img": "$0.003825 / 1500px^2"
    
```

---
## Load the German text and translate broken sentences.

In [2]:
# Reload the three per-page text dicts from their JSON files; each file is read
# in text mode and parsed in one go.
english_texts = json.loads(Path(f'../output_data/{foldername}/english_texts.json').read_text())
german_texts = json.loads(Path(f'../output_data/{foldername}/german_texts.json').read_text())
raw_german_texts = json.loads(Path(f'../output_data/{foldername}/raw_german_texts.json').read_text())


In [20]:
# Prepare input lists. Extract the <body> sections from german_texts and english_texts.
def _extract_body(content):
    """Return the text between <body> and </body>, or '' when there is none.

    Anything that is not a string (for example None for a page without an
    entry) also yields ''.
    """
    if not isinstance(content, str):
        return ''
    match = re.search(r'<body>(.*?)</body>', content, re.DOTALL)
    return match.group(1) if match else ''

# .get() keeps a page that has no entry in the text dicts from aborting the
# whole cell with KeyError; such a page simply gets an empty body.
german_body_contents = {pageno: _extract_body(german_texts.get(pageno)) for pageno in all_pagenos}
english_body_contents = {pageno: _extract_body(english_texts.get(pageno)) for pageno in all_pagenos}

# initialize output dicts
english_texts_2 = {}
outputs = {}
payloads = {}

In [50]:
def construct_payload_2(german_body_contents, 
                        english_body_contents, 
                        german_sentence_fragment_1, 
                        model_name, 
                        pageno, 
                        next_pageno):
    """Build the chat-completion payload that repairs a sentence split across two pages.

    Args:
        german_body_contents: Page number -> German body text.
        english_body_contents: Page number -> English body text.
        german_sentence_fragment_1: Text placed in the
            ``<german_sentence_fragment_1>`` tag of the prompt.
            ``main_broken_sentences`` passes the ``<final_fragment_2>`` value
            returned for the previous page.
            NOTE(review): the prompt instructions never mention this tag, so
            the model is not told what to do with it -- confirm the intent.
        model_name: Value of the payload's ``"model"`` field.
        pageno: Page whose translation is being repaired (``*_page_1``).
        next_pageno: The following page (``german_page_2``). When it has no
            entry in ``german_body_contents`` (e.g. after the last page) an
            empty page is used.

    Returns:
        dict with the keys ``model``, ``messages``, ``max_tokens`` and
        ``temperature``.

    Raises:
        KeyError: If ``pageno`` is missing from either contents mapping.
    """
    german_page_1 = german_body_contents[pageno]
    # The last page has no successor; fall back to an empty page instead of KeyError.
    german_page_2 = german_body_contents.get(next_pageno, "")
    english_page_1 = english_body_contents[pageno]

    payload = {
        "model": model_name,
        "messages": [
          {
            "role": "system", 
            # The source text covers 1915 (First World War), and the first
            # sentence needs its closing period before the next one starts.
            "content": "You are a World War I historian, who's bilingual in German and English. "
              "You speak both languages with masterful efficiency and you're a professional translator from GERMAN to ENGLISH who "
              "stays loyal to both the style and the character of the original German text in your book translations."
          },
          {
            "role": "user",
            "content": [
              {
                  "type": "text",
                  "text": 
f"""
**Task Overview**
In the **Given Data** section below, you are presented the translation of `<german_page_1>` into `<english_page_1>`. Your objective is to address any issues caused by sentences that span across `<german_page_1>` and `<german_page_2>`.

------------------

**Chain of Thought Reasoning**
1. Identify Potential Fragment 1 (The last sentence at the very bottom of `<german_page_1>`):
   - Extract the portion of German text at the end of `<german_page_1>` that may or may not be a complete sentence on its own.
   - This German piece of text is the candidate for `fragment_1`.
   - Output `fragment_1` inside `<fragment_1>...</fragment_1>` tags.

2. Identify Potential Fragment 2 (The first sentence at the very top of `<german_page_2>`):
   - Extract the portion of German text at the top of `<german_page_2>` that appears to complete the thought or grammatical structure of `fragment_1`.
   - This German piece of text is the candidate for `fragment_2`. Include **only** the text necessary to complete `fragment_1`.
   - Output `fragment_2` inside `<fragment_2>...</fragment_2>` tags.

3. Reasoning and Validation:
   - Compare `fragment_1` and `fragment_2`:
       - Does `fragment_2` logically and grammatically continue `fragment_1`?
       - Does combining the two fragments form a coherent sentence?
   - Ensure that `fragment_2` contains only the portion necessary to complete `fragment_1`.
   - Decide whether the fragments align to form a single coherent sentence.
   - Think out loud. Output your reasoning process and wrap your thoughts in `<thinking>...</thinking>` tags.

4. Combine Fragments (if valid):
   - If the fragments align and validation succeeds, combine them into a coherent, grammatically correct sentence.
   - If they do not align or either fragment is missing, conclude that there is no valid fragmentation.
   - Again, think out loud. Output your decision inside `<decision>...</decision>` tags.

------------------

**Output Requirements**
1. **Chain of Thought Tokens**:
   - Output your reasoning for each step inside `<thinking>...</thinking>` tags.
   - Ensure that the reasoning is clear and concise.

2. **Candidate Fragments**:
   - Fragment 1: Wrap this in `<fragment_1>...</fragment_1>` tags.
   - Fragment 2: Wrap this in `<fragment_2>...</fragment_2>` tags.

3. **Final Decision**:
   - Indicate whether the fragments align to form a coherent sentence inside `<decision>...</decision>` tags.
   - If the fragments do not align, explicitly state this in the `<decision>` tags.

4. **Final Combined Sentence**:
   Two Cases:
   - **Case 1**:
       - If the fragments align, wrap the combined sentence in `<english>...</english>` tags.
       - Include the validated German fragment from `<german_page_2>` in `<final_fragment_2>` tags.
   - **Case 2**:
       - If no valid fragmentation exists, output `<english_page_1>` unchanged.
       - Set `<final_fragment_2>` as empty.

------------------

**Example 1. Output for a Fragmented Sentence**

<thinking>
Step 1: Identify `fragment_1` from the bottom of `german_page_1`.
<fragment_1>Der Angriff begann früh am Morgen</fragment_1>

Step 2: Identify `fragment_2` from the top of `german_page_2`.
<fragment_2>des 10. Mai mit schwerem Artilleriefeuer.</fragment_2>

Step 3: Validate whether the fragments align:
   - `fragment_1` ends without proper punctuation, suggesting it is incomplete.
   - `fragment_2` begins with 'des 10. Mai,' which continues the context of time introduced in `fragment_1`.
   - Only the portion necessary to complete `fragment_1` was selected from `german_page_2` to form `fragment_2`.
   - Combining them forms a coherent sentence: 'Der Angriff begann früh am Morgen des 10. Mai mit schwerem Artilleriefeuer.'
<decision>The fragments align and form a coherent sentence.</decision>
</thinking>

<english>The attack began early in the morning on May 10th, with heavy artillery fire.</english>
<final_fragment_2>des 10. Mai mit schwerem Artilleriefeuer.</final_fragment_2>

------------------

**Example 2. Two Consecutive, Unfragmented Sentences**

<thinking>
Step 1: Identify `fragment_1` from the bottom of `german_page_1`.
<fragment_1>Jedenfalls ist der Umstand, daß sich General v. Falkenhayn in den überaus wichtigen Fragen des Einsatzes der Heeresreserve und der persönlichen Einflußnahme auf die Kriegsführung im Osten nicht durchsetzte, trotzdem aber in seiner Stellung als Chef des Generalstabes des Feldheeres verblieb, von folgenschwerer Bedeutung für sein ferneres Wirken gewesen.</fragment_1>

Step 2: Identify `fragment_2` from the top of `german_page_2`.
<fragment_2>I. Erwägungen und Maßnahmen der deutschen Obersten Heeresleitung.</fragment_2>

Step 3: Validate whether the fragments align:
   - `fragment_1` ends with a period, indicating it is a complete sentence.
   - `fragment_2` begins with "I. Erwägungen und Maßnahmen der deutschen Obersten Heeresleitung," which introduces a new topic unrelated to the context of `fragment_1`.
   - The fragments are independent and do not logically or grammatically connect into a single sentence.
<decision>The fragments do not align and do not form a single coherent sentence.</decision>
</thinking>

<english>In any case, the fact that General v. Falkenhayn did not prevail in the extremely important questions of the use of army reserves and personal influence on the conduct of the war in the East, but nevertheless remained in his position as Chief of the General Staff of the Field Army, was of momentous significance for his further work.</english>
<final_fragment_2></final_fragment_2>

------------------

**Given Data:**
<german_page_1>{german_page_1}</german_page_1>
<german_page_2>{german_page_2}</german_page_2>
<english_page_1>{english_page_1}</english_page_1>
<german_sentence_fragment_1>{german_sentence_fragment_1}</german_sentence_fragment_1>
"""
              },
            ]
          },
        ],
        "max_tokens": 6000,
        "temperature": 0.1
    }

    return payload 

In [52]:
import ipdb
import requests

def log_execution_time_synchronous(func: Callable):
    """Decorate a regular (non-async) function so that its run time is logged.

    Args:
        func: The function to wrap.

    Returns:
        A function that calls ``func`` with the same arguments, logs the
        elapsed seconds at INFO level through the module-level ``logger``,
        and returns the result of ``func`` unchanged.
    """
    # Local import: the notebook's import cell does not bring in functools.
    from functools import wraps

    @wraps(func)  # keep __name__ / __doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by a system clock adjustment during a long run.
        start = time.perf_counter()
        result = func(*args, **kwargs)
        logger.info(f"Finished {func.__name__} in {time.perf_counter() - start:.2f} seconds.")
        return result
    return wrapper

def make_gpt_request_for_broken_sentences(model_name: str, headers: dict, payload: dict, pageno: str, timeout: float = 300) -> str:
    """Send one chat-completion request and store the repaired English text.

    The text between ``<english>`` and ``</english>`` in the model's answer is
    stored in the module-level ``english_texts_2`` under ``pageno``.

    Args:
        model_name: Unused here; the model is taken from ``payload``.
        headers: HTTP headers for the request.
        payload: JSON body of the request.
        pageno: Page number used as the key in ``english_texts_2`` and in
            error messages.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the notebook forever.

    Returns:
        The full message content returned by the model.

    Raises:
        RuntimeError: If the server answers with an HTTP error status.
        ValueError: If the answer contains no ``<english>...</english>`` section.
    """
    global english_texts_2
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        json=payload,
        headers=headers,
        timeout=timeout,
    )
    if not response.ok:
        # Report the status and the body directly instead of failing later
        # with a KeyError on 'choices'.
        raise RuntimeError(
            f"Pageno: {pageno}, request failed with HTTP {response.status_code}: {response.text[:500]}"
        )
    result = response.json()
    content = result['choices'][0]['message']['content']
    match = re.search(r'<english>(.*?)</english>', content, re.DOTALL)
    if match is None:
        # Say what is wrong rather than "'NoneType' object has no attribute 'group'".
        raise ValueError(f'Pageno: {pageno}, "english" section was not found in the model response')
    english_texts_2[pageno] = match.group(1)
    return content

@log_execution_time_synchronous
def main_broken_sentences(model_name, headers):
    """Walk through all pages in order and repair sentences split across pages.

    For every page not yet present in ``english_texts_2`` a payload is built
    from the page and its successor, sent to the model, and the answer is
    stored. The ``<final_fragment_2>`` value of one answer is handed to the
    request for the next page.

    Args:
        model_name: Model used for every request.
        headers: HTTP headers forwarded to the request function.

    Returns:
        Tuple ``(payloads, outputs)``: the module-level dicts holding, per
        page number, the prompt text that was sent and the model's answer.
    """
    global payloads, outputs

    def _final_fragment(content):
        """Return the <final_fragment_2> text of a model answer, or None if absent."""
        match = re.search(r'<final_fragment_2>(.*?)</final_fragment_2>', content, re.DOTALL)
        return match.group(1) if match else None

    fragment_2 = ''
    for i, pageno in enumerate(all_pagenos):
        if pageno in english_texts_2.keys():
            logger.info(f'Skipping pageno: {pageno}')
            # Carry over the fragment this page produced earlier (if its
            # answer was kept) rather than the one of some older page.
            fragment_2 = _final_fragment(outputs.get(pageno, '')) or ''
            continue

        if i + 1 < len(all_pagenos):
            next_pageno = all_pagenos[i + 1]
            german_pages = german_body_contents
        else:
            # The last page has no successor: offer an empty following page
            # instead of indexing past the end of all_pagenos.
            next_pageno = None
            german_pages = {**german_body_contents, next_pageno: ''}

        try:
            logger.info(f"Processing i:{i},  pageno: {pageno}")
            payload = construct_payload_2(german_pages, english_body_contents, fragment_2, model_name, pageno, next_pageno)
            payloads[pageno] = payload['messages'][1]['content'][0]['text']
            content = make_gpt_request_for_broken_sentences(model_name, headers, payload, pageno)
            outputs[pageno] = content
            fragment_2 = _final_fragment(content)
            if fragment_2 is None:
                logger.warning(f'Pageno: {pageno}, "final_fragment_2" section was not found')
                fragment_2 = ''
            logger.info(f"next round's fragment_2: {fragment_2}")
        except Exception as e:
            logger.error(f"Error processing pageno {pageno}: {e}")
            # The fragment of an earlier page says nothing about the page after
            # the failed one, so do not pass it on.
            fragment_2 = ''

    return payloads, outputs

# english_texts_2 is emptied first, so the "pageno already in english_texts_2"
# skip inside main_broken_sentences never triggers here: every page is sent to
# the model again each time this cell runs.
english_texts_2 = {}
payloads, outputs = main_broken_sentences("gpt-4o-2024-08-06", headers)


2024-12-29 01:05:55,898 - INFO - Processing i:0,  pageno: 001
2024-12-29 01:06:01,631 - INFO - next round's fragment_2: 
2024-12-29 01:06:01,633 - INFO - Processing i:1,  pageno: 002
2024-12-29 01:06:05,826 - INFO - next round's fragment_2: 
2024-12-29 01:06:05,828 - INFO - Processing i:2,  pageno: 003
2024-12-29 01:06:21,916 - INFO - next round's fragment_2: 
2024-12-29 01:06:21,918 - INFO - Processing i:3,  pageno: 004
2024-12-29 01:06:26,408 - INFO - next round's fragment_2: 
2024-12-29 01:06:26,410 - INFO - Processing i:4,  pageno: 005
2024-12-29 01:06:36,648 - INFO - next round's fragment_2: 
2024-12-29 01:06:36,650 - INFO - Processing i:5,  pageno: 006
2024-12-29 01:06:45,763 - INFO - next round's fragment_2: 
2024-12-29 01:06:45,765 - INFO - Processing i:6,  pageno: 007
2024-12-29 01:06:55,695 - INFO - next round's fragment_2: 
2024-12-29 01:06:55,697 - INFO - Processing i:7,  pageno: 008
2024-12-29 01:07:06,858 - INFO - next round's fragment_2: 
2024-12-29 01:07:06,860 - INFO -

KeyboardInterrupt: 

In [47]:
# Scratch cell: inspect what was stored for page 006.
body = re.search(r'<body>(.*?)</body>', german_texts['006'], re.DOTALL).group(1) 
# The three bare expressions below are evaluated but not shown: a notebook only
# displays the value of a cell's last expression, and this cell ends with
# print(), which is the only output that appears.
body
german_body_contents['006']
german_texts['006']
print(german_body_contents['006'])



3. Die Winterschlacht in Masuren . . . . . . . . . . . . . . . . . 172
[Detailed subsections a-h with page numbers]
4. Schutz der rechten Flanke der 10. Armee . . . . . . . . . . . . 243
[Subsections a-b]
5. Neuer Entschluß des Oberbefehlshabers Ost (22. bis 27. Februar 1915) 257
[Sections 6-10]
Wechselnde Pläne des Generals v. Falkenhayn . . . . . . . . . . . 301
Die Schaffung einer neuen Heeresreserve . . . . . . . . . . . . . 301
Erwägungen für einen kriegsentscheidenden Durchbruch im Westen . . 307
Die politische Lage und ihr Einfluß auf die militärischen Entschließungen . . . . . . . . . . . . . . . . . . . . . . . 323



```
2024-12-28 19:32:18,093 - INFO - Processing i:0,  pageno: 001
2024-12-28 19:32:22,620 - INFO - next round's final_fragment_2: 
2024-12-28 19:32:22,622 - INFO - Processing i:1,  pageno: 002
2024-12-28 19:32:25,996 - INFO - next round's final_fragment_2: 
2024-12-28 19:32:25,998 - INFO - Processing i:2,  pageno: 003
2024-12-28 19:32:33,684 - INFO - next round's final_fragment_2: 
2024-12-28 19:32:33,686 - INFO - Processing i:3,  pageno: 004
2024-12-28 19:32:37,773 - INFO - next round's final_fragment_2: 
2024-12-28 19:32:37,775 - INFO - Processing i:4,  pageno: 005
2024-12-28 19:32:49,039 - INFO - next round's final_fragment_2: 
2024-12-28 19:32:49,041 - INFO - Processing i:5,  pageno: 006
2024-12-28 19:32:57,111 - ERROR - Error processing pageno 006: 'NoneType' object has no attribute 'group'
2024-12-28 19:32:57,113 - INFO - Processing i:6,  pageno: 007
2024-12-28 19:33:04,234 - INFO - next round's final_fragment_2: 
2024-12-28 19:33:04,236 - INFO - Processing i:7,  pageno: 008
2024-12-28 19:33:13,204 - INFO - next round's final_fragment_2: 
2024-12-28 19:33:13,206 - INFO - Processing i:8,  pageno: 009
2024-12-28 19:33:25,822 - INFO - next round's final_fragment_2: 
2024-12-28 19:33:25,825 - INFO - Processing i:9,  pageno: 010
2024-12-28 19:33:43,241 - INFO - next round's final_fragment_2: 
2024-12-28 19:33:43,244 - INFO - Processing i:10,  pageno: 011
2024-12-28 19:33:50,170 - ERROR - Error processing pageno 011: 'NoneType' object has no attribute 'group'
2024-12-28 19:33:50,172 - INFO - Processing i:11,  pageno: 012
2024-12-28 19:33:53,184 - INFO - next round's final_fragment_2: 
2024-12-28 19:33:53,186 - INFO - Processing i:12,  pageno: 013
2024-12-28 19:34:01,060 - INFO - next round's final_fragment_2: 
2024-12-28 19:34:01,062 - INFO - Processing i:13,  pageno: 014
2024-12-28 19:34:10,548 - INFO - next round's final_fragment_2: 
2024-12-28 19:34:10,550 - INFO - Processing i:14,  pageno: 015
2024-12-28 19:34:17,224 - INFO - next round's final_fragment_2: 5. Armee, Generalmajor Schmidt v. Knobelsdorf, gewandt mit dem Ersuchen um Vorlage von Operationsentwürfen für eine neue Offensive im Westen, bei denen zur Voraussetzung gemacht war, daß „außer den in der Front stehenden und zum Halten derselben nötigen Truppen sechs Armeekorps mit reichlicher Munition zum Einsatz an beliebiger Stelle verfügbar sein würden“.
2024-12-28 19:34:17,226 - INFO - Processing i:15,  pageno: 016
2024-12-28 19:34:21,635 - INFO - next round's final_fragment_2: Durchhalten Österreich-Ungarns und der Türkei sowie die Entwicklung der Dinge auf dem Balkan abhängig.
2024-12-28 19:34:21,636 - INFO - Processing i:16,  pageno: 017
2024-12-28 19:34:27,025 - INFO - next round's final_fragment_2: der Infanterie Freiherrn v. Lyncker, in entgegengesetztem Sinne beraten war), lehnte indessen den Antrag des Reichskanzlers ab.
2024-12-28 19:34:27,027 - INFO - Processing i:17,  pageno: 018
2024-12-28 19:34:36,183 - INFO - next round's final_fragment_2: 
2024-12-28 19:34:36,185 - INFO - Processing i:18,  pageno: 019
2024-12-28 19:34:43,345 - INFO - next round's final_fragment_2: standpunkt des Verbündeten geltend gemacht: „Niemals dürfen ihm jedoch die österreichisch-ungarischen Armeen unterstellt werden, das wäre nicht nur aus nationalen und dynastischen, sondern auch aus politischen und operativen Gründen ganz unzulässig. Wir würden dann jede Freiheit des Handelns verlieren und wären auf Gnade und Ungnade ausgeliefert.“
2024-12-28 19:34:43,347 - INFO - Processing i:19,  pageno: 020
2024-12-28 19:34:49,391 - INFO - next round's final_fragment_2: Diese Darlegungen riefen den lebhaften Widerspruch des Generals v. Conrad hervor: „An Befriedigung der Wünsche Italiens“ — so drahtete er am 7. Januar nach Mézières zurück — „und gar in weitgehendem Maße ist nicht zu denken.
2024-12-28 19:34:49,392 - INFO - Processing i:20,  pageno: 021
2024-12-28 19:34:58,706 - INFO - next round's final_fragment_2: 
2024-12-28 19:34:58,708 - INFO - Processing i:21,  pageno: 022
2024-12-28 19:35:03,108 - INFO - next round's final_fragment_2: Oberbefehlshaber Ost auf Grund seiner kriegerischen Erfolge im Heere wie im Volke genoss,
2024-12-28 19:35:03,110 - INFO - Processing i:22,  pageno: 023
2024-12-28 19:35:07,920 - INFO - next round's final_fragment_2: meines Stabes bestimmt ist, glaube ich folgern zu dürfen, daß die Abkommandierung des Generalleutnants nur eine vorübergehende sein soll.
2024-12-28 19:35:07,921 - INFO - Processing i:23,  pageno: 024
2024-12-28 19:35:12,538 - INFO - next round's final_fragment_2: gen an Italien bereitfinden müsse.
2024-12-28 19:35:12,540 - INFO - Processing i:24,  pageno: 025
2024-12-28 19:35:16,128 - INFO - next round's final_fragment_2: gehend behoben werden.
2024-12-28 19:35:16,130 - INFO - Processing i:25,  pageno: 026
2024-12-28 19:35:23,518 - INFO - next round's final_fragment_2: in Ostpreußen nur „größere örtliche Erfolge“ im Gegensatz zu den Führern im Osten, die von diesen Operationen bei genügend starkem Krafteinsatz eine entscheidenden Umschwingung der Kriegslage erhofften.
2024-12-28 19:35:23,520 - INFO - Processing i:26,  pageno: 027
2024-12-28 19:35:27,573 - INFO - next round's final_fragment_2: 
2024-12-28 19:35:27,575 - INFO - Processing i:27,  pageno: 028
2024-12-28 19:35:32,498 - INFO - next round's final_fragment_2: v. Falkenhayn aufs neue zu Sicherungsmaßnahmen Anlaß gab.
2024-12-28 19:35:32,500 - INFO - Processing i:28,  pageno: 029
2024-12-28 19:35:39,892 - INFO - next round's final_fragment_2: Gründen das Zurücknehmen einer oder mehrerer Armeen in eine weiter rückwärts liegende Stellung notwendig werden, so werden rechtzeitig die hierfür erforderlichen Anweisungen von der Obersten Heeresleitung ergehen; für den Ausbau einer rückwärtigen Stellung seitens der Armee in dem hier gedachten Sinne kommt dieser Fall nicht in Betracht.
2024-12-28 19:35:39,894 - INFO - Processing i:29,  pageno: 030
2024-12-28 19:35:48,984 - INFO - next round's final_fragment_2: weit seine Steigerung bei ernsten Angriffsoperationen die Infanterie zwingen werde, sich durch tieferes Eingraben in neuartigen Anlagen oder durch größere Verteilung der Deckungen nach rückwärts zu schützen, mußte der Truppenerfahrung überlassen bleiben.
2024-12-28 19:35:48,985 - INFO - Finished main_broken_sentences in 210.89 seconds.
```

---
## Experiments