In [1]:
%%time
%load_ext autoreload
%autoreload 2

# Standard Python modules
import time 
import numpy as np
import os
import sys
import ipdb
import requests

sys.path.append(os.path.abspath(".."))
import json
import re
import glob
import asyncio
import aiohttp
import openai
from typing import Dict, Tuple, List, Callable
from dotenv import load_dotenv
from pathlib import Path
import PyPDF2

# Project imports
from src.utils import (timeit, encode_image, plt, pylab, find_bad_pagenos, 
                    delete_bad_pagenos, dump_fragmented_output_to_json, get_corresponding_bad_fnames,
                    load_output_from_json, logging_for_main, dump_output_to_json, log_execution_time, count_num_tokens)
from src.processing import compute_log_spectrum_1d, extract_image_bbox, save_images
from src.api_requests_gpt import construct_payload_for_gpt, process_single_page
from src.api_requests_claude import construct_payload_for_claude
from src.document_generation import save_document, setup_logger, chapter_splitter

# Display and plotting, set notebook display width
from IPython.display import display, HTML, clear_output

# Widen the classic-notebook container so long log lines and page text fit on screen.
display(HTML("<style>.container { width:90% !important; }</style>"))

# Print Python environment info
print('sys.executable:', sys.executable)
print('sys.version:', sys.version, '\n')

# Load environment variables from .env file
# (path is relative to the notebook's working directory, one level above it)
env_path = Path('../.env')  # Adjust path if needed
load_dotenv(dotenv_path=env_path)

# OpenAI API setup
# NOTE: os.getenv returns None when OPENAI_API_KEY is unset; nothing here checks for that,
# so a missing key only surfaces later when a request is sent.
openai.api_key = os.getenv("OPENAI_API_KEY")

# Setup for PDF processing
# `foldername` names the volume being processed; later cells use it to build the
# input/output paths and to index `good_pagenos`.
foldername = "Der Weltkrieg v8"

# Split the source PDF into separate per-page files under ../input_data/<foldername>.
# This runs on every execution of the cell (see the chapter_splitter log lines below).
chapter_splitter(input_pdf_path=f"../input_data/{foldername} East Front.pdf", 
                 output_folder=f"../input_data/{foldername}")

chapter_splitter - L31 - INFO - Extracted 678 pages from PDF


sys.executable: /Users/ozkansafak/code/fraktur/.venv/bin/python3
sys.version: 3.10.9 (main, Mar  1 2023, 12:20:14) [Clang 14.0.6 ] 



chapter_splitter - L45 - INFO - All pages have been split and saved to ../input_data/Der Weltkrieg v8


CPU times: user 1.59 s, sys: 2.25 s, total: 3.83 s
Wall time: 1.48 s


In [2]:
# Initialize variables
# `plotter` is read as a global by `main` and forwarded to `process_single_page`.
plotter = False
# Glob for the per-page PDF files of the current volume (despite the name, these are PDFs).
image_path = f"../input_data/{foldername}/*pdf"
fnames = sorted(glob.glob(image_path))
# Page id = whatever sits between "page_" and ".pdf" in each file name (kept as a string,
# e.g. '004'). NOTE(review): re.search returns None for a file that does not match the
# pattern, so any stray PDF in the folder makes this line raise AttributeError.
all_pagenos = [re.search(r'page_(.*?)\.pdf', fname, re.DOTALL).group(1) for fname in fnames]

# Storage for processed texts, keyed by page id.
# WARNING: re-running this cell rebinds all three dicts to empty and discards any results
# already collected in the kernel.
raw_german_texts: Dict[str, str] = {}
german_texts: Dict[str, str] = {}
english_texts: Dict[str, str] = {}

# Configure logging
logger = setup_logger('time_logger')

---
## PART 1
## Fraktur Translator (GPT-4o)

In [3]:
# raw_german_texts, german_texts, english_texts, _ = load_output_from_json(foldername) 

In [4]:
@log_execution_time
async def main(fnames, model_name="", semaphore_count=10, extract=True):
    """Process every not-yet-processed page concurrently and store the results.

    A page counts as processed when its page id is already a key of the notebook-level
    dict ``raw_german_texts``; such pages are skipped, which makes the function safe to
    call repeatedly (e.g. after deleting bad pages).

    Args:
        fnames: Paths of the per-page PDF files; the page id is taken from the
            ``page_<id>.pdf`` part of each name.
        model_name: Model identifier forwarded to ``process_single_page``.
        semaphore_count: Maximum number of pages processed at the same time.
        extract: Forwarded unchanged to ``process_single_page``.

    Returns:
        None. Results are written into the notebook-level dicts ``raw_german_texts``,
        ``german_texts`` and ``english_texts``; failures are logged and skipped.
    """
    page_pattern = re.compile(r'page_(.*?)\.pdf', re.DOTALL)
    semaphore = asyncio.Semaphore(semaphore_count)  # Adjust number based on API limits

    async def wrapper_process_page(fname: str, pageno: str) -> Tuple[str, Tuple]:
        # The semaphore caps how many pages are in flight at once.
        async with semaphore:
            result = await process_single_page(fname, model_name, plotter, pageno, extract)
            return pageno, result

    # A list of coroutine objects. include only the unprocessed pages.
    tasks = []
    print(f'Composing tasks to be executed with asyncio, semaphore_count = {semaphore_count} ')
    for fname in fnames:
        pageno = page_pattern.search(fname).group(1)
        if pageno not in raw_german_texts:
            print(f'{pageno} ', end='')
            tasks.append(wrapper_process_page(fname, pageno))
    print(f"main: len(tasks): {len(tasks)} -- Processing tasks as they complete")

    # Process tasks as they complete
    for i, task in enumerate(asyncio.as_completed(tasks)):
        # Reset per iteration: the error handler below reports token_count, which would
        # otherwise be unbound on a first-task failure (raising inside the handler and
        # aborting the loop) or carry a stale value from the previous page.
        token_count = None
        try:
            pageno, (_content, token_count, raw_german_text, german_text, english_text) = await task
            raw_german_texts[pageno] = raw_german_text
            german_texts[pageno] = german_text
            english_texts[pageno] = english_text

            logging_for_main(i, tasks, pageno, token_count, raw_german_text, german_text, english_text)
        except Exception as e:
            logger.error(f"{i} of {len(tasks)-1} - Error processing a task: {e}. token_count:{token_count}")
    return

# Run the async code
# First pass: GPT-4o over every page that is not yet in `raw_german_texts`, at most 10
# pages in flight; `extract` is left at its default (True).
# Top-level `await` works here because IPython runs cells with autoawait.
await main(fnames, model_name="gpt-4o-2024-08-06", semaphore_count=10)


Composing tasks to be executed with asyncio, semaphore_count = 10 
001 002 003 004 005 006 007 008 009 010 011 012 013 014 015 016 017 018 019 020 021 022 023 024 025 026 027 028 029 030 031 032 033 034 035 036 037 038 039 040 041 042 043 044 045 046 047 048 049 050 051 052 053 054 055 056 057 058 059 060 061 062 063 064 065 066 067 068 069 070 071 072 073 074 075 076 077 078 079 080 081 082 083 084 085 086 087 088 089 090 091 092 093 094 095 096 097 098 099 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 2

logging_for_main - L259 - INFO - 0 of 677 -- Successfully processed pageno:437. token_count:2076
logging_for_main - L259 - INFO - 1 of 677 -- Successfully processed pageno:335. token_count:1975
logging_for_main - L259 - INFO - 2 of 677 -- Successfully processed pageno:333. token_count:2083
logging_for_main - L259 - INFO - 3 of 677 -- Successfully processed pageno:137. token_count:2175
logging_for_main - L259 - INFO - 4 of 677 -- Successfully processed pageno:520. token_count:2031
logging_for_main - L259 - INFO - 5 of 677 -- Successfully processed pageno:194. token_count:2040
logging_for_main - L259 - INFO - 6 of 677 -- Successfully processed pageno:660. token_count:3257
logging_for_main - L259 - INFO - 7 of 677 -- Successfully processed pageno:196. token_count:2112
logging_for_main - L259 - INFO - 8 of 677 -- Successfully processed pageno:659. token_count:3239
logging_for_main - L259 - INFO - 9 of 677 -- Successfully processed pageno:181. token_count:2041
logging_for_main - L259 - INFO

pageno:170[997]—Glinik line on May 16th and 17th. Simultaneously, the enemy, using relentless manpower, assaulted the bridgehead of Kolomea, whose now reinforced garrison held firm. However, the Russians succeeded on May 19th in breaking into the trenches on the southern Pruth bank at Kumaniczy, from which they were soon expelled. Then a longer pause in fighting occurred on the right flank of the 7th Army. General Freiherr von Pflanzer-Baltin refrained from resuming the offensive, considering the strength of the opposing enemy.</body>

------------------------------------------------------------------


logging_for_main - L259 - INFO - 195 of 677 -- Successfully processed pageno:616. token_count:1752
logging_for_main - L259 - INFO - 196 of 677 -- Successfully processed pageno:570. token_count:1715
logging_for_main - L259 - INFO - 197 of 677 -- Successfully processed pageno:647. token_count:2400
logging_for_main - L259 - INFO - 198 of 677 -- Successfully processed pageno:428. token_count:1881
logging_for_main - L259 - INFO - 199 of 677 -- Successfully processed pageno:427. token_count:2004
logging_for_main - L259 - INFO - 200 of 677 -- Successfully processed pageno:606. token_count:1859
logging_for_main - L259 - INFO - 201 of 677 -- Successfully processed pageno:265. token_count:2025
logging_for_main - L259 - INFO - 202 of 677 -- Successfully processed pageno:605. token_count:1212
extract_text_section - L243 - ERROR - pageno: 424, section name:"raw_german" was not found. token_count=5000
extract_text_section - L243 - ERROR - pageno: 424, section name:"german" was not found. token_count

pageno:511[of the 80th R. D.], 14th Ldw. D. and Abt. Etschel), Suter Group (XXI. A. K. [31st and 42nd S. D., 115th I. D., 77th R. D., Div. Zenker]), Eben Group (Gen. Kdo. I. A. K. with 2nd Ldw. D., 58th and 4th S. D. and 4th R. D.), Garnier Cavalry Corps (1st and 9th R. D., assigned 3rd R. D. of the Njemen Army). — 6th Ldw. Br. and Brig. Monteton had the strength of a division.
³) p. 535.</footer>

------------------------------------------------------------------


logging_for_main - L259 - INFO - 502 of 677 -- Successfully processed pageno:053. token_count:2012
logging_for_main - L259 - INFO - 503 of 677 -- Successfully processed pageno:244. token_count:2016
logging_for_main - L259 - INFO - 504 of 677 -- Successfully processed pageno:099. token_count:1893
logging_for_main - L259 - INFO - 505 of 677 -- Successfully processed pageno:517. token_count:2140
logging_for_main - L259 - INFO - 506 of 677 -- Successfully processed pageno:544. token_count:1830
logging_for_main - L259 - INFO - 507 of 677 -- Successfully processed pageno:100. token_count:1972
logging_for_main - L259 - INFO - 508 of 677 -- Successfully processed pageno:097. token_count:1904
logging_for_main - L259 - INFO - 509 of 677 -- Successfully processed pageno:536. token_count:1654
logging_for_main - L259 - INFO - 510 of 677 -- Successfully processed pageno:342. token_count:1941
logging_for_main - L259 - INFO - 511 of 677 -- Successfully processed pageno:359. token_count:1946
logging_fo

pageno:559[Sommer]) and 17th Ldw. D. (Abt. Eisebed) were unnamed or newly formed.
</footer>

------------------------------------------------------------------


logging_for_main - L259 - INFO - 545 of 677 -- Successfully processed pageno:494. token_count:2028
logging_for_main - L259 - INFO - 546 of 677 -- Successfully processed pageno:114. token_count:1083
logging_for_main - L259 - INFO - 547 of 677 -- Successfully processed pageno:549. token_count:1924
logging_for_main - L259 - INFO - 548 of 677 -- Successfully processed pageno:020. token_count:1830
logging_for_main - L259 - INFO - 549 of 677 -- Successfully processed pageno:125. token_count:1974
logging_for_main - L259 - INFO - 550 of 677 -- Successfully processed pageno:316. token_count:1812
logging_for_main - L259 - INFO - 551 of 677 -- Successfully processed pageno:540. token_count:2015
logging_for_main - L259 - INFO - 552 of 677 -- Successfully processed pageno:021. token_count:1883
logging_for_main - L259 - INFO - 553 of 677 -- Successfully processed pageno:330. token_count:2065
logging_for_main - L259 - INFO - 554 of 677 -- Successfully processed pageno:360. token_count:1880
logging_fo

In [14]:
# Per-volume sets of page ids handed to `find_bad_pagenos` as its last argument.
# NOTE(review): presumably these are pages to accept as-is even though they trip the
# bad-page checks -- confirm against `find_bad_pagenos` in src.utils.
good_pagenos = {}

## V5, V6
good_pagenos['Der Weltkrieg v5'] = {'213', '225', '221'}
good_pagenos['Der Weltkrieg v6'] = {'377', '378', '379'}

## V7 ('417' and '429' do not get OCRed by GPT-4o, and get summarized by Claude)
good_pagenos['Der Weltkrieg v7'] = {'265', '407', '409', '410', '412', '418', '420', '421', '423', '424', '428'}

## V10, V11, V2
good_pagenos['Der Weltkrieg v10'] = {'083'}
# `{}` is an empty dict, not an empty set; use set() so every entry has the same type.
good_pagenos['Der Weltkrieg v11'] = set()
good_pagenos['Der Weltkrieg v2'] = {'246', '247'}

## V8
good_pagenos['Der Weltkrieg v8'] = {'170', '511', '559', '678'}

# Report the pages of the current volume whose texts look broken. Deletion is left
# commented out here; PART 2 deletes and reruns them.
bad_pagenos = find_bad_pagenos(all_pagenos, raw_german_texts, german_texts, english_texts, good_pagenos[foldername])
# delete_bad_pagenos(bad_pagenos, raw_german_texts, german_texts, english_texts)


> pageno: 004. `raw_german_texts` pageno: 004, "<raw_german>" section was not found

> pageno: 006. `raw_german_texts` pageno: 006, "<raw_german>" section was not found

> pageno: 007. `raw_german_texts` pageno: 007, "<raw_german>" section was not found

> pageno: 008. `raw_german_texts` pageno: 008, "<raw_german>" section was not found

> pageno: 014. `raw_german_texts` pageno: 014, "<raw_german>" section was not found

> pageno: 334. `raw_german_texts` pageno: 334, "<raw_german>" section was not found

> pageno: 424. `raw_german_texts` pageno: 424, "<raw_german>" section was not found

> pageno: 630. `raw_german_texts` pageno: 630, "<raw_german>" section was not found

> pageno: 651. `english_texts` pageno: 651, "<english>" section was not found

> pageno: 656. `english_texts` pageno: 656, "<english>" section was not found

> pageno: 670. `raw_german_texts` pageno: 670, "<raw_german>" section was not found

> pageno: 678. `raw_german_texts` pageno: 678, "<raw_german>" section was no

---
## PART 2
##  Handle missing keys (Claude Sonnet)

In [51]:
# Check for unprocessed pages and print them to stdout.
# ------------------------------------------------------------------------------------
# 1. Rerun the missing pages on Claude without performing FFT based extraction. 
# ------------------------------------------------------------------------------------

# Get missing keys based on empty raw_german_texts
bad_pagenos = find_bad_pagenos(all_pagenos, raw_german_texts, german_texts, english_texts, good_pagenos[foldername])
delete_bad_pagenos(bad_pagenos, raw_german_texts, german_texts, english_texts)
await main(fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=False)

# ------------------------------------------------------------------------------------
# 2. If there still are missing pages, run them performing FFT based extraction. 
#    This time compute missing_keys based on 'english_texts'.
# ------------------------------------------------------------------------------------

# Now get missing keys based on empty english_texts
bad_pagenos = find_bad_pagenos(all_pagenos, raw_german_texts, german_texts, english_texts, good_pagenos[foldername])
delete_bad_pagenos(bad_pagenos, raw_german_texts, german_texts, english_texts)
await main(fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=True)

bad_pagenos = find_bad_pagenos(all_pagenos, raw_german_texts, german_texts, english_texts, good_pagenos[foldername])
delete_bad_pagenos(bad_pagenos, raw_german_texts, german_texts, english_texts)
await main(fnames, model_name="claude-3-5-sonnet-20241022", semaphore_count=1, extract=False)

bad_pagenos = find_bad_pagenos(all_pagenos, raw_german_texts, german_texts, english_texts, good_pagenos[foldername])



> pageno: 651. `english_texts` [Continued translation of all entries maintaining the same format and structure as the original, including all page numbers and military ranks/positions]
</body>


> pageno: 656. `raw_german_texts` [continued transcription of all entries through to "von Mackenſen"]


> pageno: 670. `raw_german_texts` [Content continues with detailed listings for 3. Armee through Gruppe Mittel]
Weltkrieg. VIII. Band.                           43


bad_pagenos (3): ['651', '656', '670']
------------------------------------------------------------------
> Deleting pageno: 651. Please run `await main(fnames, model_name, semaphore_count)` again 
> Deleting pageno: 656. Please run `await main(fnames, model_name, semaphore_count)` again 
> Deleting pageno: 670. Please run `await main(fnames, model_name, semaphore_count)` again 
Composing tasks to be executed with asyncio, semaphore_count = 1 
651 656 670 main: len(tasks): 3 -- Processing tasks as they complete


logging_for_main - L252 - ERROR - main pageno:651. '[' present in english_text. token_count:1600


pageno:651[Continuing with faithful translations of all entries, maintaining the same structure and reference numbers as in the original]
</body>

------------------------------------------------------------------


extract_text_section - L243 - ERROR - pageno: 670, section name:"raw_german" was not found. token_count=714
logging_for_main - L252 - ERROR - main pageno:670. '[' present in english_text. token_count:714


pageno:670[Content continues with detailed listings for 3rd Army through Group Mittel]
World War. Volume VIII.
</body>
<pageno>43</pageno>

------------------------------------------------------------------


logging_for_main - L252 - ERROR - main pageno:656. '[' present in english_text. token_count:1711
wrapper - L88 - INFO - Finished main in 88.28 seconds.


pageno:656[Continued translation of the entire index with all military ranks, positions, and page numbers maintained as in the original]
</body>

------------------------------------------------------------------

> pageno: 651. `english_texts` [Continuing with faithful translations of all entries, maintaining the same structure and reference numbers as in the original]
</body>


> pageno: 656. `english_texts` [Continued translation of the entire index with all military ranks, positions, and page numbers maintained as in the original]
</body>


> pageno: 670. `english_texts` [Content continues with detailed listings for 3rd Army through Group Mittel]
World War. Volume VIII.
</body>
<pageno>43</pageno>


bad_pagenos (3): ['651', '656', '670']
------------------------------------------------------------------
> Deleting pageno: 651. Please run `await main(fnames, model_name, semaphore_count)` again 
> Deleting pageno: 656. Please run `await main(fnames, model_name, semaphore_count)` agai

logging_for_main - L252 - ERROR - main pageno:651. '[' present in english_text. token_count:1381


pageno:651[Continued on right column...]
</body>

------------------------------------------------------------------


logging_for_main - L259 - INFO - 1 of 2 -- Successfully processed pageno:656. token_count:2042
logging_for_main - L252 - ERROR - main pageno:670. '[' present in english_text. token_count:923
wrapper - L88 - INFO - Finished main in 73.00 seconds.


pageno:670[Content continues with detailed listing of military units and their corresponding numbers...]
World War. Volume VIII.               43
</body>

------------------------------------------------------------------

> pageno: 651. `raw_german_texts` [Continued on right column...]


> pageno: 656. `raw_german_texts` [Continued in next part due to length...]


> pageno: 670. `raw_german_texts` [Content continues with detailed listing of military units and their corresponding numbers...]
Weltkrieg. VIII. Band.                43


bad_pagenos (3): ['651', '656', '670']
------------------------------------------------------------------
> Deleting pageno: 651. Please run `await main(fnames, model_name, semaphore_count)` again 
> Deleting pageno: 656. Please run `await main(fnames, model_name, semaphore_count)` again 
> Deleting pageno: 670. Please run `await main(fnames, model_name, semaphore_count)` again 
Composing tasks to be executed with asyncio, semaphore_count = 1 
651 656 670 

logging_for_main - L252 - ERROR - main pageno:651. '[' present in english_text. token_count:1647


pageno:651[Continuing with faithful translations of all entries, maintaining all military ranks, unit designations, and page numbers as in the original]
</body>

------------------------------------------------------------------


logging_for_main - L252 - ERROR - main pageno:670. '[' present in english_text. token_count:1492


pageno:670[Content continues with same format through all army listings and corps]</body>
<footer>World War. Volume VIII.                43</footer>

------------------------------------------------------------------


logging_for_main - L252 - ERROR - main pageno:656. '[' present in english_text. token_count:1734
wrapper - L88 - INFO - Finished main in 100.34 seconds.


pageno:656[Continuing with all entries in the same detailed manner, maintaining all military ranks, unit designations, and page references as in the original]
</body>

------------------------------------------------------------------

> pageno: 651. `english_texts` [Continuing with faithful translations of all entries, maintaining all military ranks, unit designations, and page numbers as in the original]
</body>


> pageno: 656. `english_texts` [Continuing with all entries in the same detailed manner, maintaining all military ranks, unit designations, and page references as in the original]
</body>


> pageno: 670. `raw_german_texts` [Continued content follows same format through XIX. Korps and various other Korps and Gruppe listings]
Weltkrieg. VIII. Band.                43


bad_pagenos (3): ['651', '656', '670']


In [61]:
# Persist the three text dicts as JSON (the log line below reports
# ../output_data/<foldername>/*json).
dump_output_to_json(foldername, raw_german_texts, german_texts, english_texts)

# Write one document per language; `language` doubles as the output file name stem
# (see the "saved to" log lines). The original comment describes these as .docx files.
# The returned (document, file name) pairs are not used further in the visible cells.
doc1, fname1 = save_document(german_texts, foldername, language=f'{foldername} - German')
doc2, fname2 = save_document(english_texts, foldername, language=f'{foldername} - English')



dumped to ../output_data/Der Weltkrieg v8/*json files.


save_document - L197 - INFO - saved to "../output_data/Der Weltkrieg v8/Der Weltkrieg v8 - German"
save_document - L197 - INFO - saved to "../output_data/Der Weltkrieg v8/Der Weltkrieg v8 - English"


``` 
1.  Upload Input folder of pdfs to blob storage.
2.  Read file from s3.
3.  FFT in y -> (x_hi, x_lo), write half_cropped_image to s3
4.  FFT in x -> (y_hi, y_lo), write cropped_image to s3
5.  Read cropped image from s3 -> encode_image -> translate and transcribe -> JSON output

```

### Available models and pricing:
```
"gpt-4o-2024-08-06":
    "price_txt": "$2.50 / 1M input tokens"
    "price_img": "$0.001913 / 1500px^2"
    
"gpt-4o-mini-2024-07-18":
    "price_txt": "$0.150 / 1M input tokens"
    "price_img": "$0.003825 / 1500px^2"
    
```

---
## PART 3
## Load the German text and translate fragmented sentences.

In [62]:
# initialize output dicts (all keyed by page id)
# WARNING: re-running this cell discards any defragmentation results held in the kernel.
english_texts_defragmented = {}
# raw LLM reply per page
contents = {}
# prompt text sent per page
payloads = {}
# German fragment reported per page inside <fragment_2>...</fragment_2>.
# Pre-seeded with two keys:
#   None             -> looked up as the "previous page" of the very first page,
#   all_pagenos[-1]  -> the last page, which `main_broken_sentences` never iterates over.
fragments_2 = {None:'', all_pagenos[-1]:''}

# Uncomment to resume from the JSON files of an earlier run instead of starting empty.
# raw_german_texts, german_texts, english_texts, english_texts_defragmented = load_output_from_json(foldername, load_defrag=True)

In [211]:
from src.utils import log_execution_time_synchronous
from src.api_requests_gpt import make_gpt_request_for_broken_sentences, construct_gpt_payload_fragmented_sentences
from src.api_requests_claude import make_claude_request_for_broken_sentences, construct_claude_payload_fragmented_sentences


@log_execution_time_synchronous
def main_broken_sentences(model_name):
    """Re-translate each page so sentences broken across a page boundary are handled.

    For every page except the last, a payload is built from the German text of the page
    and of the next page, the existing English translation of the page, and the German
    fragment (``fragment_2``) recorded for the previous page. The reply is stored in
    ``contents``, the prompt text in ``payloads`` and the new ``<fragment_2>`` in
    ``fragments_2``. ``english_texts_defragmented`` is passed to the request function.
    NOTE(review): presumably the request function stores the new translation there --
    confirm in src.api_requests_*.

    Pages that already have more than 10 characters in ``english_texts_defragmented``
    are skipped. Each page gets up to three trials; after the third failure its entry
    is set to ``''``. Progress is dumped to JSON after every page id ending in ``'0'``
    and after the second-to-last page. The last page is copied over from
    ``english_texts`` unchanged.

    Args:
        model_name: Names starting with ``'gpt'`` select the GPT payload/request
            functions; anything else selects the Claude ones.

    Returns:
        None. All results are written into notebook-level dicts.
    """
    global payloads, contents, fragments_2

    # The backend depends only on `model_name`, so choose it once instead of on every
    # trial of every page.
    if model_name.startswith('gpt'):
        construct_payload = construct_gpt_payload_fragmented_sentences
        make_llm_request = make_gpt_request_for_broken_sentences
    else:
        construct_payload = construct_claude_payload_fragmented_sentences
        make_llm_request = make_claude_request_for_broken_sentences

    for i in range(len(all_pagenos) - 1):
        pageno = all_pagenos[i]
        prev_pageno = all_pagenos[i-1] if i > 0 else None
        next_pageno = all_pagenos[i+1]
        if pageno in english_texts_defragmented and len(english_texts_defragmented[pageno]) > 10:
            logger.info(f'pageno:{pageno} already in `english_texts_defragmented`... skipping')
            continue

        for trial in [1,2,3]:
            try:
                logger.info(f"Processing pageno: {pageno} {' ...trying again '+ str(trial) if trial > 1 else ''}")
                time.sleep(.1)

                payload = construct_payload(
                        german_page_1=german_texts[pageno],
                        german_page_2=german_texts[next_pageno],
                        english_page_1_old_input=english_texts[pageno],
                        # In German. Fall back to "no fragment" when the previous page has
                        # no recorded fragment (it failed or was skipped). A plain lookup
                        # raised KeyError here, and because this page then recorded no
                        # fragment either, the failure cascaded to all following pages.
                        german_page_1_top_fragment_to_be_ignored=fragments_2.get(prev_pageno, '')
                )
                payloads[pageno] = payload['messages'][1]['content'][0]['text']
                content = make_llm_request(payload, pageno, english_texts_defragmented)
                contents[pageno] = content
                fragments_2[pageno] = re.search(r'<fragment_2>(.*?)</fragment_2>', content, re.DOTALL).group(1)
                # A "fragment" spanning more than 10 line breaks is rejected.
                if fragments_2[pageno].count('\n') > 10:
                    print('not accepting fragment_2. fragments_2[pageno].count("\\n")', fragments_2[pageno].count('\n'))
                    fragments_2[pageno] = ''
                logger.info(f"pageno:{pageno} next round's `german_page_1_top_fragment_to_be_ignored`: {fragments_2[pageno]}")
                break
            except Exception as e:
                logger.error(f"Error processing. {e}. trial:{trial}, pageno: {pageno}")
                if trial == 3:
                    # Give up on this page; the empty entry marks it for a later rerun.
                    english_texts_defragmented[pageno] = ''

        # Checkpoint every tenth page id and once more right before the end.
        if pageno[-1] == '0' or pageno == all_pagenos[-2]:
            dump_fragmented_output_to_json(foldername, english_texts_defragmented, fragments_2, contents)

    # The last page has no successor to merge with, so its translation is taken as-is.
    # Indexing all_pagenos directly (instead of reusing the loop variable `next_pageno`)
    # also works when the loop body never ran.
    if all_pagenos:
        last_pageno = all_pagenos[-1]
        english_texts_defragmented[last_pageno] = english_texts[last_pageno]

main_broken_sentences(model_name='gpt')

In [227]:
# A page is considered bad when its defragmented text contains neither an opening
# nor a closing body tag.
bad_pagenos = []
for pageno in all_pagenos:
    page_text = english_texts_defragmented[pageno]
    if not ('<body>' in page_text or '</body>' in page_text):
        bad_pagenos.append(pageno)
print(f"{bad_pagenos}: ({len(bad_pagenos)})")

# Remove the bad entries so the rerun below does not treat those pages as already done.
for pageno in bad_pagenos:
    del english_texts_defragmented[pageno]

main_broken_sentences(model_name='gpt')


[]: (0)


In [213]:
# For every page whose recorded fragment is empty (no fragment reported, or the fragment
# was rejected), fall back to the original translation:
# copy english_texts -> english_texts_defragmented.
# NOTE(review): `fragments_2[pageno]` raises KeyError for any page that never got an entry
# (e.g. a page that failed all trials, or `fragments_2` re-initialised after loading
# results from JSON). Decide whether such pages should be skipped or copied.

for pageno in all_pagenos:
    if not fragments_2[pageno]:
        english_texts_defragmented[pageno] = english_texts[pageno] 

In [229]:
# Persist the defragmentation state (translations, fragments, raw LLM replies) as JSON.
dump_fragmented_output_to_json(foldername, english_texts_defragmented, fragments_2, contents)

# Write the documents again under "_defragmented" names. The German document is built
# from the same `german_texts` as in PART 2; only the English input differs.
doc1, fname1 = save_document(german_texts, foldername, language=f'{foldername} - German_defragmented')
doc2, fname2 = save_document(english_texts_defragmented, foldername, language=f'{foldername} - English_defragmented')


dumped fragmented outputs foldername: Der Weltkrieg v8.


save_document - L197 - INFO - saved to "../output_data/Der Weltkrieg v8/Der Weltkrieg v8 - German_defragmented"
save_document - L197 - INFO - saved to "../output_data/Der Weltkrieg v8/Der Weltkrieg v8 - English_defragmented"


---

## PART 4
## Fix Broken Paragraphs

In [230]:
from src.constants import *
# print(CARRIAGE_RETURN_SYSTEM_PROMPT)
# print()
# print(CARRIAGE_RETURN_USER_PROMPT)

def construct_payload_for_gpt_slash_n(english_text_defragmented,
                                      model_name="gpt-4o-2024-08-06",
                                      max_tokens=5000,
                                      temperature=0.1) -> dict:
    """Build the chat-completions payload for the line-break ("slash n") fixing step.

    Args:
        english_text_defragmented: Page text inserted into ``CARRIAGE_RETURN_USER_PROMPT``
            through its ``english_text_defragmented`` placeholder.
        model_name: Value of the payload's ``"model"`` field. Defaults to the model that
            used to be hard-coded here.
        max_tokens: Value of the payload's ``"max_tokens"`` field.
        temperature: Value of the payload's ``"temperature"`` field.

    Returns:
        A dict with a system message (``CARRIAGE_RETURN_SYSTEM_PROMPT``) and a single
        text-only user message. With default arguments it is identical to the payload
        this function produced before the extra parameters existed.
    """
    user_prompt = CARRIAGE_RETURN_USER_PROMPT.format(
        english_text_defragmented=english_text_defragmented
    )
    return {
        "model": model_name,
        "messages": [
            {"role": "system", "content": CARRIAGE_RETURN_SYSTEM_PROMPT},
            {"role": "user", "content": [{"type": "text", "text": user_prompt}]},
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }


In [231]:
# Pages whose defragmented English text contains more than 15 line breaks are collected
# as candidates for the paragraph-fixing request.
problematic = {}
for pageno in all_pagenos:
    text = english_texts_defragmented[pageno]
    n_count = text.count('\n')
    if n_count <= 15:
        continue
    problematic[pageno] = text

print('len(problematic)', len(problematic))

# Dump every candidate page so it can be inspected before sending it to the model.
for pageno in problematic:
    print(f'pageno: {pageno}\n{problematic[pageno]}')

len(problematic) 161
pageno: 004

<header>Table of Contents.
The Operations of the Year 1915.
The Events in the West in Spring and Summer,
in the East from Spring until the End of the Year.</header>
<body>                                                                Page
I. The Situation of the Central Powers in May 1915 . . . . . . 1
    1. Italy's Entry into the War . . . . . . . . . . . . . . . . 1
    2. The Worsening of the Economic Situation of the
       Central Powers and the Submarine Trade
       War . . . . . . . . . . . . . . . . . . . . . . . . . 12
    3. The Manpower Replacement and Ammunition Situation until
       the End of 1915 . . . . . . . . . . . . . . . . . . . . . 18
II. The Deployment and First Battles on the Italian
    Front . . . . . . . . . . . . . . . . . . . . . . . . 25
III. The Western Front from Mid-April to Early
    August 1915 . . . . . . . . . . . . . . . . . . . . . . . . 34
    1. The Battles until the Beginning of the Spring
       Offensive i

In [232]:
async def make_gpt_request_slash_n(text) -> dict:
    """POST one paragraph-fixing request to the OpenAI chat-completions endpoint.

    Args:
        text: Page text; wrapped into a payload by ``construct_payload_for_gpt_slash_n``.

    Returns:
        The decoded JSON body of a successful response.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP status >= 400.
            Previously the error body was returned as if it were a result, and the
            failure only showed up later as a missing ``'choices'`` key.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai.api_key}"
    }

    # A fresh session per request; both the session and the response are closed by the
    # context managers even if the request fails.
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://api.openai.com/v1/chat/completions",
            json=construct_payload_for_gpt_slash_n(text),
            headers=headers
        ) as response:
            # Fail here, inside the task, so the caller can log the page as failed.
            response.raise_for_status()
            return await response.json()


In [None]:
@log_execution_time
async def main_slash_n(model_name="", semaphore_count=10):
    """Send every page of the notebook-level ``problematic`` dict to the paragraph fixer.

    Args:
        model_name: Accepted but not used anywhere in this function.
        semaphore_count: Maximum number of requests in flight at the same time.

    Returns:
        A list of ``(pageno, response)`` tuples in completion order. Pages whose request
        raised are logged and left out.
    """
    limiter = asyncio.Semaphore(semaphore_count)

    async def wrapper_process_page(problematic, pageno: str) -> Tuple[str, Dict]:
        # Hold a semaphore slot for the duration of the request.
        async with limiter:
            return pageno, await make_gpt_request_slash_n(problematic[pageno])

    print(f'Composing tasks to be executed with asyncio, semaphore_count = {semaphore_count}')
    tasks = []
    for pageno in problematic:
        print(f'{pageno} ', end='')
        tasks.append(wrapper_process_page(problematic, pageno))
    print(f"main: len(tasks): {len(tasks)} -- Processing tasks as they complete")

    responses = []
    for i, finished in enumerate(asyncio.as_completed(tasks)):
        try:
            pageno, response = await finished
        except Exception as e:
            logger.error(f"{i} of {len(tasks)-1} - Error processing a task: {e}.")
        else:
            responses.append((pageno, response))
    return responses

# Run the async code
# Collects (pageno, response) tuples for all pages in `problematic`.
# NOTE: `model_name` is accepted by `main_slash_n` but not used there; the model is fixed
# inside `construct_payload_for_gpt_slash_n`.
responses = await main_slash_n(model_name="gpt-4o-2024-08-06")


Composing tasks to be executed with asyncio, semaphore_count = 10
004 005 006 007 008 009 010 011 013 014 020 022 025 029 032 033 035 038 044 048 052 060 073 082 083 086 088 089 091 095 097 098 106 116 123 126 128 131 132 135 143 147 152 155 158 159 162 174 176 178 192 193 203 210 218 220 221 224 225 226 237 239 240 242 248 262 268 274 275 282 286 293 295 323 330 349 353 354 357 359 367 375 380 385 394 398 399 402 403 407 419 423 425 434 438 444 449 454 457 458 469 470 471 472 473 476 477 483 484 486 490 493 503 505 509 517 521 525 531 536 542 547 573 579 585 587 625 627 639 641 644 645 646 647 648 649 650 652 653 654 655 657 658 659 660 661 662 663 664 665 666 667 668 669 671 672 673 674 675 676 677 main: len(tasks): 161 -- Processing tasks as they complete


In [16]:
# Pages for which the paragraph fix must NOT be applied, per volume.
# `{''}` is a set holding only the empty string, which matches no real page id, so
# nothing is excluded for these volumes.
# NOTE(review): this rebinds `good_pagenos`, replacing the PART 1/2 whitelist of the same
# name. Re-running the bad-page cells after this one would use these values, and volumes
# not listed here raise KeyError on lookup.
good_pagenos = {}
good_pagenos['Der Weltkrieg v8'] = {''}
good_pagenos['Der Weltkrieg v7'] = {''}
good_pagenos['Der Weltkrieg v2'] = {''}


In [17]:
from html import escape
from IPython.display import display, HTML

# Start from a shallow copy of the defragmented texts; only the pages that the
# model actually rewrote are overwritten in the loop below.
english_texts_fixed_paragraphs = dict(english_texts_defragmented)

# Sort on the page number alone so a tie can never fall through to comparing
# the response dicts (which would raise TypeError).
for pageno, response in sorted(responses, key=lambda item: item[0]):
    if pageno in good_pagenos[foldername]:
        continue
    result = response['choices'][0]['message']['content']
    # Keep only the payload between the <output_text> tags when they are present;
    # otherwise the raw message content is used as-is.
    match = re.search(r'<output_text>(.*?)</output_text>', result, re.DOTALL)
    if match:
        result = match.group(1).strip()  # Extract content and strip any leading/trailing whitespace

    print('---' * 22)
    print(f"pageno: {pageno}")
    # A short reply containing both marker phrases is treated as "nothing to fix".
    if (len(result) < 40 and 'The input text is' in result and 'correct' in result):
        print(f'result: {result}')
    else:
        print(f'\n{english_texts_defragmented[pageno]}')
        print('result: ')
        # Escape before embedding in HTML: otherwise any '<...>' in the text is
        # parsed as markup by the browser instead of being shown.
        # NOTE(review): presumably the rewritten text carries the same
        # <pageno>/<header>/<body> tags as the input page -- confirm.
        display(HTML(f'<b style="color:blue;">{escape(result)}</b>'))
        # The unescaped text is what gets stored.
        english_texts_fixed_paragraphs[pageno] = result


------------------------------------------------------------------
pageno: 005
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 006
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 007
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 008
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 013


<pageno>6</pageno>
<header>The Agreements with Austria-Hungary.</header>
<body>that Serbia will behave calmly seems to me quite likely." However, if "despite everything" a great war comes,
Germany would be forced to first strike against France,
against which General v. Moltke, like Count Schlieffen, considered a
quick decision possible. He stated that through such a
decision, effective assistance would ultimately be provide

------------------------------------------------------------------
pageno: 014

to cooperate, then "purely spatially theoretically taken" could even 
be in question, to move back the Austro-Hungarian deployment under occupation 
of the Carpathian passes to the line Bartfeld—Neu-Sandez—Bochnia 
(35 km east of Krakow) — that meant far into West Galicia. However, there are great concerns about this. On the other hand, a deployment 
in the line Rudki (40 km southwest of Lemberg)—San estuary might be advantageous, 
to proceed from there to the offensive as soon as the Russian 
forces have reached the area of impact. General v. Conrad ties 
the question of the strength and behavior of the German forces remaining in the east 
in a two-front war, and whether they would be able to "bind 19½ Russian divisions." 
Furthermore, he asked about the strength, arrival time, and use of the reinforcements 
expected later from the western theater of war. 
General v. Moltke replied on February 24, 1909, th

------------------------------------------------------------------
pageno: 019


<pageno>12</pageno>
<header>The Agreements with Austria-Hungary.</header>
<body>Forces carry more than ever a danger within them, and continued: "If 
Austria needs all its forces to carry out the fight against Russia, the same 
applies to Germany in the fight against France. I would therefore advocate 
deploying our troops prepared in the East in the West as well, if 
consideration for Austria did not prevent me from doing so. In the 
resolution of the conflict between Germany and France lies, in my 
conviction, the focal point of the entire European war, and" (as stated in 
the Schlieffen memorandum) "the fate of Austria will not be decided on 
the Bug, but on the Seine."
Furthermore, General v. Moltke sought to accommodate the wishes of 
General v. Conrad as much as possible by expressing the hope that it might 
very soon be possible, after the opening of hostilities in the West, to deploy 
additional se

------------------------------------------------------------------
pageno: 020


<pageno>13</pageno>
<header>Moltke's View on the Role of Austria-Hungary.</header>
<body>
The command against Russia, which might become desirable or even necessary,
was dispensable for the first weeks of the war. The extremely delicate
and hardly satisfactorily solvable question of the supreme command in the East
was therefore not addressed in the peace negotiations between the two chiefs of staff.
As far as the position of the allied powers towards each other
seemed possible, the cooperation of the German and Austro-Hungarian
forces for the case of war was regulated. That nevertheless
differences of opinion remained in the basic views
is not surprising given the nature of the alliance relationship:
Thus, General v. Conrad never completely abandoned the idea
of possibly waging a war in the Balkans, regardless
of the considerable weakening of the forces available against Russia
and the resulting additional

------------------------------------------------------------------
pageno: 023


<pageno>16</pageno>
<header>The Armaments and Force Ratios until Summer 1914.</header>
<body>raises several points about the war readiness of the other great powers including Germany, namely: through the elimination of the military weakness period in winter due to retention of the oldest year group until the completed training of recruits, — through frequent practical testing of all mobilization measures with the help of trial and test mobilizations, — through the possibility of extraordinary acceleration of mobilization with the help of the "war preparation period". Based on intelligence about measures during the Balkan Wars of 1912/13, the memorandum further explained that this "war preparation period" allowed, in times of political tension, "to assign supplementary personnel and horses to the troops before the start of actual mobilization" and to prepare these units to such an extent that they could ult

------------------------------------------------------------------
pageno: 028


<pageno>21</pageno>
<header>The Weakness of the Central Powers.</header>
<body>promised power, but failed. To what extent this was clear in Vienna 
is uncertain. General v. Conrad, in any case, after the German 
Kaisermanoeuvres in 1913, reported to his Emperor that the Austro- 
Hungarian troop leadership and training were superior to the German in 
most respects and was still of the opinion at the beginning of the war 
that the peacetime training of the Austro-Hungarian army had given it a 
"tactical skill" from which it was hoped that it would form an element of 
superiority over the more cumbersome Russian masses"). The department 
of the German General Staff responsible for dealing with Austria-Hungary 
had written in a memorandum from 1913, the last before the war: 
"The numerical strength, the intensity of training, the organization 
and partly also the armament of the Austro-Hungarian army 
still le

------------------------------------------------------------------
pageno: 036


<pageno>29</pageno>
<header>July 30th. — Austria-Hungary's General Mobilization.</header>
<body>reported. There was thus no longer any doubt about Russia's intentions.
Therefore, the German Emperor replied at 4 p.m.
to Emperor Franz Joseph, saying it was "of utmost importance that Austria
deploys its main forces against Russia and does not become fragmented by simultaneous
offensive against Serbia. This is all the more important as a
large part of my army will be tied up by France. Serbia
plays a completely negligible role in the giant struggle in which we enter shoulder to shoulder,
requiring only the most necessary defensive
measures. Success in the war and thus the survival
of our monarchies can only be hoped for if we both face the new
powerful opponents with full force" ...
it was believed to be a sudden change of mind by Germany,
which could not be explained, as nothing was yet known in Vienna
about 

------------------------------------------------------------------
pageno: 038


<pageno>31</pageno>
<header>August 1st. — The German Mobilization.</header>
<body>
was on July 31st at noon; and now, according to the existing 
preparations for clarifying France's stance, at least 
24 hours had to pass before the mobilization itself could follow. 
Thus, Emperor Wilhelm II ordered the general mobilization 
for the army and navy only on August 1st at 5 p.m. Considering 
the advanced time of day, only August 2nd could now be designated as 
the "First Day of Mobilization."
Thus, the German mobilization began two days later 
than the Russian general mobilization. Adding to this, as 
mentioned, in Russia for the summer of 1914, extensive 
conscription of training troops (almost 90,000 men for six 
weeks) was already planned, that since July 27th a ban on horse 
requisitioning existed, initially targeting riding horses, but 
on the 28th was extended to all horses, and that since 
July 26th from

------------------------------------------------------------------
pageno: 052


<pageno>46</pageno>
<header>Until the Beginning of Operations.</header>
<body>
For the 8th Army, initially only troops from the eastern provinces, including all those stationed east of the Vistula, were designated. These were 9 infantry and reserve divisions as well as Landwehr troops. 5 replacement divisions, which were also planned for the army, could only be ready on the 11th day of mobilization. By then, the remaining parts of the army would have completed their assembly in the staging area in western Poland.
The border defense was initially in the hands of the active border corps. As soon as these moved westward, they were later taken over by second-line troops. Under this protection, the following were to assemble:
The Landwehr Corps in Silesia and Posen, with one division each at the border opposite Tschenstochau and Kalisch,
the 6th Landwehr Brigade at Gnesen,
the 3rd Reserve Division at Hohensalza

------------------------------------------------------------------
pageno: 057


<pageno>51</pageno>
<header>The Expected Strength Ratios in East Prussia.</header>
<body>
To deploy 5 German replacement divisions against France. Thus, it was to be expected that the strength ratios in the battle for East Prussia would initially be particularly unfavorable. They would initially face each other:
Germans:
6 active infantry divisions,
3 reserve divisions,
1½ Landwehr divisions (3 mixed brigades),
2½ fortress troop divisions (Landwehr and replacement troops from the eastern fortresses excluding Breslau and Posen),
together at most
13 infantry divisions (less than half active),
1 cavalry division.
Russians:
19 active infantry divisions (including 2 rifle brigades),
2 reserve divisions,
together at least
21 infantry divisions (nine-tenths active),
about 10 cavalry divisions.
In a comparison of the combat strength on both sides, the active German infantry divisions could be considered roughly eq

------------------------------------------------------------------
pageno: 070


<pageno>64</pageno>
<header>The Decision to Attack the Njemen Army and the Battle at Stallupönen.</header>
<body>
The Chief of General Staff marched with the 1st Army on the Njemen, with the 2nd on the Narew. The High Command arrived on August 13 from Warsaw in Wolkowysk (railway junction south of Grodno). On August 14, according to the promise given to the French¹), the attack was to begin.
The 1st Army (referred to by the Germans as the Njemen or Wilna Army) was under General of Cavalry v. Rennenkampf, previously Commander-in-Chief of the Wilna Military District, with Lieutenant General Mileant as Chief of Staff. It included:
3 Corps (III from Wilna, IV from Minsk, XX from Riga),
the 5th Rifle Brigade from Suwalki,
7 (Reserve) Infantry Divisions (53rd, 54th, 56th, 57th, 68th, 72nd, 73rd, all from the interior of the Reich)²);
in total:
13½ Infantry Divisions and
5½ Cavalry Divisions (1st and 2nd Guard Di

------------------------------------------------------------------
pageno: 075


<pageno>69</pageno>
<header>The Situation in East Prussia on the Morning of August 17.</header>
<body>the 1st Cavalry Division with parts of the 5th Rifle Brigade — perhaps only to fulfill the promise given to France at least in form — made their advance for violent reconnaissance on Marggrabowa. On August 16, the cavalry of the northern wing crossed the border and occupied Schillehnen. The infantry reached from the assembly area at the Njemen with their beginnings directly to the border, which they were to cross the following day. Thus, on the evening of August 16, they were ready at the East Prussian eastern border:
For the advance north of the Rominten Heath:
A cavalry corps (1st and 2nd Guard Cavalry Division, as well as 2nd and 3rd Cavalry Division) under Lieutenant General Chan Hüsein Nachtschwarz at Schillehnen, followed by the 1st independent Cavalry Brigade,
XX Corps east of Schirwindt,
III Corps 

------------------------------------------------------------------
pageno: 084


<pageno>81</pageno>
<header>August 18th. — Intentions and Measures of the German Army High Command.</header>
<body>made it impossible. The circumstances increasingly urged rapid 
action. To what extent this shift in the situation was realized by the High Command 
at that time cannot be determined. Its decisions, as far as known so far, were initially not influenced. 
In the thought that the enemy might immediately pursue the I Army Corps, 
Generaloberst v. Prittwitz had ordered the other corps on the evening of August 17 
to be ready to attack as soon as the XVII Army Corps was fully assembled. When 
the enemy did not initially follow, this idea was set aside again. 
They wanted to let the Russians continue to approach the Angerap. 
However, Generaloberst v. Prittwitz kept the I Army Corps 
at Gumbinnen and northwards. The 1st Cavalry Division and 
the 2nd Landwehr Brigade, positioned behind the Inster at 

------------------------------------------------------------------
pageno: 085


<pageno>82</pageno>
<header>The Battle of Gumbinnen.</header>
<body>Cavalry advanced closer to the border. These reports 
made a rapid decision against the Njemen Army appear even more 
necessary than before.
On the eastern front, the enemy had continued his advance, 
on the northern flank only hesitantly due to the previous day's battle. 
His foremost parts were established in the evening along the line Lyck–Goldap– 
Grünhof (9 km west of Stallupönen). The army high command now 
expected a total of five Russian corps: the II south of Marggrabowa, — 
then an unknown one that had advanced over Filipowo, — the IV near 
the Rominten Heath, — the III at Stallupönen, — the XX at and north of Schirwindt. 
In front of these corps, the Russian cavalry corps was assumed to be west of Pillkallen. 
Their outposts blocked the Gynmenis section.
On August 19, the German troops in the Angerapp position 
also expected the

------------------------------------------------------------------
pageno: 089


<pageno>86</pageno>
<header>The Battle of Gumbinnen.</header>
<body>
The impression of this first unfortunate clash with the enemy caused
the entire brigade to withdraw from the front to the west during the night.
It was out of action for the following days.
In the late afternoon, parts of the Russian 28th Infantry Division
advanced in dense waves against the German 1st Infantry Division.
They came nowhere closer than 600 meters to the German position.
Then their attack succumbed to the defensive fire of the East Prussians.
General of Infantry v. François, who had set up his command post 4 km
north of Gumbinnen at Lindentug, expected the continuation of the Russian
attack against his entire front on August 20. Ready for defense were now:
The main reserve Königsberg (11 battalions, 6 squadrons, 9 batteries) under
Lieutenant General v. Böck and in the over 11 km wide front from the Rominthe
west of Augstupön

------------------------------------------------------------------
pageno: 091


<pageno>88</pageno>
<header>The Battle of Gumbinnen.</header>
<body>informing that the south of the I. had been advancing since dawn. 
He recommended the attack direction towards Szirgupönen. The main reserve 
Königsberg, corresponding to the advance of this corps, concentrated its forces 
more to the north.
At 12 noon, General v. François again regulated the attack 
objectives of his divisions. He expected renewed enemy resistance 
at Kattenau, where the Russians had entrenched the day before. 
The main reserve Königsberg was also to participate in the attack, 
north of the Stallupöner road under right echelon. 
The 1st Infantry Division maintained the direction towards Kattenau, the 2nd 
was to envelop to the north.
Meanwhile, setbacks had occurred at the front: 
Immediately after the capture of Brakupönen by the 1st Infantry 
Division, the Russians counterattacked here and retook the 
place. The German 

------------------------------------------------------------------
pageno: 094


<pageno>91</pageno>
<header>August 20th. — The Attack of the XVII Army Corps.</header>
<body>retreated. Behind them, movements were observed at Warschlegen and Ribinnen,
which were also directed eastward, but upon further advance at the pivot point from Bulsdehnen to Grün-
weitschen, stronger resistance was encountered. The General Command was
reported at 7:30 in the morning that it had succeeded in preventing the enemy from continuing
their withdrawal. — The division could not know
that it now faced strong forces from the Russian 27th and 40th Infantry
Divisions, which had been alerted early in the morning and were now ready for defense from north of
Mattischkehmen to south of Sobehnen.
General v. Mackensen had observed the rapid preparation of his divisions
from the area east of Perballen and later from the area west of Grün-
weitschen. Up to the pivot point, the attack of the
German troops had advanced 

------------------------------------------------------------------
pageno: 101


<pageno>98</pageno>
<header>The Battle of Gumbinnen.</header>
<body>execute. The Quartermaster General, Major General Grünert, and the 
first General Staff officer, Lieutenant Colonel Hoffmann, had foreseen this 
decision. They believed that the battle of Gumbinnen on August 21 
could definitely be won and therefore had to be fought through; they 
pointed out that when retreating, one would no longer pass the Narew 
Army without a fight, as it had the much shorter route to the Vistula. 
One would be in a very different position if the Njemen Army had been 
defeated beforehand and thereby had freedom of movement.
Colonel General v. Prittwitz rejected these objections and 
remained in agreement with his Chief of Staff on the decision to retreat. 
He was further strengthened in his view by a somewhat later incoming 
aviation report about numerous enemy troops at Stallupönen and the 
march of a column from Sch

------------------------------------------------------------------
pageno: 108


<pageno>105</pageno>
<header>August 21st. — The Intervention of the Supreme Army Command.</header>
<body>was in close prospect in the West. However, on August 21st, there was no
thought of this. The effects of an evacuation of East Prussia were not
limited to this area. If the Russians advanced to the Vistula, as was to be
expected, they would tie down the German 8th Army there with minimal
forces and gain free rein against Posen and Silesia or against Austria-Hungary.
They could turn with overwhelming superiority against the allied army,
which was just about to launch an offensive east of the Vistula from Galicia.
The retreat of the 8th Army from East Prussia would have posed an almost
insoluble task for the Danube Monarchy and, in all likelihood, would have
resulted in the retreat of its army. Therefore, everything had to be done
to keep the 8th Army east of the Vistula.
There is no evidence that, beside

------------------------------------------------------------------
pageno: 111


<pageno>108</pageno>
<header>The Recall of Colonel General v. Prittwitz.</header>
<body>seeking contact with their own troops, but had found the way to the west
still free and now even brought back 500 prisoners. — On August 22,
the enemy in the south and east only began late and did not press
anywhere. The wing of the Narew Army did not extend beyond Mlawa
to the west. Thus, the idea had increasingly taken shape at the German
Army High Command to assemble the army on the right wing, to make
an offensive thrust from the Thorn—Allenstein line. In addition, the
proposal of the Supreme Army Command to tackle the eastern wing of
the Narew Army was now being seriously considered. Perhaps the
Narew Army could be enveloped on both sides! This idea was in the air
due to the similar training of all German senior leaders and general staff
officers. So, Colonel General v. Prittwitz also expressed the intention in
a r

------------------------------------------------------------------
pageno: 114


<pageno>112</pageno>
<header>The First Measures of the New High Command.</header>
<body>which directly faced the advancing Narew Army. It had already been 
heard on the evening of August 21¹) and again the next morning. 
General of Artillery v. Scholz and his Chief of Staff, Colonel Hell, 
continued to hope that from the defensive position at Gilgenburg they 
could find an opportunity to attack: "General situation is perceived as 
favorable; troops eager to engage the enemy." From this report and 
further messages given by Colonel Hell over the telephone, the Supreme 
Army Command inferred with joy and reassurance that a consistently 
confident mood prevailed at the XX Army Corps, in contrast to the 
Army High Command. 
And it was the same with the other corps: 
The commanding general of the I Reserve Corps, Lieutenant 
General v. Below, felt victorious "over superior forces, certainly much 
more artillery

------------------------------------------------------------------
pageno: 125


<pageno>123</pageno>
<header>The Russian Narew Army from August 19–22.</header>
<body>Hold the Guard Corps. In the following days, all 
troops near and west of Warsaw were subordinated to it. Among them were two 
infantry regiments from the I Corps, the 5th and the Caucasian 
Cavalry Division already present, the Guard Cossack Brigade from 
Petersburg and a Turkestan Cossack Brigade on the move, 
as well as the garrisons for Warsaw, 59th, and 77th, and Novo- 
georgiewsk, 79th Reserve Division. While the 2nd Army advanced to attack 
northward, these 5 1/2 infantry divisions and 
3 1/2 cavalry divisions were assigned a secondary task: the commanding 
general of the Guard Corps, General Bespajrow, was to 
cover between Warsaw and its Vistula crossings, clear the land up to the 
line Plock—Lodz—Petrikau—Piliza course from the enemy and 
prepare the later attack in the direction of Bromberg—Posen. 
While the Gu

------------------------------------------------------------------
pageno: 129


<pageno>128</pageno>
<header>The First Measures of the New High Command.</header>
<body>
The lake again, while he remained inactive further west. — 
When cannon thunder was heard again from 3:30 in the morning at the General Command, 
General v. Scholtz began to doubt whether the withdrawal 
had succeeded. However, he now had the 3rd Reserve Division advance on his own responsibility 
towards Hohenstein to have it ready for an attack. Meanwhile, the 37th Infantry Division 
could be withdrawn from the enemy undisturbed. The Russians only encountered 
parts of the 75th Infantry Brigade at Frankenau, which could escape them 
under the protection of their artillery. By 8 a.m., 
the withdrawing German security forces were in front. He also could no longer 
endanger the withdrawal of the 37th Infantry Division, but 
the threatening encirclement became increasingly noticeable. By 10 a.m., it was 
known that a Rus

------------------------------------------------------------------
pageno: 130


<pageno>129</pageno>
<header>August 24th. — Results of the Battle at Lahna and Orlau.</header>
<body>The 8th Infantry Division had left 100 prisoners and
1 flag¹) in the hands of the Vorcfschen Jäger at Orlau and had retreated
five kilometers to Grünfließ during the night; we do not know their bloody losses.
The 2nd Brigade of the 6th Infantry Division had left 2900 men dead and wounded at
Frantenau. The Russians estimated their total losses at 4000 men²). — Thus, the
battle of Lahna and Orlau represents a German success.
The troops of the XX Army Corps, which here for the first time seriously clashed
with the enemy, proved themselves against double superiority.</body>
<footer>¹) Flag of the regiment v. Diebitsch No. 29, in view of which on December 30,
1812, the treaty of Tauroggen was concluded between the Prussian General v. Yorck and the Russian General v. Diebitsch.
²) Knox p. 64. — Sischwitsch in Sbo

------------------------------------------------------------------
pageno: 132


<pageno>131</pageno>
<header>The Situation of the German Western Group on August 24.</header>
<body>
until the intervention of the I Army Corps on August 26. At the request
of the XX Army Corps, the foremost regiment of the I Army Corps
(Grenadier Regiment 1) that had arrived in Löbau was already made
available to him.
At the army headquarters in Riesenburg, the following picture of the
situation was obtained by the evening of August 24: The 5th Landwehr
Brigade had advanced from Strasburg to halfway to Lautenburg.
In its southern flank, enemy cavalry was reported at several points near
Rypin and south to the Vistula, partly moving north, and strong Russian
cavalry had also advanced directly in front of the brigade's front over
Gronau. The Landwehr was expected to deal with it during further
marching. Therefore, one could not place too much hope on the
intervention of the Landwehr Brigade in the expected b

------------------------------------------------------------------
pageno: 143


<pageno>143</pageno>
<header>The Attack Order to the Western Group for August 26.</header>
<body>
I. Army Corps by attacking its right wing towards Groß-
Grieben—Jankowitz. It is otherwise ready to transition to an attack along the entire
front with a strong right wing. — The
3rd Reserve Division is to be moved back to the area of Hohen-
stein in time.“
3. The Operations of the Russian Northwestern Front up to
August 26.
(Map 5 and Sketch 6, p. 132, and 7, p. 150.)
The commander-in-chief of the Russian Northwestern Front1), General
Schilinski, had not approved the turning of the Narew Army in a northwestern
direction until now and even on August 23 rejected the proposal of
General Samsonov to lead the army against Allenstein—Osterode instead of
Rastenburg—Seeburg2). On this day, however, he finally received the report that the Angerapp position
had been abandoned by the Germans. The simultaneous flight of 

------------------------------------------------------------------
pageno: 149


<pageno>149</pageno>
<header>August 26th — The Attack of the I Army Corps.</header>
<body>1145 received. He had then ordered that the 1st Infantry Division should
take possession of "the heights northwest of Seeben" at 4 a.m. and
attack Usdau further at 10 a.m., the 2nd Infantry Division should
advance at 7 a.m. from Kielpin via Groß-Roschlau to Groß-Tauersee, the
5th Landwehr Brigade at the same time from Lautenburg via Heinrichs-
dorf to the area north of Borchersdorf. These orders
corresponded to the army order, but given the
time found at the 1st Infantry Division, they were only feasible
if everything was prepared to ensure the timely deployment of the troops
for the early attack. Nothing had happened here.
General v. François was still of the opinion that his army
corps was not yet in a position to attack on the morning of August 26th.
The damage that a delay could cause seemed
insignificant to him c

------------------------------------------------------------------
pageno: 151


<pageno>152</pageno>
<header>The Victory over the Flanks of the Narew Army.</header>
<body>
Heights west of Meischlis, with their northern flank reaching the area south of Groß-
Grieben. Their Grenadier Regiment I had, after an advance
it had made from the north towards this place, returned to Wansen.
The enemy was not facing the troops of the German I Army Corps with advanced detachments, but its main
forces seemed to be entrenched in the line Groß-Tauersee—Usdau.
At the General Command of the XX Army Corps, it was already
clear on the night of August 26, after the telephone conversation with the
I Army Corps, that the attack there would not begin as early
as ordered by the Army High Command. Therefore, an
order to advance the right wing on Groß-Grieben—
Jankowitz was initially not given. They waited.
Thus, the Army High Command had to come to terms with the delay
that had occurred at the I Army Corps.
Th

------------------------------------------------------------------
pageno: 152


<pageno>153</pageno>
<header>August 26th. — The Attack of the XX Army Corps.</header>
<body>Sees the advance of strong Russian forces against the positions of the 
37th Infantry Division became noticeable. Since noon, Russian artillery 
had been shelling the German trenches occupied by the Landwehr near Mühlen. 
Further north and at Hohenstein, no enemy had been reported so far. 
Thus, the 3rd Reserve Division received the order, according to the army command 
of the evening of August 25th, to advance towards Hohenstein over the Orzewn section. Otherwise, for the 
attack of the XX Army Corps, only its southern wing was initially considered. 
General v. Scholz ordered the 41st Infantry Division to attack the line 
Ganshorn—Groß-Gardienen, while the 37th was only to 
participate in the attack with its southern, 75th Infantry Brigade. When 
this order was given to the divisions at 2:45 p.m., 
General v. Schol

------------------------------------------------------------------
pageno: 159


<pageno>160</pageno>
<header>The Victory over the Flanks of the Narew Army.</header>
<body>
Division and 1st Rifle Brigade), launched the attack. He presumably 
carried out the order given to him as early as August 26th1). The Russian 
thrust hit the German 5th Landwehr Brigade and the southern flank of 
the 2nd Infantry Division. Due to the first erroneous report about the 
capture of Usdau, the Landwehr Brigade had already been deployed early 
in the morning at Borchersdorf by the General Command. At the same 
time, the 2nd Infantry Division had received orders not to wait any longer 
but to accelerate their attack. Thus, the German attack collided with the 
Russian one and was partially hit in the flank. This led to a particularly 
fierce battle on the southern flank of the German I Army Corps.
The 5th Landwehr Brigade under Lieutenant General v. Müllmann 
had set out with the two Pomeranian Landwehr Re

------------------------------------------------------------------
pageno: 171


<pageno>173</pageno>
<header>August 26th. — The Victory of the Eastern Group at Groß-Bössiau.</header>
<body>Russian division standing. Opposite the German 36th Infantry Division, 
the enemy continued to move further east and thereby temporarily put 
their left flank in a difficult position. General staff and division had the 
impression of facing the entire Russian VI Corps. However, the German 
35th Infantry Division had not yet arrived by noon. They had to rest 
on the way. When they finally reached Groß-Koellen with the vanguard 
after a 25 km march, they were so exhausted that they had to rest again.
Meanwhile, the I Reserve Corps, which initially had no enemy in front 
of it, had only completed its deployment west of the Seenette in the line 
Alt-Bierzighufen—Kirschdorf by 12:30 p.m. In agreement with General 
v. Mackensen, General v. Below now wanted to continue the march south 
with the mass of his

------------------------------------------------------------------
pageno: 183


<pageno>185</pageno>
<header>The Attack of the 41st Infantry Division.</header>
<body>at least three Russian divisions in front of his line. 
Further south, it suspected the enemy identified at Bujaken 
to be the Russian XXIII Corps. However, this corps was considered 
so heavily beaten that it was not expected to have significant offensive power. 
To align with the intentions of the Army High Command, 
the 41st Infantry Division had received orders to 
advance around Lake Mühlen towards Paulsgut, 
to get behind the Russians, reaching the line 
Lutten—Ganshorn (east of Mühlen) by 4 a.m. The division was to 
“avoid the threat from Bujaken by marching in the 
darkness” and “deploy a strong rearguard with artillery.” 
As soon as its attack became noticeable at Paulsgut, 
the troops deployed at the Drewenz front were to 
launch an attack. They were all subordinated to the commander 
of the 3rd Reserve Division

------------------------------------------------------------------
pageno: 191


<pageno>193</pageno>
<header>The Capture of Soldau by the I Army Corps.</header>
<body>
v. Schmettau, which had been halted somewhat earlier, and the 
1st Infantry Division were set in motion towards Neidenburg.
Meanwhile, further unfavorable reports had been received at the General Command of the XX Army Corps 
and at the Army High Command regarding the situation with the 41st Infantry Division1). It seemed 
that the division could not hold even at Wronow (between Kontin and 
Thymau Lake), where it had initially retreated. 
It was also expected that the Russian pressure in the south would increase 
if the enemy — as was quite possible — was already fighting for its retreat. 
Everything had to be done to close off the northern escape route by the large lakes again. Therefore, 
Generaloberst v. Hindenburg ordered the I Army Corps at noon to 
support the 41st Infantry Division, which was retreating from Wron

------------------------------------------------------------------
pageno: 200


<pageno>202</pageno>
<header>The Deployment of the Russian Center on August 28.</header>
<body>to position the 36th north of it over Darethen. However, it became very
questionable whether it would still be possible to catch the Allenstein enemy
on the march in the flank. Lieutenant General v. Below
wanted to reach Grieslienen by evening. It did not succeed, as
Russian resistance and poor roads caused the 1st Reserve Division
unexpected delays. Thus, Lieutenant General v. Förster
reached the main road at Darethen only at nightfall. Here, new
nighttime battles unfolded. Only after midnight did the troops, rifles
in hand, find rest. North of the 1st lay the 36th Reserve Division, parts
of which had cleared Allenstein of the last Russians. However, the order
intercepted by Captain Bartenwerfer, that the corps should attack the
enemy marching on Hohenstein "today," finally
reached the General Command only at 8:

------------------------------------------------------------------
pageno: 201


<pageno>203</pageno>
<header>The Situation in the Evening.</header>
<body>resumed. Russian radio messages, according to which the southernmost 
corps of this army, the Russian II Corps, was to begin the retreat to the 
border to be transported by rail, seemed incredible under these circumstances. 
A report from the German 1st Cavalry Division, stating that it was at Rissel 
and had no enemy in front of it in the east, as well as the further content of 
the report from the governor of Königsberg, that the enemy had so far only 
crossed the Alle with cavalry, could not dispel the impression that Rennenkampf 
had now recognized the situation and would rush to the aid of the Narew Army 
with all its forces. To the German leadership, this seemed so self-evident that 
there had been no explanation for General v. Rennenkampf's behavior for days. 
Therefore, his imminent intervention in the battle was now expected

------------------------------------------------------------------
pageno: 211

accordingly, to gather the troops of the 37th Infantry Division. 
From the I Reserve Corps, which received the order to withdraw troops 
only at 5 p.m. personally from General Ludendorff, 
initially only one brigade could be provided. The 
bulk of the corps had continued the pursuit behind the German 6th Reserve Infantry Brigade 
(from the 3rd Reserve Division) over Schlagamühle, while the detachment advancing east of Lake Lanskter 
had attacked Russian columns and trains early in the morning, but then remained exhausted 8 km northeast of 
Kurten. It no longer disturbed the Russian retreat. 
On the evening of August 29, the Landwehr Division Goltz was at 
Hohenstein, the 37th Infantry Division at Grieslienen, from the I Reserve 
Corps 1 1/2 divisions were at Schwedrich and Schlagamühle, 1/2 division 
with the General Command at Allenstein. 
At the XX Army Corps, General v. Scholtz had ordered the 3rd Reserv

------------------------------------------------------------------
pageno: 213


<pageno>215</pageno>
<header>The I and XVII Army Corps on August 29.</header>
<body>The Uhlan Regiment Count zu Dohna (East Prussian) No. 8, under its 
commander Lieutenant Colonel Freiherr Schäffer v. Bernstein, together 
with parts of the Dragoon Regiment 10 and the Field Artillery Regiment 1, 
advancing south of the main road, captured Russian trains and made 
5000 prisoners. The regiment remained in Groß-Pantheim. Furthermore, 
Lieutenant Colonel Berring with the Mounted Jäger Regiment 
No. 10 (from the 2nd Infantry Division) had already reached near 
Willenberg by 3 p.m., after overtaking the troop detachment of the 
XX Army Corps under Lieutenant General v. Schemtau. 
After a morning break of only thirty years, they set off again from Muschaken 
and reached Willenberg by 7 p.m. Since the departure 
from the bivouac on the morning of August 28, his troops, apart 
from the development into battle again

------------------------------------------------------------------
pageno: 226


<pageno>228</pageno>
<header>The Conclusion of the Battle, August 29 to 31.</header>
<body>
troops were held everywhere. A final push by the Russian 1st Cavalry
Division against Allenstein was repelled early on August 31 by parts of the
6th Landwehr Brigade. Then the Russian cavalry masses gradually withdrew
eastward, fighting and burning, and also destroyed all telegraph lines and
important railway facilities.
4. The Capture of the Encircled Russians and the
Result of the Victory.
(Map 11.)
While parts of the German I and XVII Army Corps repelled Russian
relief attempts at Neidenburg and Ortelsburg, the fate of the encircled
Russians was sealed in the large forest area between the two locations. The
ring that surrounded them was thin. Along the 50 km stretch from Muschaken
via Willenberg to Jedwabno, only 29 German battalions, weakened by previous
battles, stood in the confusing terrain. A determined and 

------------------------------------------------------------------
pageno: 234


<pageno>236</pageno>
<header>The Conclusion of the Battle, August 29 to 31.</header>
<body>the XXIII Corps heavy fighting at Hohenstein—Neidenburg; where 
these troops were on the evening of August 29 is unknown. 
The I Corps is in the area of Mlawa, the VI near Ortelsburg, parts 
of the XXIII near Prasnycz, a small detachment in Chorzele. On 
August 29 at 3 p.m., the enemy cavalry occupied Jägersdorf 
and Janow, thereby interrupting the connection of General 
Samsonow with Chorzele. The army group has ordered that the 
I Corps, the VI, and parts of the XXIII advance against Willenberg—Neidenburg 
to assist General Samsonow and cover his flank and 
rear. General v. Rennenkampf has set a reconnaissance 
advance of the cavalry in the area of Allenstein—Passenheim 
to clarify the situation and assist General Samsonow." 
At the same time, the commanding generals of the VI and 
XXIII Corps and the newly appoint

------------------------------------------------------------------
pageno: 240


<pageno>242</pageno>
<header>Reflections on the Battle.</header>
<body>The idea of the double envelopment arose naturally from the situation 
the new leaders encountered on August 23. Whether this idea was 
feasible depended on the Russian Njemen Army and, since the corps of 
the German Eastern Group marched south, on the outcome of their 
battle against the local enemy. Thus, the Army High Command could 
initially only strive for the destruction of Samsonov's army through an 
attack from the west. However, time and resources were insufficient to 
use the most effective means for this, the envelopment with a strong 
southern wing, possibly via Soldau. Therefore, General Major Ludendorff 
proposed the attack against the front at Usdau and further towards 
Neidenburg. The battle began with a "breakthrough"; the attack on 
Usdau was also intended to simultaneously cut off the Russian center 
and northern wing

------------------------------------------------------------------
pageno: 241


<pageno>245</pageno>
<header>The Magnitude of the Victory.</header>
<body>defeated a superior enemy1), while at the same time both
flanks were threatened by further superior forces. Military
history has no example of a similar achievement to
show — at Cannae, the threat from the rear was missing.
The magnitude of the success is best characterized by
a comparison of the mutual losses and the extent of the
booty. The total losses amounted, including the battle at
Gumbinnen and Orlau on August 23, as far as they have been determined so far:
on the German side, a total of about 12,000 men, on the Russian side, one must
reckon with more than ten times that number.
While on the German side only about 7% of the strength of the
fighting troops were lost, the losses on the Russian side rose to
about 75%. In addition, about 350 guns fell as booty, as well as an
abundant, but no longer ascertainable number of horses,

------------------------------------------------------------------
pageno: 242


<pageno>244</pageno>
<header>* Reflections on the Battle.</header>
<body>from leaders and troops, and thus every spirit of enterprise on the Russian
side diminished more and more. And this condition was not limited to the defeated
Narew Army, but spread to all Russian
units that had German troops as opponents.
For East Prussia, the worst was initially averted. In
Germany, the victory dispelled the fear of the Russian "steamroller,"
which had weighed heavily on Berlin and
the eastern provinces since the retreat from Gumbinnen. In some places, however, there was also
a tendency to overestimate the significance of the incomparable victory. Its
immediate effect remained confined to the East Prussian theater of war.
Of the ten armies that the Russians had deployed, only one was
dealt with. The enormous numerical disparity in the East continued to exist,
and at the front of the allies, the situation had meanwhil

------------------------------------------------------------------
pageno: 244


<header>First Chapter.</header>
<header>The Galician Front and the Overall Situation in the East until Early September.</header>
<header>I. Austria-Hungary's Offensive between Bug and Vistula¹).</header>
<body>(Map 1 and Sketch 9, p. 267.)
The Austro-Hungarian Chief of General Staff assumed that the Russians would march against Bukovina and Galicia in a wide arc, from the Romanian border to the Vistula, but would initially leave the area west of this river free. In detail, he assumed the following distribution of forces according to the calculations made in peacetime²):
up to the Russian
20th Mobilization Day 30th Mobilization Day
(= August 19) (= August 29)
south of the Dniester . . . . . . 4 Inf. Div. 8 Inf. Div.
in Podolia near Proskurov . . . . . . 10 " 16 "
in Volhynia near Rovno . . . . . . 7 " 12 "
between Bug and Vistula . . . . . . 14 " 24 "
total . . . . . . 35 Inf. Div. 60 Inf. Div.
In contrast,

------------------------------------------------------------------
pageno: 247


<pageno>250</pageno>
<header>The Galician Front and the Overall Situation in the East until Early September.</header>
<body>
together: 870 battalions, 350 squadrons, and 350 batteries);
— additionally
1 cavalry division and
44 infantry battalions [this involved 2½ divisions of Landsturm²)] as a special army group near Krakow. This group was to cross the border on August 17 and advance shoulder to shoulder with the German Landwehr Corps against the Vistula north of the San estuary.
For the main forces to be assembled between Stanislau and the San estuary, it was stated: "From this assembly, the offensive will begin — depending on the deployment and behavior of the Russian forces — in an easterly direction (in the unlikely event that the mass of the Russians marched against the Galician eastern border) on the 20th, in a northerly direction on August 22 from the left flank. To support this offensive of the Au

------------------------------------------------------------------
pageno: 248


<pageno>251</pageno>
<header>August 1914. — The Demand for the German Offensive over Siedlce.</header>
<body>
in view of the expected strength ratios at the start of the war on the East Prussian front — not given at all1).
According to the German view, the forces in East Prussia had fulfilled their task towards the allies if, as General v. Conrad demanded in 1909, they prevented the Russian 1st and 2nd Armies from "turning further forces against the Austro-Hungarian Army." If it was possible to achieve more, then it was self-evident that it would be done. The instructions for the German 8th Army2) fully took this into account.
Emperor Franz Joseph had transferred the supreme command over all land forces3) and the fleet of Austria-Hungary to General of Infantry Archduke Friedrich of Austria. He was assisted by the Chief of General Staff, General of Infantry Baron Conrad v. Hötzendorf, as the actual leader o

------------------------------------------------------------------
pageno: 250


<pageno>253</pageno>
<header>August 20th — The Execution of the Deployment in Galicia.</header>
<body>its mass advanced to the area east and north of Lemberg to hold the area around this city. The 4th and 1st Armies, designated for the attack, were to reach the line Zlienyów (50 km northwest of Lemberg)–Tanev–and San course to the Vistula by August 21st, with the army group v. Kummer on the extreme left wing to reach the Vistula west of Krasnik as quickly as possible. The German Landwehr Corps followed to the left rear.
By August 20th, the armies had achieved the following strengths:
Army Group Kövess and 3rd Army:
4 Corps Commands,
13½ Infantry Divisions,
5 Landsturm Brigades,
1 March Brigade,
6 Cavalry Divisions.
4th and 1st Army and Army Group Kummer:
5 Corps Commands,
17 Infantry Divisions,
2 Landsturm Divisions,
4 Landsturm Brigades,
2 March Brigades,
4 Cavalry Divisions,
Polish Legion¹).
Occupation T

------------------------------------------------------------------
pageno: 258


<pageno>261</pageno>
<header>Early September. — The Request for German Support.</header>
<body>After receiving knowledge of East Prussia, it was hoped that after the
German victories in France and East Prussia, immediate support by German
troops in Galicia would now also be possible. Thus, General v. Conrad
telegraphed to the German Supreme Army Command on September 1:
Despite the success of the 1st and 4th Army, the situation is extremely
critical, as the remaining forces are "pushed back by a vastly superior
enemy, severely weakened in combat, still holding the northern wing of
Lemberg, and highly endangered by Russian advances along the Dniester.
He only expects two more divisions and therefore urgently requests a
decisive change in the situation with the deployment of fresh German
forces, if possible at least two army corps, direction Pischemysl."
A report received on September 2 from Captain v. Fleisc

------------------------------------------------------------------
pageno: 262


<pageno>265</pageno>
<header>End of August. — The Decision to Carry Out the Attack Against Galicia.</header>
<body>
To quickly complete East Prussia. Since August 26, the Supreme Command
had been considering the (already mentioned) idea of moving the bulk of the
1st Army by rail via Warsaw to the western Vistula bank. Even on August 28,
General Quartermaster Danilow responded in this sense to the objections of the
Northwest Front: "It is impossible to leave nine corps on the right Vistula bank
while the entire weight of further operations lies on the left bank." The news of
the unfavorable situation at the 2nd Army initially ended this idea. The Supreme
Command did not assume on August 29 either that a disaster of decisive
importance could occur there*).
At the same time, the difficult situation between the Vistula and Bug,
where the 4th and 5th Armies were pressed by a superior enemy, ruled out
further ho

------------------------------------------------------------------
pageno: 265


<pageno>269</pageno>
<header>Early September. — The German Forces.</header>
<body>At the beginning of September, the German 8th Army was positioned as follows:
35th Reserve Division (main reserve Thorn²), consisting of Landwehr), main reserve Graudenz³) (a brigade) and
70th Landwehr Brigade north of Mlawa and near Neidenburg,
I Army Corps and 3rd Reserve Division from Neidenburg to Willenberg,
XVII Army Corps around Ortelsburg,
XX Army Corps with one division each north of Neidenburg and near Allenstein,
Landwehr Division Goltz near Mühlen,
I Reserve Corps with assigned 6th Landwehr Brigade near Allenstein and Guttstadt,
1st Cavalry Division in front of the eastern front of the army, main reserve Königsberg (one division) and 2nd Landwehr Brigade in the Deim position.
Behind the front of the army, since September 2nd, the following were unloaded:
XI Army Corps near Allenstein and Osterode,
Guard Reserve Co

------------------------------------------------------------------
pageno: 277


<pageno>281</pageno>
<header>September 6–8. — The Advance of the German Northern Wing.</header>
<body>To tie up reserves there so that they could not be moved south against the German
attack wing.
Thus, the German corps were to close north of the lakes on September 7
in the line Rastenburg—Friedland. In the army order for
that day it said: "Should the enemy take the offensive, the
corps are to hold the line reached." For their own safety and to
tie up the Russian reserves, more troops for the northern
wing would have been urgently desired. Apart from the main reserve from Posen,
no significant reinforcements were available. The
army command considered bringing in parts from the East Prussian eastern front.
But everything there was in flux to the east, the flank
was getting longer every day, there was nothing free. The government of
Königsberg was ordered to push stronger forces than before for use south of

------------------------------------------------------------------
pageno: 279


<pageno>283</pageno>
<header>The Attack Order for September 9.</header>
<body>
Reconnaissance found the terrain up to the Narew free from the enemy.
According to agent reports, troops were supposed to have been transported
from Warsaw by rail to the east. However, the Russian Guard Corps,
previously suspected near Warsaw, had to be assumed to be in front of the
German Landwehr Corps in southern Poland according to a report. Thus,
no disturbance was expected from the Narew for the time being.
A threat to the German encircling wing from
the south and east, however, became increasingly apparent: reports of
the assembly of Russian troops at Schtschutschin and Grajewo were
available from aviators. There, it initially seemed to involve only weaker
forces, individual brigades or regiments. While the parts of the 1st and
3rd Finnish Rifle Brigade defeated at Bialla had retreated to the northeast,
aviators now obse

------------------------------------------------------------------
pageno: 295


<pageno>299</pageno>
<header>The State of the Pursuit on the Evening of September 11.</header>
<body>held up the Pregel for a long time, as all its bridge equipment had still been installed at 
Omet and Alle. The corps found no enemy north of the Pregel 
and advanced with its foremost parts halfway to 
Norditten—Insterburg. Further north, the main reserve Posen followed in echelon to the left rear, and the main reserve Königsberg had advanced over Labiau.
At the army headquarters, there was soon no doubt after the 
alarming report of the XI Army Corps that the enemy was only fighting for retreat. However, it was not 
possible to immediately reverse the ordered turning of the I and XVII Army Corps 
to the north. The commanding generals of both corps knew anyway that the pursuit 
in a northeasterly direction should be resumed as soon as the situation allowed. Since the XVII Army Corps was already pinned by t

------------------------------------------------------------------
pageno: 298

arrived late at night beyond Walterkehmen, where 1000 prisoners 
were taken, and beyond Trakehnen. — The XI Army Corps had 
kept its forward division close to the enemy and 
fought several times. During this, the commander of the 2nd Thuringian 
Infantry Regiment No. 32, Lieutenant Colonel Fischer, was killed. By noon 
Gumbinnen was reached and in the evening, with the 22nd Infantry Division at the 
beginning, the area north of Trakehnen was reached. — The I Reserve Corps, 
assigned to Pillkallen, caught up with the enemy in the afternoon north of 
Kattenau and east of Mallwischken, but was unable to break his resistance 
here on this day. — The Guard Reserve Corps 
found no more enemy in front of them. General v. Gallwitz therefore received 
new orders in the morning. The Army High Command ordered 
the cessation of the pursuit on this flank. Only the main 
reserve Königsberg was left advancing on Tilsit. T

------------------------------------------------------------------
pageno: 301


<pageno>305</pageno>
<header>September 9–16.</header>
<body>
Since there was no longer any doubt about the Njemen Army, they could do without half of the 35th Reserve Division at Königsberg. It was immediately sent back to the southern front at Soldau.
Meanwhile, the situation had already cleared up there: The enemy did advance to the border on September 10, but only crossed it at Myschliniez, and even there only with small cavalry detachments. Otherwise, they remained in place and entrenched. However, the German Army High Command ordered, despite the weakness of their own troops, on September 12, the Governor of Graudenz and the Deputy Commanding General of the XX Army Corps to attack the enemy located north of the Narew, to deliver a decisive blow. — To this end, General v. Saffron and General Count Schlieffen had their weak Landwehr troops advance to attack across the border from the Soldau—Willenberg l

------------------------------------------------------------------
pageno: 304


<pageno>308</pageno>
<header>The Battle of the Masurian Lakes.</header>
<body>the lagoon or from the sea as feared. Thus, behind the already heavily 
fortified Deime front, the XX Corps (28th and 29th Infantry Divisions) 
was also held in reserve. At Gilft, parts of the 68th Reserve Division 
were stationed. The High Command of the Northwest Front even wanted 
to have an entire infantry division and a cavalry brigade ready there. 
The 54th Reserve Division was at Insterburg¹), the 72nd at Darkehmen. 
There is no evidence that General v. Rennenkampf considered using these 
strong reserves for a decisive strike, as was deemed possible by the Germans. 
The army cavalry was gradually withdrawn behind the front. These were: 
½ 1.²) Guard Cavalry Division, the 2nd and 3rd Cavalry Divisions united 
under General Chan Hussein into a corps, and the 2nd Guard Cavalry 
Division. The 1st Cavalry Division was intended 

------------------------------------------------------------------
pageno: 313


<header>Third Chapter.</header>
<header>The Sufferings of East Prussia.</header>
<body>Vast areas of the core German province of East Prussia suffered greatly 
under the military events of the summer of 1914. The situation of the province 
meant that it was more exposed to enemy incursions than other parts of the country.
Even the peacetime expansion of the country's fortifications in the east showed 
that the responsible authorities of the Reich anticipated the possibility of a 
Russian advance up to the Vistula. The line of this river had been increasingly 
strengthened, especially in the years leading up to the war; the facilities in East 
Prussia itself had to take a back seat. This was particularly discussed in 1913 
when the use of the "defense contribution" funds was addressed and had led to 
concern in East Prussia at that time. General von Heeringen, as Minister of War, 
had tried to counteract th

------------------------------------------------------------------
pageno: 323


<pageno>328</pageno>
<body>the foremost Russian reconnaissance units often set fire to barns and 
straw stacks to indicate how far they had come. — 
The impact of the fighting itself claimed entire towns. Gerdauen 
(3000 inhabitants) and Hohenstein (2500 inhabitants) were almost completely 
burned down. Neidenburg, where allegedly residents participated in the fighting, 
was set on fire by the Russians. Soldau, Ortelsburg, 
and many other towns suffered more or less severely from the fighting. Since in rural areas all rooms and storages were immediately 
filled with hay and straw after the harvest, almost every 
artillery bombardment resulted in major fire damage. Numerous villages, 
especially the village of Mühlen on the battlefield of Tannenberg, and such 
in front of the eastern front of Lötzen were almost completely incinerated, a large number 
of estates and farms were destroyed. Nevertheless, the co

------------------------------------------------------------------
pageno: 331


<pageno>336</pageno>
<header>The Battle of Lemberg and the Evacuation of Eastern Galicia.</header>
<body>
to completely halt the western wing1), when the collapse of the Austro-
Hungarian 1st Army brought them an unexpected victory. The
great goal of encircling the armies of the Danube Monarchy on both sides
was not achieved. The Russian forces were unfavorably positioned for
an overwhelming pursuit and had to be redeployed. Only the cavalry
massed under General Nowikow west of the Vistula at Radom, consisting
of 3½ divisions with 3 infantry regiments, could have threatened the
retreat of the Austro-Hungarian army; but they held back.
The official Russian war report judges2) the overall course
of the operations so far: "In general, the sense of obligation
to our allies influenced all measures of our
supreme command to such an extent that it even sacrificed
essential prerequisites for the strategic success 

------------------------------------------------------------------
pageno: 338


<pageno>345</pageno>
<header>Moral Seriousness and Efficiency of the German Troops. — March Achievements.</header>
<body>5th Landwehr Brigade on August 26 and 27, the attack of the 6th Landwehr
Brigade on the 26th, the Landwehr Division Golz on the 28th, and the Landwehr
Corps at Tarnawka are pages of glory for the German Landwehr.
To reach the enemy, extraordinary marching achievements had to be demanded
from almost all units. After the first days, when the small number of less resilient
had fallen out, the marching losses stopped; the troops had become hardened like steel.
Thus, they often performed with the dedication of their last strength what was demanded,
just as they had practiced and learned so often in peacetime in hard training.
Often dead tired and barely supplied, the troops marched on dusty country roads or
on deep sandy paths, in the scorching midday heat of August or — as before the
Battle 

------------------------------------------------------------------
pageno: 348


<header>Appendix 1.</header>
<header>Military Formations.</header>
<header>Preliminary Note.</header>
<body>
The military formations reflect the composition of the armies for a specific point in time or period; earlier or later deviations in the composition are indicated. All parts added later are highlighted by special script.
The following abbreviations, which may not be immediately understandable, were used:
A.K. = Army Corps
Abt. = Division
Drag.R. = Dragoon Regiment
Erg./147 = Replacement Battalion Infantry Regiment No. 147
F.A.R. = Field Artillery Regiment
Füs.R. = Fusilier Regiment
G.R. = Guard Regiment
Geb.Brig. = Mountain Brigade
Gr.R. = Grenadier Regiment
Hus.R. = Hussar Regiment
Inf.R. = Infantry Regiment
Jäg.B. = Jäger Battalion
Jäg.R. z. Pf. = Mounted Jäger Regiment
K.R. = Cuirassier Regiment
Ldst.R. = Landsturm
Ldsch. = Landwehr
M.G.A. = Machine Gun Division
Pi.1 = Pioneer Battalion No. 1
1./

------------------------------------------------------------------
pageno: 349


<pageno>358</pageno>
<header>Order of Battle.</header>
<body>
The German 8th Army
on August 26, 1914, 1st day of the Battle of Tannenberg¹).
9 Inf. and Res. Div., 4 Div. Det. and Replacement Troops, 1 Cav. Div. = 158 Batl., 78 Squadrons
140 Batteries (= 774 Guns²).
Strengthened by early September to:
13 Inf. and Res. Div., 5½ Div. Landwehr and Replacement Troops, 2 Cav. Div. = 229 Batl., 119 Squadrons
219 Batteries (= 1194 Guns).
Additionally, Landwehr Corps with 2 Landwehr Div. = 34 Batl., 12 Squadrons, 12 Batteries
(= 72 Guns).
Army High Command 8.
Commander-in-Chief: Generaloberst v. Beneckendorff and v. Hindenburg (until 22. 8. General-
oberst v. Prittwitz and Gaffron)
Chief of Gen. Staff: Genmaj. Ludendorff (until 22. 8. Genmaj. Count v. Waldersee)
1st Gen. Staff Officer: Lt. Col. Hoffmann
Quartermaster General: Genmaj. Grünert
Gen. of Engineers: Genmaj. Resten.
Air Forces.
Field Flying Det. 16
Fortre

------------------------------------------------------------------
pageno: 350
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 351
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 352
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 353


<pageno>362</pageno>
<header>Military Formations.</header>
<body>
Fort Boyen (Lötzen).
4½ Battalions, 1 Squadron, 8 Batteries.
Commander: Colonel Busse
Infantry: Replacement/R.R. 147, IV./Ldw. 18 and 2½ Battalions Ldst.
Machine Guns: 11 Fortress Detachments and Troops
Cavalry: Replacement Squadron Drag. R. 11
Field Artillery: 2 Ldw. Batteries from XX. A.K.
Foot Artillery: 6 unassigned Batteries.
Pioneers: 1 Fortress Battalion Detachment.
Air Forces: 1 XX. Army Corps.
From the Army's Line of Communication Inspection (on 30th and 31st August
call

------------------------------------------------------------------
pageno: 354
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 355
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 356
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 357
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 358


<pageno>367</pageno>
<header>The Austro-Hungarian Army.</header>
<body>
2nd Army.
7 Inf. Div., 4 Lfst. Brig., 2 Cav. Div.
= 145 1/4 Batt., 67 3/4 Squadrons, 64 Batteries (= 368 Guns)
Part of the army formed the Kövess Army Group until August 23 (see below).
Army Commander: General of Cavalry v. Böhm-Ermolli ¹)
Chief of Gen. Staff: Genmaj. v. Meenfessy
IV Corps ¹): 31 ¹) and 32 ¹) Inf. Div., 4th March Brig.
VII Corps ¹): 17 

------------------------------------------------------------------
pageno: 359
result: "The input text is correctly OCR'ed."
------------------------------------------------------------------
pageno: 360
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 361
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 362
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 363
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 364
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 365
result: The input text is correctly OCR'ed.
------------------------------------------------------------------
pageno: 366
result: "The input text is correctly OCR'ed."
------------

In [18]:
# NOTE(review): mid-notebook import. The first cell already imports several
# helpers from src.utils, so this name could be added to that import instead.
from src.utils import dump_fixed_paragraphs_output_to_json

# Hand the paragraph-fixed English pages to the JSON dump helper for this folder.
dump_fixed_paragraphs_output_to_json(foldername, english_texts_fixed_paragraphs)

# Generate and save the document from the same pages. The `language` argument is
# the folder name followed by " - English_fixed_paragraphs".
doc2, fname2 = save_document(
    english_texts_fixed_paragraphs,
    foldername,
    language=f'{foldername} - English_fixed_paragraphs',
)


save_document - L197 - INFO - saved to "../output_data/Der Weltkrieg v2/Der Weltkrieg v2 - English_fixed_paragraphs"


In [50]:
# Spot check of a single page: print the `english_texts_defragmented` entry and
# then the `english_texts_fixed_paragraphs` entry for key '017', so the two
# versions can be compared by eye.
# NOTE(review): the key is a zero-padded string, not the printed page number --
# the output for '017' carries <pageno>10</pageno>. Presumably it is the PDF
# page index; confirm against the code that builds these dicts.
# Uncomment to also print the `german_texts` entry for the same key:
# print(german_texts['017'])
print(english_texts_defragmented['017'])
print(english_texts_fixed_paragraphs['017']) 



<pageno>10</pageno>
<header>The Agreements with Austria-Hungary.</header>
<body>
Support was expected. However, given the circumstances, a decision against France was impossible. After all, the Bosnian crisis had just reached its peak. In such a situation, one could not question the results of all previous negotiations but had to try to help as best as possible. Thus, General v. Moltke — albeit with a heavy heart — decided to make the commitment.
He wrote about it: An attack by weak German forces against the fortified Narew line would indeed face great difficulties and was also "threatened on the right flank by Warsaw, on the left by counterattacks from Lomsha. — Nevertheless, I will not hesitate to make the attack to support the simultaneous Austrian offensive. Your Excellencies can rely on this promise, which is well considered. The condition is that the movements of the allies are initiated simultaneously and carried out unconditionally. — Should the execution of the intentions be 

In [16]:
# Spot check of a single page: print the `english_texts` entry and then the
# `english_texts_defragmented` entry for key '117', so the two versions can be
# compared by eye.
# NOTE(review): the key is a zero-padded string, not the printed page number --
# the output for '117' carries <pageno>211</pageno>. Presumably it is the PDF
# page index; confirm against the code that builds these dicts.
# Uncomment to also print the `raw_german_texts` / `german_texts` entries:
# print(raw_german_texts['117'])
# print(german_texts['117'])
print(english_texts['117'])
print(english_texts_defragmented['117'])


<pageno>211</pageno>
<header>The Southern Army Halts the Advance to the Southeast.</header>
<body>General von Linsingen, however, did not consider the movements of the troops under Field Marshal Lieutenant Szurmay sufficient to reliably secure his left flank. This seemed assured to him only if the enemy was pushed back over the Dniester, which could only be achieved by deploying stronger forces. Reserves were not available. The divisions of Bothmer's corps stationed on the northern bank of the Dniester were also engaged in fierce defensive battles since early morning and urgently needed support themselves. Under these circumstances, there was no choice but to refrain from continuing the offensive of the right wing of the army in a northeasterly and northerly direction and to focus on defending what had been achieved, in order to free up his main forces for a counterattack to the northwest. On the morning of June 9, General von Linsingen ordered the Gerok corps, which was engaged in an