In [3]:
# Load the raw text, one list entry per physical line.
with open("sources/the_rubaiyat.txt", "r", encoding="utf-8") as src:
    lines = list(src)

# Trim surrounding whitespace from every line and squeeze each run of
# blank lines down to a single empty entry.
cleaned = []
for raw in lines:
    stripped = raw.strip()
    # A blank line is redundant when the line kept just before it is blank too.
    if not stripped and cleaned and cleaned[-1] == "":
        continue
    cleaned.append(stripped)

# Write the result back out, newline-separated (no newline is added after the last entry).
with open("the_rubaiyat_clean.txt", "w", encoding="utf-8") as dst:
    dst.write("\n".join(cleaned))

print("Cleaning done. Saved as the_rubaiyat_clean.txt")


Cleaning done. Saved as the_rubaiyat_clean.txt


In [1]:
# Transform Rubáiyát plain text to TEI-like blocks (no functions)

import html
from pathlib import Path

# === Input / Output files ===
# Read the cleaned text from the exact location the cleaning cell saves it to
# (the working directory), so a fresh Restart & Run All works end to end.
in_path  = Path("the_rubaiyat_clean.txt")   # cleaned text, one verse line per row
out_path = Path("rubaiyat_body.xml")        # will contain the <body> ... </body> fragment

# === Read and normalize lines ===
text = in_path.read_text(encoding="utf-8")
# Normalize newlines (CRLF / lone CR -> LF) and strip surrounding whitespace per line
lines = [ln.strip() for ln in text.replace("\r\n", "\n").replace("\r", "\n").split("\n")]

# === Split into stanzas (blank line = new stanza). Collapse multiple blanks. ===
stanzas = []
current = []
for ln in lines:
    if ln:
        current.append(ln)
    elif current:
        # A blank line closes the stanza in progress; repeated blanks are
        # ignored because `current` is already empty by then.
        stanzas.append(current)
        current = []
if current:
    # The last stanza has no blank line after it, so flush it explicitly.
    stanzas.append(current)

# Fallback: if the file had no blank lines but it *is* quatrains, chunk in 4s
if len(stanzas) == 1 and len(stanzas[0]) > 4:
    flat = stanzas[0]
    stanzas = [flat[i:i+4] for i in range(0, len(flat), 4)]

# === Build XML fragment ===
# One <div> per stanza, numbered from 1, each wrapping a single <lg> of <l> lines.
parts = ["<body>"]

for i, stanza in enumerate(stanzas, start=1):
    parts.append(f'  <div type="rubaiyat" n="{i}">')
    parts.append('    <lg type="rubai">')
    for verse in stanza:
        # Element text only needs &, <, > escaped. quote=False leaves apostrophes
        # and quotation marks readable instead of emitting &#x27; / &quot;.
        safe = html.escape(verse, quote=False)
        parts.append(f"      <l>{safe}</l>")
    parts.append("    </lg>")
    parts.append("  </div>")

parts.append("</body>")

xml_body = "\n".join(parts)
# End the file with a newline so it behaves as a normal text file in diffs and tools.
out_path.write_text(xml_body + "\n", encoding="utf-8")

print(f"Done. Wrote {out_path}")


Done. Wrote rubaiyat_body.xml
