In [1]:
# In this notebook, I extract the data provided by the GLAUX project
# Data: Septuaginta, Exodus

In [2]:
import pandas as pd
import xml.etree.ElementTree as ET

In [3]:
# Parse the source XML document (data/0527-002.xml) and keep both the
# ElementTree wrapper and its root element for the cells below.
tree = ET.parse(source="data/0527-002.xml")
root = tree.getroot()

In [6]:
# Punctuation tokens that attach directly to the preceding word (no space before).
# Besides the ASCII marks this covers the Greek ano teleia (U+00B7 / U+0387) and
# the Greek question mark (U+037E), which is visually identical to ';'.
PUNCTUATION = frozenset({',', '.', ';', ':', '\u00b7', '\u0387', '\u037e'})


def join_forms(forms):
    """Join word forms into one verse string.

    Forms are separated by a single space, except that a punctuation token
    (see PUNCTUATION) is glued onto the preceding form.

    Args:
        forms: iterable of str, the word forms of one verse in document order.

    Returns:
        The assembled verse with leading/trailing whitespace stripped.
    """
    verse = ''
    for form in forms:
        if form in PUNCTUATION:
            # Drop any trailing space so the mark sits right after the word
            verse = verse.rstrip() + form
        else:
            verse += ' ' + form
    return verse.strip()


# Group the word forms by (chapter, verse_num); dicts preserve insertion order,
# so verses stay in the order in which they appear in the XML.
data = {}
skipped = 0
for word in root.findall(".//word"):
    div_section = word.get('div_section')  # e.g., "1.1" -> chapter 1, verse 1
    form = word.get('form')

    # Element.get returns None for a missing attribute; such a word cannot be
    # placed in a verse, so count it instead of crashing on None.
    if div_section is None or form is None:
        skipped += 1
        continue

    # A malformed reference (not "<int>.<int>") still raises ValueError here
    chapter, verse_num = map(int, div_section.split('.'))
    data.setdefault((chapter, verse_num), []).append(form)

if skipped:
    print(f"Skipped {skipped} <word> element(s) lacking 'div_section' or 'form'")

# One row per verse: join the forms (not lemmas) into readable text
rows = [
    {'chapter': chapter, 'verse_num': verse_num, 'verse': join_forms(forms)}
    for (chapter, verse_num), forms in data.items()
]

# Explicit columns keep the schema stable even if no words were found
df = pd.DataFrame(rows, columns=['chapter', 'verse_num', 'verse'])

# Display the first two verses
df.head(2)

Unnamed: 0,chapter,verse_num,verse
0,1,1,ταῦτα τὰ ὀνόματα τῶν υἱῶν Ἰσραὴλ εἰσ...
1,1,2,"Ρουβήν, Συμεών, Λευΐ, Ἰούδας,"


In [8]:
# Write the verse table to CSV; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="data/exodus_lxx.csv", index=False)

In [None]:
# (scratch cell: a stray `d` was removed here because it raised NameError on Run All)