# Data Preparation: Bible in Amis and English

In [None]:
# Mount Google Drive at /content/drive so that the corpus files under
# /content/drive/MyDrive can be read (and the XML written) by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import re
import glob

## Step 1: Get data and put it into the right format

In [None]:
# Directory holding the Amis files and the suffix shared by every Amis file
# name. They are named once so the literal text is not repeated below.
amis_dir = "/content/drive/MyDrive/FormosanResources/Bible_formo/Amis/"
amis_suffix = "AMIBSTN1DA.txt"

# Create lists of all files in Amis and English, sorted alphabetically so
# that both lists follow the same order.
bible_files_am = sorted(glob.glob(amis_dir + "*.txt"))
bible_files_en = sorted(glob.glob("/content/drive/MyDrive/FormosanResources/Bible_formo/English/*.txt"))

# Get list of just file names for later: drop the directory and the suffix.
# Plain string replacement is used instead of re.sub because these are
# literal strings, not patterns (an unescaped "." would match any character).
filenames = [name.replace(amis_dir, "").replace(amis_suffix, "") for name in bible_files_am]

In [None]:
# Show the extracted names, then how many of them there are (one per line)
print(filenames, len(filenames), sep="\n")

In [None]:
# Print the first and then the last path of each list (Amis before English)
# so the two orderings can be compared by eye
for position in (0, -1):
  print(bible_files_am[position])
  print(bible_files_en[position])

# Print how many files each language has
for file_list in (bible_files_am, bible_files_en):
  print(len(file_list))

In [None]:
# Check for missing files
# Reduce every path to its bare name by removing the language directory
# and the language-specific suffix
am_list = [
  re.sub("AMIBSTN1DA.txt", "", re.sub("/content/drive/MyDrive/FormosanResources/Bible_formo/Amis/", "", path))
  for path in bible_files_am
]
en_list = [
  re.sub("ENGNIVN1DA.txt", "", re.sub("/content/drive/MyDrive/FormosanResources/Bible_formo/English/", "", path))
  for path in bible_files_en
]

# Report every name that exists in one language but not in the other
for name in am_list:
  if name not in en_list:
    print("In Amis but not English:", name)
for name in en_list:
  if name not in am_list:
    print("In English but not Amis:", name)

In [None]:
# For each story, combine the Amis and English into a list
# This way we can convert it into XML
bible_big_list = []

for bible_file in bible_files_am:
  # The order of elements will be
  # Amis (standardized orthography), Amis (original), English
  bible_list = []

  # AMIS
  # Read the Amis file. The context manager closes the handle again, and the
  # explicit encoding keeps the curly apostrophes handled below from
  # depending on the platform's default encoding.
  with open(bible_file, "r", encoding="utf-8") as amis_handle:
    file_am = amis_handle.read()

  # Remove leading and trailing whitespace
  file_am = file_am.strip()
  # Get rid of the numeric markers of the form **12**
  file_am = re.sub(r"\*\*[0-9]+\*\*", "", file_am)
  # Collapse runs of newlines (plus any whitespace following them) into one
  file_am = re.sub(r"\n+[\s\n]*", "\n", file_am)
  # Split on newline and strip leading spaces
  list_am = [sentence.lstrip() for sentence in file_am.split("\n")]

  # Clean the Amis data for standardized orthography
  # Remove punctuation. Newlines are left untouched, so the standardized
  # text keeps exactly the same number of lines as the original text.
  file_am_standard = re.sub(r"[\\_,\[\]\(\);/<>]+", "", file_am)
  # Replace curly apostrophes with the plain one
  file_am_standard = re.sub(r"[’‘]", "'", file_am_standard)
  # Make lowercase
  file_am_standard = file_am_standard.lower()
  # Amis with standardized orthography (replace o with u)
  file_am_standard = file_am_standard.replace("o", "u")

  # Split on newline and strip leading spaces
  list_am_standard = [sentence.lstrip() for sentence in file_am_standard.split("\n")]

  # Add both Amis versions to the list
  bible_list.append(list_am_standard)
  bible_list.append(list_am)

  # ENGLISH
  # Derive the English path from the Amis path. These are literal strings,
  # so plain replacement is used rather than a regular expression.
  filename = bible_file.replace("AMIBSTN1DA.txt", "ENGNIVN1DA.txt")
  filename = filename.replace("/Amis/", "/English/")
  with open(filename, "r", encoding="utf-8") as english_handle:
    file_en = english_handle.read()

  # Remove leading and trailing whitespace
  file_en = file_en.strip()
  # Get rid of the numeric markers of the form **12**
  file_en = re.sub(r"\*\*[0-9]+\*\*", "", file_en)
  # Remove double, triple newlines
  # NOTE(review): unlike the Amis text, whitespace-only lines are kept and
  # lines are not left-stripped here. This may be what keeps blank lines
  # aligned with the Amis text - confirm before unifying the two cleanups.
  file_en = re.sub(r"\n+", "\n", file_en)
  # Split on newline
  list_en = file_en.split("\n")

  # Add English lines to the list
  bible_list.append(list_en)

  # Add the list to the big list
  bible_big_list.append(bible_list)

In [None]:
# Both numbers should match: one entry in the big list per Amis file
print(len(bible_files_am))
print(len(bible_big_list))

# Report every story whose three versions (standardized Amis, original Amis,
# English) do not all have the same number of lines
for index, story in enumerate(bible_big_list):
  line_counts = [len(story[0]), len(story[1]), len(story[2])]
  if len(set(line_counts)) != 1:
    print(bible_files_am[index])
    for count in line_counts:
      print(count)


In [None]:
# Spot-check the alignment: show the first three lines of every version for
# the first, second and last story, with a blank line between stories
for position, story_index in enumerate((0, 1, -1)):
  if position > 0:
    print()
  for language_index in range(3):
    print(bible_big_list[story_index][language_index][:3])

# Show the last line of every version for the first five stories to
# check for empty strings at the end
for story_index in range(5):
  for language_index in range(3):
    print(bible_big_list[story_index][language_index][-1])

In [None]:
# Now our Bible data is ready for XML
# The format of bible_big_list is as follows:
# bible_big_list[story][language][sentence]
# where language 0 = Amis (standardized orthography),
# 1 = Amis (original) and 2 = English
print(len(bible_big_list))

## Step 2: XML

In [None]:
import xml.etree.ElementTree as ET
import xml.dom.minidom

In [None]:
# This is the code for putting both the Amis and English translations into XML
# For each story, create a root and a TEXT subelement under root, then write
# the pretty-printed document to its own file.
for i, story in enumerate(bible_big_list):
  standard_sentences = story[0]
  original_sentences = story[1]
  english_sentences = story[2]

  root = ET.Element("root")
  text_element = ET.SubElement(root, "TEXT", {"xml:lang": "amis1246", "source": f"bible_{filenames[i]}"})

  # For each sentence in the story, create the S element and its child elements
  for j, standard_sentence in enumerate(standard_sentences):

    # First get the FORM (Amis sentence in standard orthography)
    s_element = ET.SubElement(text_element, "S", id=f"S{j}")
    form_element = ET.SubElement(s_element, "FORM", kindOf="standard")
    form_element.text = standard_sentence

    # Now get the Amis in original orthography
    form_element = ET.SubElement(s_element, "FORM", kindOf="original")
    form_element.text = original_sentences[j]

    # Now get the English. If the English text has fewer lines than the
    # Amis text, the TRANSL element is left empty and the position is
    # reported. Only this known case is handled, so any other error is no
    # longer silently swallowed.
    transl_element = ET.SubElement(s_element, "TRANSL", {"xml:lang": "stan1293"})
    if j < len(english_sentences):
      transl_element.text = english_sentences[j]
    else:
      print("story=", filenames[i], "sentence=", j)

    # Now get the words (from the standard Amis sentences). They depend
    # only on the Amis text, so they are emitted even when the English
    # line is missing.
    for k, word in enumerate(standard_sentence.split()):
      w_element = ET.SubElement(s_element, "W", id=f"S{j}W{k}")
      wform_element = ET.SubElement(w_element, "FORM")
      # Drop sentence-final punctuation from the word form
      wform_element.text = re.sub(r"[.?!]", "", word)

  # This is what will make the XML look "pretty" (readable,
  # with newlines, indenting) when you write it out to a file.
  xml_str = xml.dom.minidom.parseString(ET.tostring(root)).toprettyxml(indent="    ")

  # This does the writing to the file.
  with open(f"/content/drive/MyDrive/Part1/BibleV2/bible_{filenames[i]}.xml", "w", encoding="utf-8") as f:
    f.write(xml_str)