<a href="https://colab.research.google.com/github/maggieberkley/amis-voice-classifier/blob/main/NTU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation: NTU Corpus in Amis, English, Chinese, and glossing

In [None]:
# Mount your Google Drive at /content/drive.
# This cell uses the google.colab package; the corpus paths used in later
# cells all live under /content/drive/MyDrive/, so run this cell first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import glob

## Step 1: Get data and put it into the right format

In [None]:
# Directory on the mounted Drive that holds the parallel NTU Amis files
NTU_AMIS_DIR = "/content/drive/MyDrive/FormosanResources/NTU_Formosan/Amis/"


def story_name(path, extension=".fo"):
  """Return the bare story name for a corpus file path.

  Everything up to and including the last "/" is dropped, and `extension`
  is removed only when it is the literal suffix of the file name. (A regex
  such as ".fo" would also delete any character followed by "fo" in the
  middle of a name, e.g. turning "info" into "i".)
  """
  base = path.rsplit("/", 1)[-1]
  if extension and base.endswith(extension):
    base = base[:-len(extension)]
  return base


# Create lists of all files in Amis, English, Chinese, and glossing,
# sorted alphabetically so that index i refers to the same story in every list
ntu_files_am = sorted(glob.glob(NTU_AMIS_DIR + "*.fo"))
ntu_files_en = sorted(glob.glob(NTU_AMIS_DIR + "*.en"))
ntu_files_zh = sorted(glob.glob(NTU_AMIS_DIR + "*.zh"))
ntu_files_gloss = sorted(glob.glob(NTU_AMIS_DIR + "*.gloss"))

# Get list of just file names (no directory, no extension) for later
filenames = [story_name(name) for name in ntu_files_am]

In [None]:
# Show all story names, how many there are, and the second one as a spot check
print(filenames)
story_count = len(filenames)
print(story_count)
second_story = filenames[1]
print(second_story)

['Amis_Conv-farming_marang_furayang', 'Amis_Conv-talking_tamih_panay', 'Amis_Nr-crab_panay', 'Amis_Nr-custom_Liu', 'Amis_Nr-frog_cinhua', 'Amis_Nr-frog_minzhu', 'Amis_Nr-frog_ofad', 'Amis_Nr-frog_zuomei', 'Amis_Nr-intro_Panay', 'Amis_Nr-intro_ofad', 'Amis_Nr-intro_tamih', 'Amis_Nr-peanut_panay', 'Amis_Nr-pear_cinhua', 'Amis_Nr-pear_lungi', 'Amis_Nr-pear_minzhu', 'Amis_Nr-pear_panay', 'Amis_Nr-pear_tamih', 'Amis_Nr-pear_zuomei']
18
Amis_Conv-talking_tamih_panay


In [None]:
# The four parallel file lists, in the order Amis, English, Chinese, gloss
file_lists = (ntu_files_am, ntu_files_en, ntu_files_zh, ntu_files_gloss)

# First file name in each list
for paths in file_lists:
  print(paths[0])

# Last file name in each list
for paths in file_lists:
  print(paths[-1])

# Number of files in each list
for paths in file_lists:
  print(len(paths))

/content/drive/MyDrive/FormosanResources/NTU_Formosan/Amis/Amis_Conv-farming_marang_furayang.fo
/content/drive/MyDrive/FormosanResources/NTU_Formosan/Amis/Amis_Conv-farming_marang_furayang.en
/content/drive/MyDrive/FormosanResources/NTU_Formosan/Amis/Amis_Conv-farming_marang_furayang.zh
/content/drive/MyDrive/FormosanResources/NTU_Formosan/Amis/Amis_Conv-farming_marang_furayang.gloss
/content/drive/MyDrive/FormosanResources/NTU_Formosan/Amis/Amis_Nr-pear_zuomei.fo
/content/drive/MyDrive/FormosanResources/NTU_Formosan/Amis/Amis_Nr-pear_zuomei.en
/content/drive/MyDrive/FormosanResources/NTU_Formosan/Amis/Amis_Nr-pear_zuomei.zh
/content/drive/MyDrive/FormosanResources/NTU_Formosan/Amis/Amis_Nr-pear_zuomei.gloss
18
18
18
18


In [None]:
def read_text(path):
  """Return the contents of `path`, decoded as UTF-8, without leading/trailing newlines.

  The file is opened in a `with` block so the handle is closed as soon as
  the text has been read.
  """
  with open(path, "r", encoding="utf-8") as f:
    return f.read().strip("\n")


def split_sentences(text):
  """Split `text` on newlines and strip leading whitespace from every line."""
  return [sentence.lstrip() for sentence in text.split("\n")]


def standardize_amis(text):
  """Return `text` cleaned up for the standardized Amis orthography."""
  # Drop backslashes, underscores, commas, periods and square brackets
  text = re.sub(r"[\\_,.\[\]]+", "", text)
  # Drop "==" markers
  text = re.sub("==", "", text)
  # Curly apostrophes -> straight apostrophe
  text = re.sub(r"[’‘]", "'", text)
  # Drop "<L2x" / "L2x>" tags (presumably second-language markers -- TODO confirm)
  text = re.sub(r"<L2\w", "", text)
  text = re.sub(r"L2\w>", "", text)
  # Amis with standardized orthography (replace o with u)
  text = re.sub("o", "u", text)
  return text


# For each story, combine the Amis, English, Chinese, and gloss into a list
# This way we can convert it into XML
ntu_big_list = []
for ntu_file in ntu_files_am:
  # Path up to and including the final ".", so the other extensions can be appended
  stem = ntu_file[:-len("fo")]

  # Original Amis text (not cleaned, only split into sentences)
  file_am = read_text(ntu_file)

  # The order of elements is
  # Amis (standardized orthography), Amis, English, Chinese, gloss
  ntu_list = [
      split_sentences(standardize_amis(file_am)),
      split_sentences(file_am),
      split_sentences(read_text(stem + "en")),
      split_sentences(read_text(stem + "zh")),
      split_sentences(read_text(stem + "gloss")),
  ]

  ntu_big_list.append(ntu_list)

In [None]:
# Check to make sure it lines up: show the first three sentences of each of
# the five layers for the first story and for the last story
first_story, last_story = ntu_big_list[0], ntu_big_list[-1]

for layer in range(5):
  print(first_story[layer][:3])
print()

for layer in range(5):
  print(last_story[layer][:3])

["e ma-sa-ma'an=tu kiya panay tu pi-lisu' isu tayra i uma'", "ca=hu kaku pi-lisu'", "pi-ala haca XX aku tuni kalitang la'enu nuya XX lupas"]
['e ma-sa-ma’an=tu kiya panay,\\ tu pi-liso’ isu,\\ tayra i oma’.\\', 'ca=ho kaku pi-liso’.\\', '[pi-ala haca],\\ [XX],\\ aku tuni kalitang,\\ [la’enu nuya==],\\ [XX],\\ lopas.\\']
['What happened to the rice last time you went to the farm?', 'I haven’t checked yet.', 'I only picked the snap beans under the peach trees.']
['你去田裡看水稻怎麼樣了。', '我還沒有去看。', '只有去摘長豆，就在桃樹下。']
['FIL AF-SA-what=PFV that.NOM rice OBL PI-see 2SG.GEN go LOC farm', 'NEG=IMPFV 1SG.NOM PI-see', 'PI-take PART XX 1SG.GEN this.OBL snap.bean below that.GEN XX peach']

["su'elin kita u ma u ma-lingad-ay", 'ira ku ccay a tamdaw ma-lingad tayni i lutuk', "su'elin mi-u'uk sa ku 'ayam ma-nengneng nira hay uya tayal aku sa'an l<um>uwad=tu cingra a tayni i umah nira"]
['su’elin kita u==,\\ ma u ma-lingad-ay.\\', 'ira ku ccay a tamdaw.\\ ma-lingad tayni i lutuk.\\', 'su’elin mi-o’o==k,\\ sa ku

In [None]:
# Number of Amis files vs. number of stories collected
print(len(ntu_files_am))
print(len(ntu_big_list))

# Report every story whose five layers do not all have the same number of lines
for idx, story in enumerate(ntu_big_list):
  line_counts = [len(story[layer]) for layer in range(5)]
  if len(set(line_counts)) > 1:
    print(ntu_files_am[idx])
    for count in line_counts:
      print(count)

18
18


In [None]:
# Now our NTU data is ready to be converted to XML.
# ntu_big_list holds one list per story.
# Each story list holds five parallel lists ("layers"), in this order:
# standardized Amis, original Amis, English, Chinese, gloss.
# Each layer holds one string per sentence.

# The format of ntu_big_list is as follows
# ntu_big_list[story][layer][sentence]

## Step 2: XML


In [None]:
import xml.etree.ElementTree as ET
import xml.dom.minidom

In [None]:
def split_morphemes(token):
  """Split a word (or the gloss of a word) into its morphemes.

  The token is split on "=", "-" and whitespace. If the token contains an
  infix written in angle brackets (e.g. "l<um>uwad"), the infix is placed
  first in the result without its brackets and is removed from the other
  pieces.
  """
  pieces = re.split(r"[=\-\s]", token)
  infix_match = re.search(r"<.+>", token)
  if infix_match:
    infix = infix_match.group()
    infix_clean = infix.replace("<", "").replace(">", "")
    # The infix is literal text, so remove it with str.replace rather than
    # using it as a regex pattern (it may contain regex metacharacters)
    pieces = [infix_clean] + [piece.replace(infix, "") for piece in pieces]
  return pieces


# This is the code for putting the Amis, the translations and the glossing into XML
# For each story, create a root and a TEXT subelement under root,
# then write one XML file per story.
# Layer indices in ntu_big_list[i]: 0 = standardized Amis, 1 = original Amis,
# 2 = English, 3 = Chinese, 4 = gloss
for i in range(len(ntu_big_list)):
  root = ET.Element("root")
  text_element = ET.SubElement(root, "TEXT", {"xml:lang": "amis1246", "source": f"ntu_{filenames[i]}", "audio": f"{filenames[i]}.mp3"})

  # For each sentence in the story, create the S element and its child elements
  for j in range(len(ntu_big_list[i][0])):

    # First get the FORM (Amis sentence in standard orthography),
    # with the morpheme boundary symbols removed
    s_element = ET.SubElement(text_element, "S", id=f"S{j}")
    form_element = ET.SubElement(s_element, "FORM", kindOf="standard")
    form_element.text = re.sub(r"[-=<>]", "", ntu_big_list[i][0][j])

    # Now get the Amis in original orthography
    form_element = ET.SubElement(s_element, "FORM", kindOf="original")
    form_element.text = ntu_big_list[i][1][j]

    # Now get the English
    transl_element = ET.SubElement(s_element, "TRANSL", {"xml:lang": "stan1293"})
    transl_element.text = ntu_big_list[i][2][j]

    # Now get the Chinese
    transl_element = ET.SubElement(s_element, "TRANSL", {"xml:lang": "mand1415"})
    transl_element.text = ntu_big_list[i][3][j]

    # Now get the glossing, collapsing runs of whitespace to a single space.
    # The normalized gloss is written back into ntu_big_list, as before.
    transl_element = ET.SubElement(s_element, "TRANSL", {"xml:lang": "gloss"})
    ntu_big_list[i][4][j] = re.sub(r"\s\s+", " ", ntu_big_list[i][4][j])
    transl_element.text = ntu_big_list[i][4][j]

    # Now get the words (from the standard Amis sentences) and their glosses
    words = ntu_big_list[i][0][j].split()
    glossing = ntu_big_list[i][4][j].split()

    for k in range(len(words)):
      w_element = ET.SubElement(s_element, "W", id=f"S{j}W{k}")
      wform_element = ET.SubElement(w_element, "FORM")
      std_word = re.sub(r"[-=<>]", "", words[k])
      std_word = re.sub(r"[.!?]", "", std_word)
      wform_element.text = std_word

      # Now get the morphemes for each word and the matching gloss pieces
      morphs = split_morphemes(words[k])
      if k < len(glossing):
        glosses = split_morphemes(glossing[k])
      else:
        # The gloss line has fewer tokens than the sentence has words:
        # every morpheme of this word is reported below instead of crashing
        glosses = []

      for m in range(len(morphs)):
        m_element = ET.SubElement(w_element, "M", id=f"S{j}W{k}M{m}")
        # Create FORM within each morpheme
        mform_element = ET.SubElement(m_element, "FORM")
        mform_element.text = morphs[m]
        # Create TRANSL for gloss within morpheme; it stays empty (and the
        # position is reported) when there is no gloss for this morpheme
        glosstransl_element = ET.SubElement(m_element, "TRANSL")
        if m < len(glosses):
          glosstransl_element.text = glosses[m]
        else:
          print(filenames[i], ", Sentence:", j, ", Word:", k, "Morpheme:", m)

  # Generate XML tree
  tree = ET.ElementTree(root)

  # This is what will make the XML look "pretty" (readable,
  # with newlines, indenting) when you write it out to a file.
  xml_str = xml.dom.minidom.parseString(ET.tostring(root)).toprettyxml(indent="    ")

  # This does the writing to the file.
  with open(f"/content/drive/MyDrive/Part1/NTUV4/ntu_{filenames[i]}.xml", "w", encoding="utf-8") as f:
    f.write(xml_str)

Amis_Conv-talking_tamih_panay , Sentence: 19 , Word: 13 Morpheme: 1
Amis_Conv-talking_tamih_panay , Sentence: 19 , Word: 13 Morpheme: 2
Amis_Nr-frog_ofad , Sentence: 2 , Word: 2 Morpheme: 1
Amis_Nr-intro_tamih , Sentence: 10 , Word: 2 Morpheme: 2
