In [None]:
# Mount your Google Drive
# The corpus XML files are read from, and every result file is written to,
# paths under /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# STEP 1: Parse NTU Corpus XML and organize data
In this section, we parse the NTU XML files. Using the glosses, we attempt to determine which words are verbs and we attempt to determine their voice. We create a list of all unique verbs, all words in each voice, and all verb stems that we have identified.

In [None]:
# Import statements
import xml.etree.ElementTree as ET
import re
import glob
from operator import setitem

In [None]:
# Make a list of XML files for the NTU data.
# glob returns paths in arbitrary order, so sort them: the file order fixes the order
# of word_list and therefore which rows train_test_split puts in the test set.
xml_files = sorted(glob.glob("/content/drive/MyDrive/Part1/NTUV4/*"))

# Fail with a clear message (instead of a bare IndexError) if the folder is missing or empty
if not xml_files:
  raise FileNotFoundError("No NTU XML files found in /content/drive/MyDrive/Part1/NTUV4/")

# Show the first path as a sanity check
print(xml_files[0])

/content/drive/MyDrive/Part1/NTUV4/ntu_Amis_Conv-farming_marang_furayang.xml


In [None]:
# Glosses that indicate verbs.
# A word is treated as a verb candidate when one of these strings occurs in the gloss
# of any of its morphemes.
verb_markers = [
  "AF",   # agent focus
  "PF",   # patient focus
  "LF",   # location focus
  "IF",   # instrument focus
  "PFV",  # perfective
  "PI",   # pi-
  "Ca",   # consonant reduplication with /a/
  "KA",   # ka-
  "SA",   # sa-
  "CAU",  # causative
  "A",    # a-
]

In [None]:
# Walk every NTU file word by word and record, from the morpheme glosses,
# whether the word looks like a verb, which voice(s) it may carry, and its stem.

verb_list = []            # every token judged to be a verb (duplicates kept)
word_list = []            # nested: one list per file, each holding one word list per sentence
gloss_list = []           # unique glosses, in order of first appearance
verb_stem_list = dict()   # stem -> set of verb forms in which that stem was found

# All tokens flagged for each voice (duplicates kept, not restricted to verbs)
av_list = []
pv_list = []
lv_list = []
iv_list = []

# stem -> number of tokens flagged for each voice
a_dict = dict()
p_dict = dict()
l_dict = dict()
i_dict = dict()

# For each XML file
for xmlfile in xml_files:

  # Read in and parse the XML file
  tree = ET.parse(xmlfile)
  root = tree.getroot()
  file_words = []

  for S in root.iter('S'):
    sent_words = []

    # For each word in the sentence
    for W in S.iter('W'):

      # Get the word
      word = W.find('FORM').text

      # Add word to list of words, except for XX
      # (XX words are still examined below, they are only left out of the sentence)
      if word != "XX":
        sent_words.append(word)

      # Reset the stem for every word. Without this, a word that has no all-lowercase
      # gloss silently reused the stem of the previous word (or raised NameError if it
      # was the very first word), which filed words under unrelated stems.
      stem = None

      # Initialize flags for verb and voice
      verb_check = 0
      av_check = 0
      pv_check = 0
      lv_check = 0
      iv_check = 0

      # Word-level flags: set once any morpheme gloss of the word contains the marker
      sa = 0
      an = 0
      mi = 0
      ma = 0
      aw = 0
      pi = 0
      ka = 0

      # Check for false start, lexical glosses, and the gloss "Amis"
      fs = 0
      lexical = 0
      amis = 0

      # For each morpheme in the word
      for M in W.iter('M'):

        # Get the gloss; morphemes without gloss text are skipped
        gloss = M.find("TRANSL").text
        if gloss is None:
          continue

        # Add gloss to list of glosses
        # We used this code to find out which glosses appear in the NTU Corpus
        if gloss not in gloss_list:
          gloss_list.append(gloss)

        # An all-lowercase gloss is a lexical translation, so this morpheme is the stem.
        # If several morphemes qualify, the last one wins.
        if gloss.islower():
          stem = M.find("FORM").text

        # Check for certain markers that may indicate voice
        # These morphemes appear in prefix-suffix sets in words
        # So we need to check for combinations of them at the word level
        # But we only have glosses by morpheme in the XML
        # So we use these flags to track the markers on the word level
        if re.search(r"SA", gloss):
          sa = 1
        if re.search(r"AN", gloss):
          an = 1
        if re.search(r"MI", gloss):
          mi = 1
        if re.search(r"MA", gloss):
          ma = 1
        if re.search(r"AW", gloss):
          aw = 1
        if re.search(r"PI", gloss):
          pi = 1
        if re.search(r"KA", gloss):
          ka = 1
        if re.search(r"FS", gloss):
          fs = 1
        if re.search(r"[a-z]", gloss):
          lexical = 1
        if gloss == "Amis":
          amis = 1

        # Check if the morpheme is a verb marker
        # NOTE(review): this is a substring search, so the marker "A" also fires on any
        # gloss containing a capital A (e.g. "PART", "PAST") and "PF" fires on "PFV".
        # Confirm that this is intended.
        for marker in verb_markers:
          if re.search(marker, gloss):
            verb_check = 1

        # Check if verb has SA and PI or SA and KA
        # If it just has SA, it forms a superlative
        # If it has SA with PI, KA, AN, or AW, it's a verb
        # (the flags are cumulative, so this is re-evaluated after every morpheme)
        if sa:
          if pi or ka or an or aw:
            verb_check = 1
          else:
            verb_check = 0

        # Check if it's a nominalized verb (with suffix -ay)
        # We don't want to count nominalized verbs as verbs
        # NOTE(review): this only clears the flag for the current morpheme; a verb
        # marker on a later morpheme of the same word sets it again.
        if re.search(r"AY", gloss):
          verb_check = 0

        # Check which voice the word might fall into
        # Our reasons for choosing these combinations of morphemes can be found in the notes sheet
        # Check for AGENT VOICE
        if re.search(r"AF", gloss) or (mi and not an):
          av_check = 1

        # Check for PATIENT VOICE
        if re.search(r"PF", gloss) or (mi and an) or (sa and aw):
          pv_check = 1

        # Check for LOCATIVE VOICE
        if re.search(r"LF", gloss) or (an and not mi):
          lv_check = 1

        # Check for INSTRUMENTAL VOICE
        if re.search(r"IF", gloss) or (sa and pi) or (sa and ka):
          iv_check = 1

      # Add words to verb and voice lists.
      # The word itself is always recorded; the stem tables are only updated when a
      # stem was actually found in this word.

      # Add to VERB list if
      # it has verb markers, has a lexical gloss, is not a false start, and is not glossed "Amis"
      if verb_check == 1 and lexical and not fs and not amis:
        verb_list.append(word)
        # Add to stem list
        if stem is not None:
          verb_stem_list.setdefault(stem, set()).add(word)

      # Add to AGENT VOICE list
      if av_check == 1:
        av_list.append(word)
        if stem is not None:
          a_dict[stem] = a_dict.get(stem, 0) + 1

      # Add to PATIENT VOICE list
      if pv_check == 1:
        pv_list.append(word)
        if stem is not None:
          p_dict[stem] = p_dict.get(stem, 0) + 1

      # Add to LOCATIVE VOICE list
      if lv_check == 1:
        lv_list.append(word)
        if stem is not None:
          l_dict[stem] = l_dict.get(stem, 0) + 1

      # Add to INSTRUMENTAL VOICE list
      if iv_check == 1:
        iv_list.append(word)
        if stem is not None:
          i_dict[stem] = i_dict.get(stem, 0) + 1

    # Add sentence of words to words in file
    file_words.append(sent_words)

  # Add the file's sentences to the corpus-wide list
  word_list.append(file_words)


# Per-voice stem counts, most frequent stem first
sorted_a_dict = dict(sorted(a_dict.items(), key=lambda x:x[1], reverse=True))
sorted_p_dict = dict(sorted(p_dict.items(), key=lambda x:x[1], reverse=True))
sorted_l_dict = dict(sorted(l_dict.items(), key=lambda x:x[1], reverse=True))
sorted_i_dict = dict(sorted(i_dict.items(), key=lambda x:x[1], reverse=True))

In [None]:
# Show the stem -> verb-forms mapping, then the four per-voice stem counts
# (agent, patient, locative, instrumental), each followed by its number of entries
for table in (verb_stem_list, sorted_a_dict, sorted_p_dict, sorted_l_dict, sorted_i_dict):
  print(table)
  print(len(table))

{"lisu'": {"pilisu'", "milisu'"}, 'ala': {'alahan', 'mialatu', 'mialaay', 'miala', 'piala', 'alaan', 'alaantu', 'mialaan', 'sapialaan', 'kaala', 'maalatu', 'maala', 'nialaan', 'alahantu'}, 'urad': {'maurad', 'kalaurad', 'mamaurad', 'pipakaurad', 'pakaurad'}, 'fangcal': {'fangcalitu', 'kafangcal', 'fangcaltu'}, 'nengneng': {'minengnengan', 'nengnenghanen', 'pinengneng', 'minengneng', 'adihay', 'nengnenghantu', 'kanengneng', 'nengnengitu', 'sanengnenghan', 'sanengnengnengnenghan', 'manengneng', 'papinengneng', 'nengnengen', 'nengnenghan'}, "ma'an": {'mami', "masama'antu", "misama'ama'an", "mima'an", "mama'an", "sapima'an", "ma'anen"}, "ha'en": {"mamaha'en", "maha'en"}, 'suwal': {'sasuwalen', 'sumuwal', 'pasuwalhantu', 'Amis'}, 'sadak': {'masadaktu', 'pasadak', 'masadasadak', 'masadak'}, 'enem': {'sakaenem'}, 'panay': {'mipanaytu', 'mipanay'}, 'sulinga': {'sulingaen', 'sulingatu'}, 'cuwa': {'cuwacuwatu', 'cuwatu', 'talacuwaen', 'talacuwatu'}, 'itira': {'itiraen', 'tuninian', 'kaitira'}, '

In [None]:
# Keep only the stems (the keys of the stem dictionary), in insertion order
just_verb_stems = [verb_stem for verb_stem in verb_stem_list]
print(just_verb_stems)

["lisu'", 'ala', 'urad', 'fangcal', 'nengneng', "ma'an", "ha'en", 'suwal', 'sadak', 'enem', 'panay', 'sulinga', 'cuwa', 'itira', 'pawali', 'licay', 'icuwa', 'lecad', "nga'ay", 'tusa', 'liaw', 'ulung', 'tayra', 'acukemas', 'kaen', 'hiya', 'ruray', 'paluma', 'sikul', 'smut', 'nengneng/', 'tiwas', 'puhed', 'cucuk', 'kunira', 'iw', 'matira', 'faw', 'edeng', 'mutep', 'fala', 'nukay', "paluma'", 'munengaw', "cu'ung", 'satahepu', "tu'ur", 'tayni', 'sanga', "'eci", 'harek', 'pina', 'lima', "'aca", 'funsia', 'harateng', 'keter', 'tayal', 'emin', 'kadafu', 'patiku', 'herek', 'liyaw', "pela'", 'haen', "sanga'", 'lepun', 'anini', 'pacuk', "ati'", 'lahuk', 'tadem', 'rulay', 'ulah', 'fukil', 'heci', 'tenes', "fana'", 'ngaay', 'tama', 'tapang', 'tudung', 'hatira', 'filu', 'falic', 'sa', 'tengil', 'tusuk', 'nanam', 'kingkiw', "sulinga'", 'kinkiw', "'ayaw", 'ikur', 'tatudung', "tu'as", "ka'en", 'tangtang', 'fuhad', 'kalang', 'kalat', 'ngangan', 'itiya', 'tapuh', 'cidal', 'lingad', 'kedal', "'urad", "la

In [None]:
# word_list is nested by file and then by sentence:
# word_list = [[[all words in file 1 sentence 1], [words in file 1 sentence 2]], ...]

# Print the nested list and the number of files
print(word_list)
print(len(word_list))

# Flatten it into a single list holding every word token of every file
word_list_joined = [
  token
  for file_sentences in word_list
  for sentence_tokens in file_sentences
  for token in sentence_tokens
]
print(len(word_list_joined))

# Reduce the tokens to the set of unique words
unique_word = set(word_list_joined)
print(unique_word)
print(len(unique_word))

[[['e', "masama'antu", 'kiya', 'panay', 'tu', "pilisu'", 'isu', 'tayra', 'i', "uma'"], ['cahu', 'kaku', "pilisu'"], ['piala', 'haca', 'aku', 'tuni', 'kalitang', "la'enu", 'nuya', 'lupas'], ['hay', 'matini', 'sa', 'kalaurad', 'latek', "ca'ay", 'kafangcal', 'ku', 'heci', 'nu', 'panay', 'sa', 'ku', 'maku', 'harateng'], ['anu', "masama'an", 'ku', 'misu', 'a', 'pinengneng', 'tura', 'panay', 'sa', 'ku', 'maku', 'harateng'], ['hay', 'a', "ma'anen", 'ku', "fana'", "samaha'enmaha'en", 'ku', 'aniniay', 'a', "miheca'an", 'a'], ['a', 'hiya', 'nu', 'nu', 'nu', 'rakat', 'nu', 'fali', 'ita', 'kira', "mamaha'en", 'haca'], ['u', 'ruma', 'satu', 'sasuwalen', 'aku', 'matini', 'u', 'masadaktu', 'ku', 'panay'], ['latek', 'itini', 'i', 'sakaenem', 'a', 'fulad'], ['alatek', 'a', 'mipanaytu', 'sa', 'ku', 'maku'], ['ta', 'sulingaen', 'ita', 'ku', 'malulalusidang', 'a', 'mipanay', 'u', 'pawutiwuti'], ['masadaktu', 'i', 'paputal', 'sa', 'ku', 'maku', 'harateng'], ['hay', "maha'en", 'ku', 'maku', 'harateng'], ['a

In [None]:
# Every token that was classified as a verb (with repeats), and how many there are
print(verb_list)
print(len(verb_list))

# The same verbs with duplicates removed
unique_verb = list(set(verb_list))
print(unique_verb)
print(len(unique_verb))

["pilisu'", "pilisu'", 'piala', 'kalaurad', 'kafangcal', 'pinengneng', "ma'anen", "mamaha'en", 'sasuwalen', 'masadaktu', 'sakaenem', 'mipanaytu', 'sulingaen', 'mipanay', 'masadaktu', "maha'en", 'mipanay', 'talacuwaen', 'itiraen', 'mipawali', 'milicay', 'icuwatu', 'malecadtu', "nga'aytu", 'sulingatu', 'tatusa', 'kaitira', 'pisaliawliaw', 'miulung', "maha'en", 'milicay', 'mipanay', 'patayra', 'acukemashan', 'kakaenen', "manga'aytu", "ma'anen", 'hiyatu', "manga'aytu", 'miala', 'karuray', 'mialatu', 'malukakaenen', "manga'aytu", "maha'en", 'nipalumaan', 'kasikul', 'tatusa', 'nipalumaan', 'masmut', 'tayratu', 'minengneng/', 'minengnengan', 'nengnenghan', 'tiwashan', 'mapuhed', 'tiwashan', 'mapuhed', 'micucuk', 'pakunirahantu', 'paiw', 'mamatiratira', 'mafaw', 'minengneng', 'kafangcal', 'kalafaw', 'paiw', "kanga'ay", 'alaan', 'maedeng', 'pakamutep', 'falahantu', 'maala', 'panukay', 'mami', 'sasuwalen', "nipaluma'an", 'munengawtu', "cicu'ungtu", 'alahantu', 'lukut', "patu'urhantu", "manga'ayt

In [None]:
# For each voice, keep only the flagged words that are also in the verb list
known_verbs = set(unique_verb)
av_verbs = known_verbs.intersection(av_list)
pv_verbs = known_verbs.intersection(pv_list)
lv_verbs = known_verbs.intersection(lv_list)
iv_verbs = known_verbs.intersection(iv_list)

# Report agent, patient, locative and instrumental voice in turn:
# all unique flagged words and their count, then the verbs among them and their count
voice_report = (
  (av_list, av_verbs, "Number of AV verbs:"),
  (pv_list, pv_verbs, "Number of PV verbs:"),
  (lv_list, lv_verbs, "Number of LV verbs:"),
  (iv_list, iv_verbs, "Number of IV verbs:"),
)
for flagged_tokens, voice_verbs, count_label in voice_report:
  flagged_unique = set(flagged_tokens)
  print(flagged_unique)
  print(len(flagged_unique))

  print(voice_verbs)
  print(count_label, len(voice_verbs))

{'misatapang', 'miawaw', "maduka'", 'malecadtu', 'misatapangtu', 'mialaay', 'tayni', 'miXX', 'mikalic', 'maaraw', 'misapuhpuhpuh', 'masarucud', 'malepuntu', 'mafaw', 'miliyas', "kuma'en", 'miiyuf', 'masadasadak', 'mitafsiw', 'matuastu', 'adada', 'mihayda', 'mahaen', 'misalisin', 'mikadafuaytu', 'misawad', "mira'ur", 'mipanay', 'misaheci', 'matalawtu', "misaluma'", 'mitulas', 'mikansia', 'matumesay', 'mipanaytu', "luma'uc", 'mahicurcur', 'miulung', 'malingad', 'mikalang', 'mi', 'ma', 'lumuwad', 'mamalemed', 'masiday', 'maepung', 'mifuhad', 'matiya', 'maepud', 'misaharaterateng', 'mifalic', 'mikulisiw', 'lalumuwadtu', 'misalifun', 'mapulin', 'tayra', 'masadaktu', 'masadakay', 'mipacuk', 'mitengiltu', 'matuasay', 'mialatu', 'matuas', 'cumikay', 'mirakat', 'adihaytu', 'mikatafu', 'miapid', 'mikilim', 'mamaurad', 'mifunsia', "ma'anuf", 'masmut', 'fangcalay', 'masa', "misaluma'ay", 'mitatuy', 'mipitpitay', 'mafuraw', "miu'uk", 'macuwatitu', 'malufic', 'minengneng/', 'machunpi', 'adihay', 'mi

In [None]:
# Show every distinct gloss found in the corpus
print(gloss_list)
print(len(gloss_list))

# Break compound glosses such as "2SG.GEN" into their "."-separated parts;
# glosses without a "." are carried over unchanged
new_gloss_list = [part for full_gloss in gloss_list for part in full_gloss.split(".")]

# Keep only the parts written in capitals (the abbreviations), de-duplicated and sorted.
# Parts that mix digits and capitals, such as "1SG", count as upper case and are kept.
gloss_unique = sorted({part for part in new_gloss_list if part.isupper()})
print(gloss_unique)
print(len(gloss_unique))



['FIL', 'AF', 'SA', 'what', 'PFV', 'that.NOM', 'rice', 'OBL', 'PI', 'see', '2SG.GEN', 'go', 'LOC', 'farm', 'NEG', 'IMPFV', '1SG.NOM', 'take', 'PART', 'XX', '1SG.GEN', 'this.OBL', 'snap.bean', 'below', 'that.GEN', 'peach', 'BC', 'this.way', 'KALA', 'rain', 'probably', 'KA', 'good', 'NOM', 'fruit', 'GEN', '1SG.POSS', 'thought', 'COND', '2SG.POSS', 'LNK', 'that.OBL', 'PF', 'know', 'this', 'now', 'AY', 'year', 'walk', 'weather', '1IPL.GEN', 'this.NOM', 'Ca', 'CN', 'other', 'speak', 'go.out', 'here', 'IF', 'six', 'month', 'then', 'arrange', 'become', 'goods', 'RED', 'sack', 'outside', '1IPL.NOM', 'HAW', 'DM', 'to', 'where', 'dry.in.the.sun', 'OPT', 'farmer’s.association', 'there', 'NCM.SG', 'PN', 'dry', 'ask', 'heart', 'no.problem', 'same.as', 'that.way', 'speech', '1IPL.POSS', 'two', 'again', 'carry', 'that', 'so', '2SG.OBL', 'TOP', 'FS', 'CAU', 'send', 'HAN.PF', 'eat', 'NCM.SG.NOM', 'sir', 'tired', 'house', 'grow', 'AN', 'take.care.of', 'weed', 'PAST', '2SG.NOM', 'west', 'branch', 'red', 

In [None]:
# Write all verbs from the NTU Corpus into a text file (comma-separated, repeats included).
# The encoding is given explicitly so the file does not depend on the platform default.
with open("/content/drive/MyDrive/Part2/Verbs/verbs_NTU.txt", "w", encoding="utf-8") as f:
  f.write(",".join(verb_list))

In [None]:
# Write each list of verbs by voice into a text file (comma-separated).
# The verb collections are sets, whose iteration order changes between kernel sessions,
# so they are sorted to make the files identical from run to run.
# The encoding is given explicitly so the files do not depend on the platform default.
voice_verb_files = (
  ("av_verbs_NTU.txt", av_verbs),
  ("pv_verbs_NTU.txt", pv_verbs),
  ("lv_verbs_NTU.txt", lv_verbs),
  ("iv_verbs_NTU.txt", iv_verbs),
)
for voice_filename, voice_verb_set in voice_verb_files:
  with open("/content/drive/MyDrive/Part2/Verbs/" + voice_filename, "w", encoding="utf-8") as f:
    f.write(",".join(sorted(voice_verb_set)))

In [None]:
# Write all verb stems into a text file (comma-separated).
# The encoding is given explicitly so the file does not depend on the platform default.
with open("/content/drive/MyDrive/Part2/Verbs/stems_NTU.txt", "w", encoding="utf-8") as f:
  f.write(",".join(just_verb_stems))

# STEP 2: Create feature vectors

In order to create a classifier for this data, we need to turn each word into a vector. We do this by assigning a unique index to each word in the NTU Corpus and creating a vector for each word that includes the following information: [ID of the word, ID of word before, ID of word two before, ID of word after, ID of word two after].

In [None]:
# Create dictionaries that map each word to an ID, and vice versa
# We can use this to go back and forth between IDs and words
# IDs start at 1 because 0 is used in the feature vectors for "no word in this position".
# The words are sorted before numbering: unique_word is a set, and the iteration order
# of a set of strings changes between kernel sessions, which would give every word a
# different ID (and the classifiers different features) on each run.
# key=str keeps the sort from failing if a word without text (None) is in the set.
word_to_idx = {word: idx for idx, word in enumerate(sorted(unique_word, key=str), start=1)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

In [None]:
# Round-trip check: look up the word for a few IDs and map it back to its ID.
# Samples the first five IDs and six IDs starting at 1000.
for i in [*range(1, 6), *range(1000, 1006)]:
  sample_word = idx_to_word[i]
  print(i, sample_word, word_to_idx[sample_word])

1 itiratu 1
2 mialaay 2
3 luma' 3
4 kunu 4
5 celi 5
1000 pitulas 1000
1001 usaw 1001
1002 pakawal 1002
1003 matumusay 1003
1004 maemintu 1004
1005 nu 1005


In [None]:
# Now use the dictionaries we created to create feature vectors
# We want the feature vectors to look like this:
# [ID of word, ID of word before, ID of word two before, ID of word after, ID of word two after]
# A position that falls outside the sentence gets ID 0.

# We want a list of lists of vectors with the following format:
# all_vecs = [file_vecs for first file, file_vecs for second file, ...]
# file_vecs = [sent_vecs for first sentence, sent_vecs for second sentence, ...]
# sent_vecs = [vec for first word, vec for second word, ...]
# vec = [ID of word, ID of word before, ID of word two before, ID of word after, ID of word two after]

# We also want a list of whether or not each word is in our list of verbs ([1] if verb, [0] if not),
# nested in the same file -> sentence -> word layout as all_vecs

all_vecs = []
all_is_verb = []

# verb_list holds one entry per verb token, so `word in verb_list` rescans a long,
# repetitive list for every word of the corpus. A set answers the same question directly.
verb_set = set(verb_list)

# Positions, relative to the current word, in the order they appear in a vector
context_offsets = (0, -1, -2, 1, 2)

# For each file in the corpus
for word_sublist in word_list:
  file_vecs = []
  file_is_verb = []

  # For each sentence in the file
  for sentence in word_sublist:
    sent_vecs = []
    is_verb = []
    n_words = len(sentence)

    # For each word in the sentence
    for i in range(n_words):

      # IDs of the word and of its neighbours; 0 where the neighbour does not exist
      vec = [
        word_to_idx[sentence[i + offset]] if 0 <= i + offset < n_words else 0
        for offset in context_offsets
      ]

      # Append to list of vectors in sentence
      sent_vecs.append(vec)

      # If word is a verb, [1], else [0]
      is_verb.append([1] if sentence[i] in verb_set else [0])

    # Append the sentence to the file-level lists
    file_vecs.append(sent_vecs)
    file_is_verb.append(is_verb)

  # Append to big list of vectors
  all_vecs.append(file_vecs)
  # Append to big list of is_verb
  all_is_verb.append(file_is_verb)

In [None]:
# Inspect the feature vectors and the verb labels, which share the same nesting

# First file: its contents, then the number of files (vectors first, labels second)
for nested in (all_vecs, all_is_verb):
  print(nested[0])
  print(len(nested))

# First sentence of the first file: its contents, then its number of words
for nested in (all_vecs, all_is_verb):
  first_sentence_items = nested[0][0]
  print(first_sentence_items)
  print(len(first_sentence_items))


[[[313, 0, 0, 1277, 431], [1277, 313, 0, 431, 52], [431, 1277, 313, 52, 618], [52, 431, 1277, 618, 476], [618, 52, 431, 476, 164], [476, 618, 52, 164, 1230], [164, 476, 618, 1230, 1188], [1230, 164, 476, 1188, 1431], [1188, 1230, 164, 1431, 0], [1431, 1188, 1230, 0, 0]], [[505, 0, 0, 536, 476], [536, 505, 0, 476, 0], [476, 536, 505, 0, 0]], [[170, 0, 0, 817, 1201], [817, 170, 0, 1201, 398], [1201, 817, 170, 398, 1145], [398, 1201, 817, 1145, 20], [1145, 398, 1201, 20, 990], [20, 1145, 398, 990, 335], [990, 20, 1145, 335, 0], [335, 990, 20, 0, 0]], [[719, 0, 0, 1030, 506], [1030, 719, 0, 506, 678], [506, 1030, 719, 678, 972], [678, 506, 1030, 972, 1214], [972, 678, 506, 1214, 865], [1214, 972, 678, 865, 1114], [865, 1214, 972, 1114, 19], [1114, 865, 1214, 19, 1005], [19, 1114, 865, 1005, 52], [1005, 19, 1114, 52, 506], [52, 1005, 19, 506, 1114], [506, 52, 1005, 1114, 1211], [1114, 506, 52, 1211, 395], [1211, 1114, 506, 395, 0], [395, 1211, 1114, 0, 0]], [[226, 0, 0, 1094, 1114], [1094, 

# STEP 3: Create a classifier for verb vs. non-verb
Here we prepare the data to be used as train and test data. Then we experiment with four different types of classifiers: Logistic Regression, Decision Tree, Random Forest, and Gradient Boosting. We made some adjustments to the verb list between attempts, so that accounts for the differences in accuracy between attempts. We get the best accuracy from the Gradient Boosting classifier, so that is the classifier that we will use to predict verbs in the Bible data.

In [None]:
# Import statements
from sklearn.model_selection import train_test_split
import numpy
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Prepare the data to use as train and test data

# Flatten all_vecs (file -> sentence -> word) into one array with a row per word.
# The rows are collected in a Python list and converted once: numpy.append returns a
# new copy of the whole array on every call, so appending row by row inside the loop
# costs time quadratic in the number of words.
# reshape(-1, 5) keeps the array two-dimensional even if there are no rows.
np_vecs = numpy.array(
  [vec for file_vecs in all_vecs for sent_vecs in file_vecs for vec in sent_vecs],
  dtype=int,
).reshape(-1, 5)
print(np_vecs)
X = np_vecs

# Flatten all_is_verb the same way. Each word's label is stored as a one-element
# list ([1] or [0]), so the innermost loop unwraps it.
# np_verbs now holds exactly one label per row of X (no dummy leading element).
np_verbs = numpy.array(
  [
    label
    for file_labels in all_is_verb
    for sent_labels in file_labels
    for word_label in sent_labels
    for label in word_label
  ],
  dtype=int,
)
print(np_verbs)
y = np_verbs

# Every feature vector must have exactly one label
assert len(X) == len(y), "feature vectors and labels are out of step"

# Partition the data
# We'll use 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

[[ 313    0    0 1277  431]
 [1277  313    0  431   52]
 [ 431 1277  313   52  618]
 ...
 [ 945  545  933 1114  978]
 [1114  945  545  978    0]
 [ 978 1114  945    0    0]]
[0 0 1 ... 1 0 0]


In [None]:
# Logistic Regression model, trained on the NTU training split
# (fit returns the fitted estimator, so creation and training are chained)
logregmodel = LogisticRegression().fit(X_train, y_train)

# Predict verb / non-verb for the held-out words
predictions = logregmodel.predict(X_test)

# Share of held-out words that were labelled correctly
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8287



First attempt:
```
# Accuracy: 0.7004
```
Second attempt:

```
# Accuracy: 0.8287
```
Third attempt:

```
# Accuracy: 0.8287
```

In [None]:
# Decision tree model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Initialize the DecisionTreeClassifier
# random_state is fixed, as it is for the random forest and gradient boosting models,
# so that re-running the cell on the same data gives the same tree and accuracy.
clf = DecisionTreeClassifier(random_state=42)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred_decision = clf.predict(X_test)

# Calculate accuracy
accuracy_decision = accuracy_score(y_test, y_pred_decision)
print(f"Accuracy: {accuracy_decision}")

Accuracy: 0.8544973544973545



First attempt:
```
# Accuracy: 0.8644179894179894
```
Second attempt:

```
# Accuracy: 0.8458994708994709
```
Third attempt:

```
# Accuracy: 0.8220899470899471
```



In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Build a forest of 100 trees with a fixed seed and train it on the NTU training split
# (fit returns the fitted estimator, so creation and training are chained)
rf_classifier = RandomForestClassifier(
  random_state=42,
  n_estimators=100,
).fit(X_train, y_train)

# Predict verb / non-verb for the held-out words
y_pred_rf = rf_classifier.predict(X_test)

# Share of held-out words that were labelled correctly
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {accuracy_rf}")

Accuracy: 0.8597883597883598


In [None]:
# Look at the random forest's predicted labels for the test split (1 = verb, 0 = not a verb)
print(y_pred_rf)

[0 0 0 ... 0 0 0]


First attempt:

```
# Accuracy: 0.8498677248677249
```
Second attempt:

```
# Accuracy: 0.8544973544973545
```
Third attempt:

```
# Accuracy: 0.8571428571428571
```

In [None]:
# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

# Build the GradientBoostingClassifier and train it on the NTU training split.
# The number of estimators, learning rate and tree depth are the settings to tune;
# the seed is fixed so the result is repeatable.
# (fit returns the fitted estimator, so creation and training are chained)
gb_classifier = GradientBoostingClassifier(
  random_state=42,
  max_depth=3,
  learning_rate=0.1,
  n_estimators=100,
).fit(X_train, y_train)

# Predict verb / non-verb for the held-out words
y_pred_gb = gb_classifier.predict(X_test)

# Share of held-out words that were labelled correctly
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print(f"Accuracy: {accuracy_gb}")

Accuracy: 0.8716931216931217


First attempt:

```
# Accuracy: 0.8333333333333334
```

Second attempt:

```
# Accuracy: 0.8617724867724867
```

Third attempt:

```
# Accuracy: 0.8703703703703703
```
Fourth attempt:

```
# Accuracy: 0.8716931216931217
```

# STEP 4: Running the classifier on the Bible data
In this section, we prepare the Bible data in a similar way to how we prepared the NTU data before. We create a list of feature vectors and run the Gradient Boosting classifier on it. Then we write the list of verbs in the Bible data to a text file.

In [None]:
# Make a list of XML files for the Bible data.
# glob returns paths in arbitrary order, so sort them to process the files
# (and write the predicted verbs) in the same order on every run.
bible_files = sorted(glob.glob("/content/drive/MyDrive/Part1/Bible/*"))

# Fail with a clear message (instead of a bare IndexError) if the folder is missing or empty
if not bible_files:
  raise FileNotFoundError("No Bible XML files found in /content/drive/MyDrive/Part1/Bible/")

# Show the first path as a sanity check
print(bible_files[0])

/content/drive/MyDrive/Part1/Bible/bible_B01___01_Matthew_____.xml


In [None]:
# Build bible_word_list in the same nested layout as the NTU word_list:
# bible_word_list = [[[words in file 1 sentence 1], [words in file 1 sentence 2], ...], ...]
bible_word_list = []

for biblefile in bible_files:

  # Read in and parse the XML file
  tree = ET.parse(biblefile)
  root = tree.getroot()
  bible_file_words = []

  # The bible data is not split up by sentences, but by group of sentences
  # So we need to split each group into sentences
  # And then split those sentences into words
  for S in root.iter('S'):

    # Get the standardized text of the group; skip groups that have none
    standard_form = S.find("FORM[@kindOf='standard']")
    if standard_form is None or not standard_form.text:
      continue

    # Split on sentence-final punctuation to separate the sentences
    for sent in re.split(r"[.!?]", standard_form.text):

      # Every sentence gets its own, new list of words. Reusing one list for the whole
      # group appended that same, ever-growing list once per sentence, which merged the
      # group's sentences together and stored the merged list several times over.
      bible_sent_words = sent.split()

      # The text after the last punctuation mark is usually empty; it is not a sentence
      if bible_sent_words:
        bible_file_words.append(bible_sent_words)

  # Add the file's sentences to the corpus-wide list
  bible_word_list.append(bible_file_words)

In [None]:
# Number of Bible files
print(len(bible_word_list))

# First sentence of the first file, then the first word of that sentence
opening_sentence = bible_word_list[0][0]
print(opening_sentence)
print(opening_sentence[0])

# Flatten the nested lists into one list holding every word token of every file
bible_word_list_joined = [
  token
  for file_sentences in bible_word_list
  for sentence_tokens in file_sentences
  for token in sentence_tokens
]
print(len(bible_word_list_joined))

# Reduce the tokens to the set of unique words
bible_unique_word = set(bible_word_list_joined)
print(bible_unique_word)
print(len(bible_unique_word))

260
['mahaenay', 'ku', 'kadadu^du', 'nu', "laluma'an", 'ni', 'yis', 'u', 'tluc', 'ni', 'apraham', 'u', 'tluc', 'ni', 'tafiti', 'u', 'kadadu^du', 'nu', "tatu'asan", 'ni', 'yis', 'kristu']
mahaenay
734116
{"mamatayi'", 'masasikahmek', 'sapipatayananay', "tna'", 'kaniharan', "'utuc", 'silasan', 'mamu', 'sapipadamaan', 'mapalifetay', 'mapaluwad', 'nialaan', 'sapipa^kel', "'adipel", 'sakaciulah', 'papalusiyang', 'mintang', 'hayda', "nipasifana'an", 'palasawaden', 'piasip', 'nipikurac', 'sapilitmuhaw', 'pasuwasuwalen', 'ngitangiten', 'mihaydaay', 'sakatilu', 'umah', "mingitu'ay", 'ciun', "ka'amis", 'pelpelen', 'sapitdal', 'pakapaculi', 'akaykus', 'ccay', 'halupatay', 'stifanuan', "mapa'icel", 'mitapalay', "sapipa'uripaw", 'puntus', 'pisurar', 'hacul', 'kaludateng', 'mihufucay', 'tatudungen', 'macilas', "mamata'mud", 'malacaay', 'satahaf', 'mikarkar', 'matanengaytu', 'nikalumuwadan', "mapakafana'", '300', 'maluwama', 'sakalcad', 'timutiaw', "nipisanga'", 'pararaw', 'kadkaden', 'pakilacen', 'm

In [None]:
# Extend the dictionaries that map each word to an ID, and vice versa
# These should include the words in the original dictionaries (from NTU) plus the words in the bible
# We can use this to go back and forth between IDs and words

# Work on copies. Plain assignment would make the "all_" names point at the very same
# dictionaries, so the NTU mappings would be changed as well and the comparison below
# could never detect a difference.
all_word_to_idx = dict(word_to_idx)
all_idx_to_word = dict(idx_to_word)

# NTU words use IDs 1..offset, so new Bible words are numbered from offset + 1
offset = len(unique_word)

# Number the Bible-only words in sorted order: set iteration order changes between
# kernel sessions, which would give the words different IDs on each run.
# key=str keeps the sort from failing if a word without text (None) is in the set.
bible_only_words = sorted((w for w in bible_unique_word if w not in unique_word), key=str)
for idx, word in enumerate(bible_only_words, start=1):
  all_word_to_idx[word] = idx + offset
  all_idx_to_word[idx + offset] = word

# Both directions must cover every NTU and Bible word exactly once
assert len(all_word_to_idx) == len(all_idx_to_word) == len(unique_word | bible_unique_word)


# Check to make sure it worked
# The NTU words must keep their IDs in the extended dictionaries
for i in range(1,6):
  print(i, idx_to_word[i], word_to_idx[idx_to_word[i]])
  print(i, all_idx_to_word[i], all_word_to_idx[all_idx_to_word[i]])


# IDs just below the boundary between NTU and Bible-only words
for i in range(offset-6,offset):
  print(i, idx_to_word[i], word_to_idx[idx_to_word[i]])
  print(i, all_idx_to_word[i], all_word_to_idx[all_idx_to_word[i]])


# A few IDs that belong to Bible-only words
for i in range(5000,5006):
  print(i, all_idx_to_word[i], all_word_to_idx[all_idx_to_word[i]])

1 itiratu 1
1 itiratu 1
2 mialaay 2
2 mialaay 2
3 luma' 3
3 luma' 3
4 kunu 4
4 kunu 4
5 celi 5
5 celi 5
1449 mipadangtu 1449
1449 mipadangtu 1449
1450 mita'elif 1450
1450 mita'elif 1450
1451 fali 1451
1451 fali 1451
1452 cifahinay 1452
1452 cifahinay 1452
1453 kangaay 1453
1453 kangaay 1453
1454 'min 1454
1454 'min 1454
5000 cikaliwates 5000
5001 talifahal 5001
5002 mipatahka 5002
5003 patnak 5003
5004 sapata^ngad 5004
5005 hiruti 5005


In [None]:
# Now use the extended dictionaries to create feature vectors for the Bible data
# We want the feature vectors to look like this:
# [ID of word, ID of word before, ID of word two before, ID of word after, ID of word two after]
# A position that falls outside the sentence gets ID 0.

# We want a list of lists of vectors with the following format:
# bible_all_vecs = [file_vecs for first file, file_vecs for second file, ...]
# file_vecs = [sent_vecs for first sentence, sent_vecs for second sentence, ...]
# sent_vecs = [vec for first word, vec for second word, ...]
# vec = [ID of word, ID of word before, ID of word two before, ID of word after, ID of word two after]

# No verb labels are built here: for the Bible data they are what the classifier predicts.

bible_all_vecs = []

# Positions, relative to the current word, in the order they appear in a vector
bible_context_offsets = (0, -1, -2, 1, 2)

# For each file in the corpus
for word_sublist in bible_word_list:
  bible_file_vecs = []

  # For each sentence in the file
  for sentence in word_sublist:
    bible_sent_vecs = []
    n_words = len(sentence)

    # For each word in the sentence
    for i in range(n_words):

      # IDs of the word and of its neighbours; 0 where the neighbour does not exist
      vec = [
        all_word_to_idx[sentence[i + offset_in_sentence]] if 0 <= i + offset_in_sentence < n_words else 0
        for offset_in_sentence in bible_context_offsets
      ]

      # Append to list of vectors in sentence
      bible_sent_vecs.append(vec)

    # Append to list of sentences in file
    bible_file_vecs.append(bible_sent_vecs)

  # Append to big list of vectors
  bible_all_vecs.append(bible_file_vecs)

In [None]:
# Inspect the Bible feature vectors

# All vectors of the first file, then the number of files
first_file_vecs = bible_all_vecs[0]
print(first_file_vecs)
print(len(bible_all_vecs))

# Vectors of the first sentence in the first file, then its number of words
first_sentence_vecs = first_file_vecs[0]
print(first_sentence_vecs)
print(len(first_sentence_vecs))

[[[2572, 0, 0, 1114, 6763], [1114, 2572, 0, 6763, 1005], [6763, 1114, 2572, 1005, 7560], [1005, 6763, 1114, 7560, 849], [7560, 1005, 6763, 849, 6887], [849, 7560, 1005, 6887, 1176], [6887, 849, 7560, 1176, 5201], [1176, 6887, 849, 5201, 849], [5201, 1176, 6887, 849, 8894], [849, 5201, 1176, 8894, 1176], [8894, 849, 5201, 1176, 5201], [1176, 8894, 849, 5201, 849], [5201, 1176, 8894, 849, 1699], [849, 5201, 1176, 1699, 1176], [1699, 849, 5201, 1176, 6763], [1176, 1699, 849, 6763, 1005], [6763, 1176, 1699, 1005, 8213], [1005, 6763, 1176, 8213, 849], [8213, 1005, 6763, 849, 6887], [849, 8213, 1005, 6887, 6914], [6887, 849, 8213, 6914, 0], [6914, 6887, 849, 0, 0]], [[2572, 0, 0, 1114, 6763], [1114, 2572, 0, 6763, 1005], [6763, 1114, 2572, 1005, 7560], [1005, 6763, 1114, 7560, 849], [7560, 1005, 6763, 849, 6887], [849, 7560, 1005, 6887, 1176], [6887, 849, 7560, 1176, 5201], [1176, 6887, 849, 5201, 849], [5201, 1176, 6887, 849, 8894], [849, 5201, 1176, 8894, 1176], [8894, 849, 5201, 1176, 520

In [None]:
# Flatten bible_all_vecs (file -> sentence -> word) into one array with a row per word.
# The rows are collected in a Python list and converted once: numpy.append returns a
# new copy of the whole array on every call, so appending row by row inside the loop
# costs time quadratic in the number of words, and the Bible has several hundred
# thousand of them.
# reshape(-1, 5) keeps the array two-dimensional even if there are no rows.
bible_np_vecs = numpy.array(
  [vec for file_vecs in bible_all_vecs for sent_vecs in file_vecs for vec in sent_vecs],
  dtype=int,
).reshape(-1, 5)
print(bible_np_vecs)

# NOTE: this rebinds X, which held the NTU feature matrix until now. The cells below read
# the Bible features from X; re-run the data preparation cell before re-splitting NTU data.
X=bible_np_vecs

[[2572    0    0 1114 6763]
 [1114 2572    0 6763 1005]
 [6763 1114 2572 1005 7560]
 ...
 [5444  847  801  618 4649]
 [ 618 5444  847 4649    0]
 [4649  618 5444    0    0]]


In [None]:
# The gradient boosting model had the highest accuracy on the NTU test data
# We will use this model to predict verbs in the Bible data
# X holds the Bible feature vectors at this point (one row per word),
# so gb_bible_pred has one predicted label per Bible word
gb_bible_pred = gb_classifier.predict(X)

In [None]:
# Check to make sure it's not all zeros
# (only the first 1000 predictions are shown)
print(gb_bible_pred[:1000])

[1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1
 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0
 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1
 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0
 0 1 0 0 0 0 0 1 0 0 0 0 

In [None]:
# Collect the Bible words that the classifier labelled as verbs.
# Each row of X starts with the ID of the word it describes, so the first column is
# mapped back to the word; a word appears once for every occurrence labelled as a verb.
bible_verbs = [
  all_idx_to_word[features[0]]
  for features, predicted_verb in zip(X, gb_bible_pred)
  if predicted_verb
]

In [None]:
# Number of Bible word tokens predicted to be verbs (repeated words are counted each time)
print(len(bible_verbs))

94214


In [None]:
# Write the list of potential verbs in the Bible to a text file (comma-separated, repeats included).
# The encoding is given explicitly so the file does not depend on the platform default.
with open("/content/drive/MyDrive/Part2/Verbs/verbs_Bible.txt", "w", encoding="utf-8") as f:
  f.write(",".join(bible_verbs))