This is a rule-based stemmer. The idea is I get some rules from a grammar book and then add them in here.
First, let's get the data from FB.

In [7]:
import os
import xml.etree.ElementTree as ET

corpora_dir = "../FormosanBank/Corpora" # this should be the relative path to your FormosanBank download
FIND_LANG = "pyu" # Puyuma
FIND_GLOTTO = "nanw1237" # nanwang
FIND_DIALECT = "Nanwang" # Nanwang

In [8]:
def get_all_xmls():
# gets all .xml files in corpora_dir.
    all_xmls = []
    for root, dirname, filenames in os.walk(corpora_dir):
        for f in filenames:
            if f.endswith("xml"):
                all_xmls.append(os.path.join(root,f))
    return all_xmls

In [9]:
def get_lang_xmls(file_list, match_lang=FIND_LANG, match_glotto=FIND_GLOTTO, match_dialect=FIND_DIALECT) -> list[str]:
# takes in a list of xml files and finds which ones match our desired language code(s).
    lang_xmls = []
    print(f"Finding xml files with language code {match_lang}, glotto code {match_glotto}, dialect {match_dialect}")
    for filepath in file_list:
        tree = ET.parse(filepath)
        root = tree.getroot()
        if root == None:
            print(f"Unable to parse file: {filepath}")
        # taken from formosanbank validate_xml.py
        lang = root.get("{http://www.w3.org/XML/1998/namespace}lang")
        if not lang:
            # print(f"{filepath} doesn't appear to have a [lang] attrib: {root.attrib}")
            continue
        glottocode = root.get("glottocode")
        dialect = root.get("dialect")
        if lang.lower() == match_lang.lower():
            if not glottocode and not dialect: # If no glotto or dialect, but language matches, add it
                # print(f"glotto: {glottocode} | dialect: {dialect} | file: {' '.join(filepath.split('/')[-5:])}")
                # we assume that just the language code is enough
                lang_xmls.append(filepath)
            else:
                # If glottocode or dialect match, add it
                if (glottocode and match_glotto and glottocode.lower() == match_glotto.lower()) or (dialect and match_dialect and dialect.lower() == match_dialect.lower()):
                        lang_xmls.append(filepath)
    # print(f"Found language codes: {str(list(set(found_langs)))}")
    # print(f"Found dialects of {match_lang}: {str(list(set(found_dialects)))}")
    print(f"Found {len(lang_xmls)} matching xml files")
    if len(lang_xmls) < 6:
        for x in lang_xmls:
            print('\t '.join(x.split('/')[3:]))
    return lang_xmls

In [10]:
def get_sent_list(root) -> list[str]:
# takes in an xml root, finds all 'sentence' elements, and returns a list of the 'form (standard)' element's text
    sents = root.findall(".//S")
    texts = []
    for s in sents:
        form_children = []
        for child in s:
            if child.tag == "FORM":
                form_children.append(child)
            # there is 'standard' and 'original' forms
            if len(form_children) == 1:
                texts.append(form_children[0].text)
            else:
                for child in form_children:
                    kind = child.get("kindOf")
                    if kind == "standard":
                        texts.append(child.text)
    return texts

In [11]:
all_xmls = get_all_xmls()
print(len(all_xmls))

17108


In [12]:
lang_xmls = get_lang_xmls(all_xmls)

Finding xml files with language code pyu, glotto code nanw1237, dialect Nanwang
Found 18 matching xml files


In [13]:
sent_list = []
for x in lang_xmls:
    root = ET.parse(x).getroot()
    x_list = get_sent_list(root)
    sent_list += x_list
print(len(sent_list))
# print(len(list(Set(sent_list))))
# print(len(sent_list.Set()))
print(len(set(sent_list)))

84351
27078


The difference in the pure list of sentences and the 'set' of sentences is likely due to many dictionary definitions (i.e., single-word sentences) being included across various dictionaries. Also, as mentioned in another notebook, ILRDF has some sentences repeated across different learning units.

In [34]:
corpus = []
bad_sents = []
for s in sent_list:
    if not s:
        bad_sents.append(s)
        continue
    words = s.split()
    for w in words:
        w_clean = w.strip(' ,.!"`~![](){}|/\\<>#$@%^&*_-=+').lower()
        if w_clean != '':
            corpus.append(w_clean)
print(len(corpus))
print(f"Found {str(len(bad_sents))} non-sentences (blank) out of {str(len(sent_list))}")

623347
Found 48 non-sentences (blank) out of 84351


In [35]:
all_words = set(corpus) # our 'dict' of words we've seen
print(len(all_words))

18622


Up until now our methodology has simply been 'reading in the corpus', which isn't very exciting, and is the exact same for any of the language tools we want to make. Next, we will load in the `rules.json` file for stemming rules.

In [36]:
import json
rules_file = "rules.json"

with open(rules_file, 'r') as f:
    rule_data = json.load(f)

for top_level in rule_data:
    print(top_level)

simple affix
compound affix
reduplication


At the highest level, we have simple affixes, compound affixes, and reduplication. Thankfully these are well-organized, so they're not too hard to parse. Let's start with the simple affixes.

In [37]:
for simp_rule in rule_data['simple affix']:
    print(simp_rule)

infix
prefix
suffix


In [38]:
infix_orths = []
for infix_rule in rule_data['simple affix']['infix']:
    infix_orths.append(infix_rule['orthography'].strip('-'))
print(infix_orths)

['in', 'em', 'en', 'um', 'un', 'im']


If need be we can print out the 'rule' part of the infix_rule object, but it looks like we have a series of infixes, and they all occur after the first consonant. Let's write a simple program to check through our 'dictionary' `all_words` and find any words that fit this pattern.

In [39]:
consonant_list = 'bcdfghjklmnpqrstvwxz?'
candidate_words = []
for w in all_words:
    if (w[0] in consonant_list and w[1:3] in infix_orths):
        # print(w)
        candidate_words.append(w)
print(len(candidate_words))

2242


In [29]:
candidate_stem_dict = {}
for cand in candidate_words:
    stem = cand[0] + cand[3:]
    if stem in all_words:
        if stem not in candidate_stem_dict:
            candidate_stem_dict[stem] = []
        candidate_stem_dict[stem].append(cand)
print(len(candidate_stem_dict))

674


In [49]:
i = 0
for stem in candidate_stem_dict:
    print(f"Candidate words for stem: \n\t{stem}")
    print(f"\t{candidate_stem_dict[stem]}")
    print("")
    i += 1
    if i == 5:
        break

Candidate words for stem: 
	kakeris
	['kemakeris']

Candidate words for stem: 
	ketket
	['kemetket', 'kinetket']

Candidate words for stem: 
	mao
	['menao']

Candidate words for stem: 
	patedrelr
	['pinatedrelr', 'penatedrelr']

Candidate words for stem: 
	balruk
	['benalruk']

