### Load training dataset(s)

In [1]:
import os
import csv
import emoji
import string
import random
import json
import pickle
import xml.etree.ElementTree as et
from collections import defaultdict
from tokenize_GSW import tokenize
from normalizerFunctions import Training_Corpus


In [2]:
# get ArchiMob data
def parse_ArchiMob(folder_path):
    """Parse the ArchiMob TEI/XML transcripts stored in ``folder_path``.

    Every ``<u>`` element becomes one utterance and every ``<w>`` element inside
    it contributes a ``(word, norm)`` pair, where ``norm`` is the value of the
    element's ``normalised`` attribute (``None`` when the attribute is missing).

    Args:
        folder_path: Directory holding the XML files. A trailing path
            separator is optional.

    Returns:
        A list of utterances, each a list of ``(word, norm)`` tuples.
        Utterances without any usable word are kept as empty lists.
    """
    tei_ns = '{http://www.tei-c.org/ns/1.0}'
    skipped_files = ("Metadata.txt", "person_file.xml")   # not transcripts
    utterances = []
    # sorted() makes the utterance order independent of the file system's listing order
    for xml_file_name in sorted(os.listdir(folder_path)):
        if xml_file_name in skipped_files:
            continue
        # os.path.join works whether or not folder_path ends with a separator
        tree = et.parse(os.path.join(folder_path, xml_file_name))
        root = tree.getroot()
        for u_element in root.iter(tei_ns + 'u'):
            utterance = []
            for w_element in u_element.iter(tei_ns + 'w'):
                word = w_element.text
                if not word:
                    continue
                if word.endswith("***"):      # ignores individuals' names hidden with asterisks in the ArchiMob corpus
                    continue
                utterance.append((word, w_element.get('normalised')))
            utterances.append(utterance)
    return utterances
    
# read the WUS dataset
def parse_WUS(folder_path):
    """Parse the tab-separated WUS files found in ``folder_path``.

    Each row is expected to hold a word in the first column and its
    normalization in the second. Leading/trailing ASCII punctuation is stripped
    from both. A row whose word is empty after stripping (e.g. a bare
    punctuation mark) closes the current sentence.

    Args:
        folder_path: Directory containing the WUS files; every file in it is read.

    Returns:
        A list of sentences, each a list of ``(word, norm)`` tuples.
    """
    sentences = []
    # sorted() makes the sentence order independent of the file system's listing order
    for filename in sorted(os.listdir(folder_path)):
        # newline="" is what the csv module documents for files handed to csv.reader
        with open(os.path.join(folder_path, filename), "r", encoding="utf-8", newline="") as file:
            reader = csv.reader(file, delimiter="\t")
            sent = []
            for row in reader:
                if len(row) < 2:        # blank line or row without a normalization column
                    continue
                word, norm = row[0], row[1]
                # only the first character is tested; guard against an empty first column
                if word and emoji.is_emoji(word[0]):
                    continue
                word = word.strip(string.punctuation)
                norm = norm.strip(string.punctuation)
                if word:
                    word = word.lstrip()
                    if " " in word:     # handles many-to-one normalizations
                        word = word.replace(" ", "-")
                    sent.append((word, norm))
                else:
                    sentences.append(sent)
                    sent = []
            # keep the tokens that follow the last sentence boundary of the file
            if sent:
                sentences.append(sent)
    return sentences

# expand the corpus with additional normalizations from the Bilingual Lexicon
def parse_Bilexicon(file_path):
    """Read word/normalization pairs from the bilingual lexicon CSV.

    The first row is treated as a header and skipped. For every other row the
    second column is the word and the third column its normalization; ASCII
    punctuation is stripped from both ends of each. When a word occurs more
    than once, the last normalization wins.

    Args:
        file_path: Path to the comma-separated lexicon file.

    Returns:
        A list holding exactly one element: the list of all ``(word, norm)``
        tuples, in order of first appearance of each word.
    """
    norms = {}
    # NOTE(review): UTF-8 is assumed here to match parse_WUS -- confirm for this file.
    # newline="" is what the csv module documents for files handed to csv.reader.
    with open(file_path, 'r', encoding="utf-8", newline="") as file:
        csv_reader = csv.reader(file)
        next(csv_reader, None)      # skip the header; tolerate an empty file
        for row in csv_reader:
            if len(row) < 3:        # blank or truncated row
                continue
            key = row[1].strip(string.punctuation)
            value = row[2].strip(string.punctuation)
            norms[key] = value
    # materialize the pairs: a dict_items view cannot be pickled
    return [list(norms.items())]

In [3]:
def most_frequent_replacement(dictionary, word):
    """Pick the most frequent normalization candidate for ``word``.

    Args:
        dictionary: Mapping from candidate normalization to its frequency.
        word: The original word form; returned unchanged when there are no
            candidates.

    Returns:
        The candidate with the highest frequency. When several candidates
        share the highest frequency, one of them is chosen at random.
    """
    if not dictionary:
        # no candidate known for this word: fall back to the word itself
        # instead of letting max() raise ValueError on an empty sequence
        return word
    max_value = max(dictionary.values())
    max_value_keys = [k for k, v in dictionary.items() if v == max_value]
    return random.choice(max_value_keys)

def leave_as_is(dictionary, word):
    """Baseline method: hand the word back untouched.

    ``dictionary`` is accepted only so the signature matches the other
    normalization methods; it is never read.
    """
    unchanged = word
    return unchanged


def get_norms_and_error_rate(list_of_utterances, counts, method):
    """Normalize every word with ``method`` and compute the error reduction rate.

    NOTE: this name is defined a second time further down in the notebook, with
    a ``method`` that receives only the candidate counts. Once that later cell
    has run, it replaces this definition.

    Args:
        list_of_utterances: Iterable of utterances, each an iterable of
            ``(word, norm)`` pairs.
        counts: Mapping indexed by word; ``counts[word]`` is handed to ``method``.
        method: Callable ``method(counts[word], word)`` returning the predicted
            normalization.

    Returns:
        ``(normed_utterances, Err_Red_rate)``: the predictions grouped like the
        input, and the error reduction rate as a fraction. The rate is ``0.0``
        for an empty corpus and ``nan`` when every word already equals its
        normalization (there is no error left to reduce).

    Side effects:
        Prints the accuracy and the share of words already equal to their
        normalization, both as percentages.
    """
    normed_utterances = []
    total = 0
    hits = 0
    words_unnormed = 0
    for u in list_of_utterances:
        normed_utterance = []
        for (word, norm) in u:
            if word == norm:
                words_unnormed += 1
            prediction = method(counts[word], word)
            normed_utterance.append(prediction)
            if prediction == norm:
                hits += 1
            total += 1
        normed_utterances.append(normed_utterance)
    # an empty corpus has no tokens to score: report 0 % instead of dividing by zero
    accuracy = 100*hits/total if total else 0.0
    unnormed = 100*words_unnormed/total if total else 0.0
    print(accuracy)
    print(unnormed)
    # Rob van der Goot. 2019b. Normalization and Parsing Algorithms for Uncertain Input. Ph.D. thesis, University of Groningen.
    remaining = 100 - unnormed   # share of words that actually need normalizing (percent)
    # accuracy and unnormed are percentages; the resulting rate is a fraction
    Err_Red_rate = (accuracy - unnormed)/remaining if remaining else float("nan")
    return normed_utterances, Err_Red_rate

In [5]:
# Locations of the three training resources (relative paths):
# two corpus folders and the bilingual lexicon file.
archimob = 'Archimob_Release_2/'
wus = "WUS/"
bilexicon = 'bilexicon.csv'

In [6]:
# Build one Training_Corpus per resource, pairing each location with its parser.
# NOTE(review): Training_Corpus comes from normalizerFunctions; presumably it calls the
# parser on the location -- later cells read its `norm_dict` and `word_norm_pairs`.
c_wus = Training_Corpus(wus, parse_WUS)
c_archimob = Training_Corpus(archimob, parse_ArchiMob)
c_bilexicon = Training_Corpus(bilexicon, parse_Bilexicon)

In [7]:
# Size summary for each corpus: token count, distinct word forms, distinct normalizations.
corpora = [c_wus, c_archimob, c_bilexicon]
for c in corpora:
    print("total tokens: ", sum(sum(freqs.values()) for freqs in c.norm_dict.values()))
    print("total unique wordforms: ", len(c.norm_dict))
    # every normalization that occurs for at least one word form
    labels = {norm for freqs in c.norm_dict.values() for norm in freqs}
    print("total normalized forms: ", len(labels))
    print(" ")

total tokens:  93674
total unique wordforms:  9052
total normalized forms:  6907
 
total tokens:  581466
total unique wordforms:  48539
total normalized forms:  31696
 
total tokens:  137051
total unique wordforms:  137051
total normalized forms:  94896
 


In [8]:
# define functions to calculate the error reduction rate of a normalization method 

def get_key_with_highest_value(dictionary):
    """Return the key whose value is largest.

    When several keys share the largest value, one of them is drawn at random.
    Raises ``ValueError`` when ``dictionary`` is empty.
    """
    top_count = max(dictionary.values())
    tied_keys = []
    for key, count in dictionary.items():
        if count == top_count:
            tied_keys.append(key)
    return random.choice(tied_keys)

def get_norms_and_error_rate(list_of_utterances, counts, method):
    """Normalize every word with ``method`` and compute the error reduction rate.

    Args:
        list_of_utterances: Iterable of utterances, each an iterable of
            ``(word, norm)`` pairs.
        counts: Mapping indexed by word; ``counts[word]`` is handed to ``method``.
        method: Callable ``method(counts[word])`` returning the predicted
            normalization.

    Returns:
        ``(normed_utterances, Err_Red_rate)``: the predictions grouped like the
        input, and the error reduction rate as a fraction. The rate is ``0.0``
        for an empty corpus and ``nan`` when every word already equals its
        normalization (there is no error left to reduce).

    Side effects:
        Prints the accuracy and the share of words already equal to their
        normalization, both as percentages.
    """
    normed_utterances = []
    total = 0
    hits = 0
    words_unnormed = 0
    for u in list_of_utterances:
        normed_utterance = []
        for (word, norm) in u:
            if word == norm:
                words_unnormed += 1
            prediction = method(counts[word])
            normed_utterance.append(prediction)
            if prediction == norm:
                hits += 1
            total += 1
        normed_utterances.append(normed_utterance)
    # an empty corpus has no tokens to score: report 0 % instead of dividing by zero
    accuracy = 100*hits/total if total else 0.0
    unnormed = 100*words_unnormed/total if total else 0.0
    print('MFR Corpus accuracy: \t',accuracy)
    print('MFR Proportion left: \t',unnormed)
    # Rob van der Goot. 2019b. Normalization and Parsing Algorithms for Uncertain Input. Ph.D. thesis, University of Groningen.
    remaining = 100 - unnormed   # share of words that actually need normalizing (percent)
    # accuracy and unnormed are percentages; the resulting rate is a fraction
    Err_Red_rate = (accuracy - unnormed)/remaining if remaining else float("nan")
    return normed_utterances, Err_Red_rate

In [9]:
# Score the most-frequent-replacement (MFR) baseline on ArchiMob.
# NOTE(review): the evaluated pairs and the counts come from the same corpus object,
# so this looks like a score on the training data itself -- confirm that is intended.
archiMob_MFR_norms, archiMob_ERR_MFR = get_norms_and_error_rate(c_archimob.word_norm_pairs, c_archimob.norm_dict, get_key_with_highest_value)
print('ArchiMob ERR with MFR: \t',archiMob_ERR_MFR)

MFR Corpus accuracy: 	 91.54413155713318
MFR Proportion left: 	 24.67590538397774
ArchiMob ERR with MFR: 	 0.8877401914225178


In [10]:
# Score the most-frequent-replacement (MFR) baseline on WUS.
# NOTE(review): the evaluated pairs and the counts come from the same corpus object,
# so this looks like a score on the training data itself -- confirm that is intended.
wus_MFR_norms, wus_ERR_MFR = get_norms_and_error_rate(c_wus.word_norm_pairs, c_wus.norm_dict, get_key_with_highest_value)
print('WUS ERR with MFR: \t',wus_ERR_MFR)

MFR Corpus accuracy: 	 86.1274206289899
MFR Proportion left: 	 35.17304695006085
WUS ERR with MFR: 	 0.7860059941376016


In [11]:
def join_corpora(list_of_corpora):
    """Concatenate the utterances of several corpora into one list.

    A corpus whose ``word_norm_pairs`` holds exactly one utterance (this is how
    the bilexicon arrives) is split up: each of its truthy pairs becomes an
    utterance of its own. All other corpora contribute their utterances as is.

    Args:
        list_of_corpora: Iterable of objects exposing ``word_norm_pairs``.

    Returns:
        A single list of utterances, in the order the corpora were given.
    """
    merged = []
    for corpus in list_of_corpora:
        utterances = corpus.word_norm_pairs
        if len(utterances) != 1:
            merged.extend(utterances)
            continue
        # single-utterance corpus (the bilexicon): one utterance per pair
        sole_utterance = list(utterances)[0]
        for pair in sole_utterance:
            if pair:
                merged.append([pair])
    return merged

In [12]:
# Merge all corpora into one training corpus and report its size.
norm_corpus = Training_Corpus(corpora, join_corpora)
print("total tokens: ", sum(sum(freqs.values()) for freqs in norm_corpus.norm_dict.values()))
print("total unique wordforms: ", len(norm_corpus.norm_dict))
# every normalization that occurs for at least one word form
labels = {norm for freqs in norm_corpus.norm_dict.values() for norm in freqs}
print("total normalized forms: ", len(labels))

total tokens:  812191
total unique wordforms:  181744
total normalized forms:  121603


In [13]:
# Score the most-frequent-replacement (MFR) baseline on the joined corpus.
# NOTE(review): the evaluated pairs and the counts come from the same corpus object,
# so this looks like a score on the training data itself -- confirm that is intended.
total_MFR_norms, total_ERR_MFR = get_norms_and_error_rate(norm_corpus.word_norm_pairs, norm_corpus.norm_dict, get_key_with_highest_value)
print('Joined ERR with MFR: \t',total_ERR_MFR)

MFR Corpus accuracy: 	 90.65478440416109
MFR Proportion left: 	 21.726785940745465
Joined ERR with MFR: 	 0.8806077441924849


In [14]:
# Persist the two single-source corpora and the joined training corpus.
corpora_to_pickle = {
    "pickled_wus.pkl": c_wus,
    "pickled_archimob.pkl": c_archimob,
    'pickled_train_corpus.pkl': norm_corpus,
}
for filename, corpus in corpora_to_pickle.items():
    with open(filename, "wb") as file:
        pickle.dump(corpus, file)