In [1]:
import re
from collections import defaultdict, Counter
import regex as re
from itertools import chain
import pycrfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from tokenizers import normalizers
from tokenizers.normalizers import NFKD, StripAccents
from transformers import AutoTokenizer



In [2]:
# def accent_normalize(string):
#     for char in accents_dict:
#         string = re.sub(char, accents_dict[char],string)
#     return string

# normalizer = normalizers.Sequence([NFKD(),StripAccents()])
# normalize = lambda x: re.sub("«|»",'"',normalizer.normalize_str(x))


# gloss_pipeline = lambda gloss: tok(tag_gloss(gloss.split("-")))
# morph_pipeline = lambda morph: list(morph)
# word_pipeline = lambda words: list(words.lower())


In [95]:

s="win-na k'uxw gya'a-s Man Jacob Brown ii n hox̲ gidax̲-t nee-m da'ak̲hlxw dim-ma mehl-i-hl wila wil-hl betl'a betl' loo-'m-aa"
def map_accents(gits_sent):
    accent_map = {
    "x̲" : "χ",
    "g̲" : "ɢ",
    "k̲" : "q",    
    }
    
    for i in accent_map:
        gits_sent = re.sub(i, accent_map[i], gits_sent) 
    return gits_sent

In [149]:
# BIES tags
BEGIN = "B"
INSIDE = "I"
END = "E" 
SINGLE = "S"

## Language to code, for reading in data
langs_codes = {
        "Gitksan" : "git", 
        "Arapaho" : "arp", 
        "Lezgi" : "lez", 
        "Nyangbo" : "nyb", 
        "Tsez" : "ddo",
        "Uspanteko" : "usp"
    }  

## helper function for BIES tagging ####
def get_all_tags(segments):
    '''Get BIE tags for list of segments'''
    
    def get_tags(morph):
        fidx = len(morph) - 1
        bidx = 0
        tagged_morph = []

        for idx, char in enumerate(morph):
            if fidx == bidx:
                tag = SINGLE
            elif idx == bidx:
                tag = BEGIN
            elif idx == fidx:
                tag = END
            else:
                tag = INSIDE

            tagged_morph.append((char, tag))

        return tagged_morph

    all_tagged = []
    for morpheme in segments:
        tagged_morph = get_tags(morpheme)
        for char, tag in tagged_morph:
            all_tagged.append([char, tag])
    return all_tagged

def stem2tag(gloss):
    for i, c in enumerate(gloss):
        try:
            if not c.isupper() and not c[0].isdigit():
                gloss[i] = "stem"
        except IndexError:
            continue
    return gloss

### LangIGT class: initialize with language name ###

class LangIGT:
    
    def __init__(self, language, track = "two"):
        self.language = language
        self.langdict = self.get_langdict()
        
        self.train_tagged = self.get_biesgt("train")
        self.dev_tagged = self.get_biesgt("dev")
        
        self.lemma2gloss = self.get_lemma2gloss()
        
        
    def get_langdict(self):
        
        ############ Preprocessing - needs language-specific finetuning!!!##########
        
        normalizer = normalizers.Sequence([NFKD(), StripAccents()])
        normalize = lambda x: re.sub("«|»",'"',normalizer.normalize_str(x))
        ld = {
            "train": defaultdict(list),
            "dev": defaultdict(list)
        }

        path = f"data/{self.language}/"
        train_fn = f"{langs_codes[self.language]}-train-track2-uncovered"
        dev_fn = f"{langs_codes[self.language]}-dev-track2-uncovered"

        for fp in [train_fn, dev_fn]:
            data_type = fp.split("-")[1]
            with open(path + fp, "r") as f:
                for line in f.readlines():


                    if line.startswith("\\t"):
                        ld[data_type]["transcription"].append(normalize(line.lstrip("\\t ").rstrip("\n")))
                    if line.startswith("\g"):
                        ld[data_type]["gloss"].append(normalize(line.lstrip("\\g ").rstrip("\n")))
                    if line.startswith("\m"):
                        ld[data_type]["morphemes"].append(normalize(map_accents((line.lstrip("\\m ").rstrip("\n")))))
                    if line.startswith("\l"):
                        ld[data_type]["translation"].append(normalize(line.lstrip("\\l ").rstrip("\n")))
                    if line.startswith("\p"):
                        ld[data_type]["pos"].append(normalize(line.lstrip("\\p ").rstrip("\n")))

        return ld
    
    def get_morphsegs(self,split):
        morphsegs = []
        
        morphsplits=[[i for i in x.split()] for x in self.langdict[split]["morphemes"]]
        glosssplits =[[stem2tag(i.split("-")) for i in x.split()] for x in self.langdict[split]["gloss"]]
        

        for m,g in zip(morphsplits,glosssplits):
            for n, _ in enumerate(m):

                morphsegs.append((m[n].split("-"), g[n]))
        return morphsegs
    
    def get_biesgt(self, split):
        
        morphsegs = self.get_morphsegs(split)
        
        bios=[]
        sid=0

        for i in morphsegs:
            word_bios = []
            ms, gt = i

            charbies = get_all_tags(ms)

            for i in range(len(ms)):
                try:

                    msl = len(ms[i])
                    gtms = gt[i]
                except IndexError:
                    continue

                #bies_and_glosstag = [c + [gtms] for c in charbies[sid:sid+msl]]
                for c in charbies[sid:sid+msl]:
                    word_bios.append(c + [gtms])
                sid+= msl
            bios.append(word_bios)

            sid=0
        return bios
    
    def biesgt_formatted(self,split):
        biotags = self.get_biesgt(split)
        biolabelled = []
        for i,_ in enumerate(biotags):
            word_biolabelled = []
            for bt in biotags[i]:
                tagged = [bt[0], bt[1] + "-" + bt[2]]
                word_biolabelled.append(tagged)
            biolabelled.append(word_biolabelled)
        return biolabelled
    
    def get_lemma2gloss(self):
        lemma2gloss = defaultdict(Counter)

        for mline, gline in zip(self.langdict["train"]["morphemes"],self.langdict["train"]["gloss"]):
            for msplit, gsplit in zip(mline.split(), gline.split()):
                for mdsplit, gdsplit in zip(msplit.split("-"),gsplit.split("-")):
                    if not gdsplit.isupper():
                        lemma2gloss[mdsplit].update([gdsplit])
        
        lemma2gloss["unk"] = "<unk>" 
        return lemma2gloss
    
    

In [280]:
## Initialize, get data in correct format for CRF ###
## Language: Lezgi

usp = LangIGT("Uspanteko")
# train_tagged = usp.train_tagged
# dev_tagged = usp.dev_tagged


In [281]:

def get_morphsegs(lang,split):
    morphsegs = []

    morphsplits=[[i for i in x.split()] for x in lang.langdict[split]["morphemes"]]
    glosssplits =[[stem2tag(i.split("-")) for i in x.split()] for x in lang.langdict[split]["gloss"]]
    posssplits =[[i.split("-") for i in x.split()] for x in lang.langdict[split]["pos"]]

    for m,g,p in zip(morphsplits,glosssplits,posssplits):
        #print((m,g,p))
        for n, _ in enumerate(m):
            morphsegs.append((m[n].split("-"), g[n], p[n]))
    
    return morphsegs


In [333]:
def get_biesgt(lang, split):

    morphsegs = get_morphsegs(lang,split)

    bios=[]
    sid=0

    for i in morphsegs:
        word_bios = []
        ms, gt, pos = i

        charbies = get_all_tags(ms)

        for i in range(len(ms)):
            try:

                msl = len(ms[i])
                gtms = gt[i]
                poss = pos[i]
            except IndexError:
                continue

            #bies_and_glosstag = [c + [gtms] for c in charbies[sid:sid+msl]]
            for c in charbies[sid:sid+msl]:
                if c == "???":
                    continue
                else:
                    word_bios.append(c + [poss] + [gtms])
            sid+= msl
        bios.append(word_bios)

        sid=0
    return bios


In [334]:
train_tagged = get_biesgt(usp,"train")
dev_tagged = get_biesgt(usp,"dev")

In [336]:
### Feature extraction for CRF ####
BOUNDARY = "<BD>"

def char2featuresusp(example, i):
    return [
        f"char_at_{i} = {example[i][0]}",
        f"bies_at_{i} = {example[i][1]}",
        f"pos_at_{i} = {example[i][2]}",

        f"substring_left={''.join(list(zip(*example[0:i]))[0]) if i > 0 else BOUNDARY}",
        f"substring_right={ ''.join(list(zip(*example[i+1:len(example)]))[0]) if i+1 < len(example) else BOUNDARY}",
        
        f"trigram_right={''.join(list(zip(*example[i:i+3]))[0]) if i+2 < len(example) else BOUNDARY}",
        f"trigram_left={''.join(list(zip(*example[i-2:i+1]))[0]) if i >=2  else BOUNDARY}",
        
        f"bigram_right={''.join(list(zip(*example[i:i+2]))[0]) if i+1 < len(example) else BOUNDARY}",
        f"bigram_left={''.join(list(zip(*example[i-1:i+1]))[0]) if i >0  else BOUNDARY}",
        
        f"right_dist={len(example)-1-i}",
        f"left_dist={i}",
        
        f"is_alpha={example[i][0].isalpha()}",
        f"left_char={example[i-1][0] if i-1 >= 0 else BOUNDARY}",
        f"left_pos={example[i-1][2] if i-1 >= 0 else BOUNDARY}",
        
        f"right_char={example[i+1][0] if i+1 < len(example) else BOUNDARY}",
        f"right_pos={example[i+1][2] if i+1 < len(example) else BOUNDARY}",
        ]

def data2featuresusp(data):
    """ Extract features for a data set in BIES format. """
    return [[char2featuresusp(example,i) for i in range(len(example))] for example in data]

def data2labelsusp(data):
    """ Extract the tags from a data set in BIES format. """
    return [[tok[3] for tok in example] for example in data]

# # Initialize the training, development and test sets for pycrfsuite.
X_train = data2featuresusp(train_tagged)
y_train = data2labelsusp(train_tagged)

X_dev = data2featuresusp(dev_tagged)
y_dev = data2labelsusp(dev_tagged)

In [199]:
# ### Feature extraction for CRF ####
# BOUNDARY = "<BD>"

# def char2features(example, i):
#     return [
#         f"char_at_{i} = {example[i][0]}",
#         f"bies_at_{i} = {example[i][1]}",
#         f"substring_left={''.join(list(zip(*example[0:i]))[0]) if i > 0 else BOUNDARY}",
#         f"substring_right={ ''.join(list(zip(*example[i+1:len(example)]))[0]) if i+1 < len(example) else BOUNDARY}",
        
#         f"trigram_right={''.join(list(zip(*example[i:i+3]))[0]) if i+2 < len(example) else BOUNDARY}",
#         f"trigram_left={''.join(list(zip(*example[i-2:i+1]))[0]) if i >=2  else BOUNDARY}",
        
#         f"bigram_right={''.join(list(zip(*example[i:i+2]))[0]) if i+1 < len(example) else BOUNDARY}",
#         f"bigram_left={''.join(list(zip(*example[i-1:i+1]))[0]) if i >0  else BOUNDARY}",
        
#         f"right_dist={len(example)-1-i}",
#         f"left_dist={i}",
        
#         f"is_alpha={example[i][0].isalpha()}",
#         f"left_char={example[i-1][0] if i-1 >= 0 else BOUNDARY}",
        
#         f"right_char={example[i+1][0] if i+1 < len(example) else BOUNDARY}",
#         ]

# def data2features(data):
#     """ Extract features for a data set in BIES format. """
#     return [[char2features(example,i) for i in range(len(example))] for example in data]

# def data2labels(data):
#     """ Extract the tags from a data set in BIES format. """
#     return [[tok[2] for tok in example] for example in data]

# # # Initialize the training, development and test sets for pycrfsuite.
# X_train = data2features(train_tagged)
# y_train = data2labels(train_tagged)

# X_dev = data2features(dev_tagged)
# y_dev = data2labels(dev_tagged)

In [339]:
trainer = pycrfsuite.Trainer(verbose=True)

# your code here
for s_X, s_y in zip(X_train, y_train): 
    try:
        trainer.append(s_X, s_y)
    except UnicodeError:
        continue

trainer.set_params({
    "c1": 0.1,
    "c2": 0.1,
    "max_iterations": 150,
    "feature.possible_transitions": True
})

trainer.train("uspanteko_v3.pycrfsuite")

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 61787
Seconds required: 0.532

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 150
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 564501.177432
Feature norm: 1.000000
Error norm: 184189.172428
Active features: 61524
Line search trials: 1
Line search step: 0.000005
Seconds required for this iteration: 2.545

***** Iteration #2 *****
Loss: 517543.533315
Feature norm: 5.470086
Error norm: 131203.502824
Active features: 61326
Line search trials: 3
Line search step: 0.250000
Seconds required for this iteration: 3.628

***** Iteration #3 *****
Loss: 354735.246100
Feature norm: 4.333585
Error norm: 112266.371681
Active features: 59624
Line search trials: 1
Line search step: 1.000000
Seconds requir

In [340]:
glosser = pycrfsuite.Tagger()
glosser.open('uspanteko_v3.pycrfsuite')



<contextlib.closing at 0x130547070>

In [192]:
#[m.split() for m in lez.langdict["dev"]["morphemes"][0]]

In [208]:
#dev_tagged

In [341]:
usp.dev_tagged = dev_tagged

In [313]:
def get_glosspreds_sent(glosser, lang):
    def get_predicted_glosstags(glosser, lang):
        dev_data = lang.dev_tagged
        glossed_data = []
        for i, ex in enumerate(data2features(dev_data)):
            chars = [x[0] for x in dev_data[i]]
            bies = [x[1] for x in dev_data[i]]
            glosses = glosser.tag(ex)
            glossed_data.append([chars,bies,glosses])
        return glossed_data
    
    glossed_data = get_predicted_glosstags(glosser,lang)
    
    sid = 0
    glossed_sents = []
    dev_sent_splits = [m.split() for m in lang.langdict["dev"]["morphemes"]]
    
    for sent in dev_sent_splits:
        sent_len = len(sent)
        glossed_sents.append(glossed_data[sid:sid+sent_len])
        sid += sent_len
    
    return glossed_sents


In [349]:
def get_gloss_chunks(tagged_sent):
    glosschunks = []
    for morph in tagged_sent:
        char, bies, gloss = morph
        morphchunks=[]

        for b, g in zip(bies, gloss):
            if b in {"B","S"}:
                morphchunks.append(g)
                
        glosschunks.append(morphchunks)

    return glosschunks

def convert_to_igt(glosschunk, sentsplit,lang):    
    for gloss, morph in zip(glosschunk, sentsplit):

        for i, _ in enumerate(gloss):

            if gloss[i] == "stem":
                try:
                    gloss[i] = lang.lemma2gloss[morph[i]].most_common()[0][0]
                    
                except IndexError:
                    gloss[i] = lang.lemma2gloss["unk"]
    return " ".join(["-".join(m) for m in glosschunk])
    
def get_igt_preds(glosser,lang):
    langdict = lang.langdict
    sentsplits = [[n.split("-") for n in s.split()] for s in lang.langdict["dev"]["morphemes"]]
    glosspreds = get_glosspreds_sent(glosser, lang)
    glosschunks = [get_gloss_chunks(s) for s in glosspreds]
    igt_preds= [convert_to_igt(glosschunks[i], sentsplits[i],lang) for i,_ in enumerate(glosschunks)]
    
    return igt_preds

In [350]:
l2g["el"].most_common()

[('salir', 133), ('???', 20), ('el', 8), ('sali', 1)]

In [351]:
preds=get_igt_preds(glosser,usp)

In [352]:
def write_preds(preds, lang, model_v):
    with open(f"predictions/{lang}_{model_v}", "w", encoding = "utf-8") as fout:
        fout.write("\n".join(preds))
        

In [322]:
write_preds(usp_igt_preds, "Uspanteko", "1")

In [353]:
preds

['el/ella INC-E3-hacer volver animal',
 'CONJ despues INC-camino INC-camino ??? <unk>',
 '???-???-traer-??? el/ella sin perro sin',
 'solo ???-???-recibir decir',
 'no ???-???-recibir decir ni <unk> nada',
 'cuando ???-llegar <unk> el/ella INC-E3-hacer volver uno meter',
 'dice',
 'CONJ cuando ???-venir INC-E3-hacer volver uno meter',
 'despues',
 '???-???-buscar-ir <unk> <unk>',
 'dice',
 'CONJ ???-ir ??? arbol',
 'CONJ INC-ir-que ??? <unk>',
 'CONJ ???-???-encontrar <unk> <unk>',
 '???-cuenta-correr-SC <unk> <unk>',
 'dice',
 'despues ???-???-agarrar',
 'el/ella ???-no uno-??? modo pues',
 'recibir-<unk> decir uno-??? <unk> ???-<unk>',
 'CONJ solo uno <unk> solo <unk> ???-???-comer',
 'dice',
 'como compadre meter solo <unk> ???-???-comer',
 '???-???-comer uno <unk> <unk>',
 'CONJ despues despues ???-cuenta-cargar-??? compadre <unk>',
 '???-llegar',
 'dice',
 'despues ???-llegar',
 'despues INC-E3-hacer trabajo volver pongamos persona',
 '???-???-recibir-??? ???-<unk>',
 'CONJ ???-ll