In [19]:
import re
import os
from collections import defaultdict
import regex as re

from itertools import chain


import pycrfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics


import pandas as pd


import pycrfsuite
from tokenizers import normalizers
from tokenizers.normalizers import NFKD, StripAccents

In [20]:
def accent_normalize(string):
    """Apply every substitution in the global ``accents_dict`` to ``string``.

    Each key of ``accents_dict`` is used as a regular-expression pattern and
    each value as its replacement; the substitutions are applied one after
    another in the mapping's iteration order.

    NOTE(review): ``accents_dict`` is not defined in the visible part of this
    notebook - confirm it is created before this function is called.
    """
    result = string
    for pattern, replacement in accents_dict.items():
        result = re.sub(pattern, replacement, result)
    return result

# Unicode NFKD decomposition followed by removal of the accent marks.
# NOTE(review): this presumably also folds Cyrillic letters that decompose
# into base + combining mark (e.g. "й", "ё") - confirm that is intended.
normalizer = normalizers.Sequence([NFKD(),StripAccents()])


def normalize(x):
    """Run ``x`` through ``normalizer`` and turn guillemets into plain double quotes."""
    without_accents = normalizer.normalize_str(x)
    return re.sub("«|»",'"',without_accents)
def stem2tag(gloss):
    """Collapse every non-uppercase gloss label to ``"STEM"``.

    Labels for which ``str.isupper()`` is true (e.g. ``"PL"``, ``"1SG"``) are
    kept; everything else is replaced by ``"STEM"``.

    The list is modified in place and the same list object is returned.
    """
    gloss[:] = [label if label.isupper() else "STEM" for label in gloss]
    return gloss

In [21]:
# Language name -> file-name prefix of its data files. The language name is
# also used as the directory name under data/ (see get_langdict).
# "arp" is the code for Arapaho; the key used to read "lezaho", which looks
# like the result of an accidental "Arap" -> "lez" search-and-replace.
langs_codes = {
    "Gitksan": "git",
    "Arapaho": "arp",
    "Lezgi": "lez",
    "Nyangbo": "nyb",
    "Tsez": "ddo",
    "Uspanteko": "usp",
}

# Position of a character inside its morpheme.
BEGIN = "BEGIN"
INSIDE = "INSIDE"
END = "END"
SINGLE = "SINGLE"

# gloss_pipeline = lambda gloss: tok(tag_gloss(gloss.split("-")))
# morph_pipeline = lambda morph: list(morph)
# word_pipeline = lambda words: list(words.lower())


In [22]:
## read in data
def get_langdict(lang):
    r"""Read the train and dev files of one language.

    The files are ``data/{lang}/{code}-train-track2-uncovered`` and
    ``data/{lang}/{code}-dev-track2-uncovered``, where ``code`` is
    ``langs_codes[lang]``.

    Every line starting with one of the field markers below is stripped of
    the marker, of the spaces following it and of the trailing newline, passed
    through ``normalize`` and appended to the matching list:

    * ``\t`` -> ``"transcription"``
    * ``\g`` -> ``"gloss"``
    * ``\m`` -> ``"morphemes"``
    * ``\l`` -> ``"translation"``

    All other lines are ignored.

    Args:
        lang: a key of ``langs_codes`` (also the directory name under ``data/``).

    Returns:
        ``(train, dev)``: two ``defaultdict(list)`` objects keyed by the field
        names above.

    Raises:
        KeyError: if ``lang`` is not in ``langs_codes``.
        OSError: if a data file cannot be opened.
    """
    field_markers = {
        "\\t": "transcription",
        "\\g": "gloss",
        "\\m": "morphemes",
        "\\l": "translation",
    }

    ld = {
        "train": defaultdict(list),
        "dev": defaultdict(list)
    }

    path = f"data/{lang}/"
    code = langs_codes[lang]

    for data_type, fields in ld.items():
        fp = f"{code}-{data_type}-track2-uncovered"
        # The data is non-ASCII, so do not rely on the platform default encoding.
        with open(path + fp, "r", encoding="utf-8") as f:
            for line in f:
                for marker, field in field_markers.items():
                    if line.startswith(marker):
                        # Slice the marker off instead of using str.lstrip:
                        # lstrip("\\t ") removes a *set* of characters and
                        # would also eat leading "t" letters of the text.
                        text = line[len(marker):].lstrip(" ").rstrip("\n")
                        fields[field].append(normalize(text))
                        break

    return ld["train"],ld["dev"]



In [23]:

def get_all_tags(segments):
    '''Tag every character of every segment with its position in the segment.

    A one-character segment gets SINGLE; otherwise the first character gets
    BEGIN, the last one END and all others INSIDE.

    Returns one flat list of [char, tag] pairs covering all segments in order.
    '''

    def position_tag(offset, last):
        # `last` is the index of the final character of the segment.
        if last == 0:
            return SINGLE
        if offset == 0:
            return BEGIN
        if offset == last:
            return END
        return INSIDE

    tagged_chars = []
    for segment in segments:
        last = len(segment) - 1
        tagged_chars.extend(
            [char, position_tag(offset, last)]
            for offset, char in enumerate(segment)
        )
    return tagged_chars

In [24]:
def get_morphsegs(langdict):
    """Pair the morpheme segments of every word with its gloss tags.

    Each entry of ``langdict["morphemes"]`` and ``langdict["gloss"]`` is split
    on whitespace into words and each word on ``"-"`` into segments; the gloss
    segments are additionally passed through ``stem2tag``.

    Returns a flat list with one ``(morpheme_segments, gloss_tags)`` tuple per
    word. Lines are paired with ``zip``, so surplus lines on either side are
    dropped. A gloss line with fewer words than its morpheme line raises
    ``IndexError``.
    """
    morph_lines = [line.split() for line in langdict["morphemes"]]
    gloss_lines = [
        [stem2tag(word.split("-")) for word in line.split()]
        for line in langdict["gloss"]
    ]

    morphsegs = []
    for morph_words, gloss_words in zip(morph_lines, gloss_lines):
        for pos in range(len(morph_words)):
            segments = morph_words[pos].split("-")
            morphsegs.append((segments, gloss_words[pos]))
    return morphsegs

In [25]:
def get_bios_gts(morphsegs):
    """Attach the gloss tag of each morpheme to its position-tagged characters.

    ``morphsegs`` is a list of ``(morpheme_segments, gloss_tags)`` pairs, one
    per word. The result holds, for every word, a list of
    ``[char, position_tag, gloss_tag]`` entries.

    Raises ``IndexError`` when a word has fewer gloss tags than segments.
    """
    bios = []

    for segments, gloss_tags in morphsegs:
        # Flat [char, position_tag] list for the whole word, segment by segment.
        char_tags = get_all_tags(segments)

        labelled_word = []
        start = 0
        for idx, seg in enumerate(segments):
            stop = start + len(seg)
            gloss_tag = gloss_tags[idx]
            # char_tags[start:stop] are exactly the characters of this segment.
            labelled_word.extend(pair + [gloss_tag] for pair in char_tags[start:stop])
            start = stop

        bios.append(labelled_word)

    return bios
        
        

In [26]:
# Load the Lezgi train/dev data, split every word into
# (morpheme segments, gloss tags) and expand that into per-character
# [char, position_tag, gloss_tag] entries, one list per word.
lez_train, lez_dev = get_langdict("Lezgi")
lez_tr_morphsegs = get_morphsegs(lez_train)
lez_dev_morphsegs = get_morphsegs(lez_dev)
lez_tr_biotags = get_bios_gts(lez_tr_morphsegs)
lez_dev_biotags = get_bios_gts(lez_dev_morphsegs)


In [27]:
lez_dev_biotags[0]

[['"', 'SINGLE', 'STEM']]

In [28]:
def get_biolabelled(biotags):
    """Fuse position tag and gloss tag into a single ``"POSITION-GLOSS"`` label.

    ``biotags`` is a list of words, each a list of
    ``[char, position_tag, gloss_tag]`` entries. The result has the same
    nesting with ``[char, "position_tag-gloss_tag"]`` entries.
    """
    return [
        [[entry[0], "-".join((entry[1], entry[2]))] for entry in word]
        for word in biotags
    ]
        

In [29]:
# Per-word lists of [char, "POSITION-GLOSS"] pairs for the train and dev data.
lez_tr_biolabs = get_biolabelled(lez_tr_biotags)
lez_dev_biolabs = get_biolabelled(lez_dev_biotags)

In [30]:
lez_tr_biolabs[0]

[['ф', 'BEGIN-STEM'], ['у', 'END-STEM']]

In [31]:
# Placeholder used when a context window would reach past the word edge.
BOUNDARY="<BD>"

def char2features(example, i):
    """Build the feature strings for the character at position ``i``.

    ``example`` is one word given as a list of ``[char, label]`` pairs; only
    the characters are used. Context features that would extend past either
    end of the word take the value ``BOUNDARY``.

    Note that ``left_bigram`` / ``right_bigram`` hold the single character two
    positions to the left / right, and ``substring_left`` is the three
    characters ending at ``i``.
    """
    size = len(example)
    current = example[i][0]

    def chars(start, stop):
        # Concatenated characters of example[start:stop].
        return ''.join(tok[0] for tok in example[start:stop])

    two_left = i-2 >= 0
    one_left = i-1 >= 0
    one_right = i+1 < size
    two_right = i+2 < size

    substring_left = chars(i-2, i+1) if two_left else BOUNDARY
    substring_right = chars(i, i+2) if one_right else BOUNDARY
    trigram_right = chars(i, i+3) if two_right else BOUNDARY
    left_char = example[i-1][0] if one_left else BOUNDARY
    left_bigram = example[i-2][0] if two_left else BOUNDARY
    right_char = example[i+1][0] if one_right else BOUNDARY
    right_bigram = example[i+2][0] if two_right else BOUNDARY

    return [
        f"char_at_{i} = {current}",
        f"substring_left={substring_left}",
        f"substring_right={substring_right}",
        f"trigram_right={trigram_right}",
        f"left_dist={i}",
        f"right_dist={size-1-i}",
        f"is_alpha={current.isalpha()}",
        f"left_char={left_char}",
        f"left_bigram={left_bigram}",
        f"right_char={right_char}",
        f"right_bigram={right_bigram}",
        ]

def data2features(data):
    """ Build one feature list per character for every example in a BIES data set. """
    feature_seqs = []
    for example in data:
        feature_seqs.append(
            [char2features(example, pos) for pos in range(len(example))]
        )
    return feature_seqs

def data2labels(data):
    """ Collect the label (second item) of every token for every example in a BIES data set. """
    label_seqs = []
    for example in data:
        label_seqs.append([tok[1] for tok in example])
    return label_seqs

# Build the training and development feature/label sequences for pycrfsuite
# (no test set is built here).
X_train = data2features(lez_tr_biolabs)
y_train = data2labels(lez_tr_biolabs)

X_dev = data2features(lez_dev_biolabs)
y_dev = data2labels(lez_dev_biolabs)

In [94]:
# Scratch cell: try out the "bigram_left" feature expression on one training
# word before adding it to char2features.
# NOTE: this leaves the module-level names `example` and `i` behind.
BOUNDARY="<BD>"

example=lez_tr_biolabs[3]
print(example)
i=1
print(example[i][0])
# Character at i-1 plus character at i, or BOUNDARY at the start of the word.
f"bigram_left={''.join(list(zip(*example[i-1:i+1]))[0]) if i >0  else BOUNDARY}"
#f"trigram_right={''.join(list(zip(*example[i:i+3]))[0]) if i+2 < len(example) else BOUNDARY}"

[['г', 'BEGIN-STEM'], ['ь', 'INSIDE-STEM'], ['а', 'INSIDE-STEM'], ['т', 'INSIDE-STEM'], ['у', 'INSIDE-STEM'], ['н', 'END-STEM'], ['н', 'BEGIN-AOC'], ['а', 'END-AOC']]
ь


'bigram_left=гь'

In [95]:


def char2features(example, i):
    """Build the feature strings for the character at position ``i``.

    ``example`` is one word given as a list of ``[char, label]`` pairs; only
    the characters are used. Context features that would extend past either
    end of the word take the value ``BOUNDARY``.

    NOTE(review): ``substring_left`` always equals ``trigram_left`` and
    ``substring_right`` always equals ``bigram_right`` - confirm the
    duplicated features are wanted.
    """
    size = len(example)
    current = example[i][0]

    def chars(start, stop):
        # Concatenated characters of example[start:stop].
        return ''.join(tok[0] for tok in example[start:stop])

    one_right = i+1 < size
    two_right = i+2 < size

    trigram_left = chars(i-2, i+1) if i >= 2 else BOUNDARY
    trigram_right = chars(i, i+3) if two_right else BOUNDARY
    bigram_left = chars(i-1, i+1) if i > 0 else BOUNDARY
    bigram_right = chars(i, i+2) if one_right else BOUNDARY
    left_char = example[i-1][0] if i-1 >= 0 else BOUNDARY
    right_char = example[i+1][0] if one_right else BOUNDARY

    return [
        f"char_at_{i} = {current}",
        f"substring_left={trigram_left}",
        f"substring_right={bigram_right}",

        f"trigram_right={trigram_right}",
        f"trigram_left={trigram_left}",

        f"bigram_right={bigram_right}",
        f"bigram_left={bigram_left}",

        f"right_dist={size-1-i}",
        f"left_dist={i}",

        f"is_alpha={current.isalpha()}",
        f"left_char={left_char}",

        f"right_char={right_char}",
        ]

def data2features(data):
    """ Build one feature list per character for every example in a BIES data set. """
    feature_seqs = []
    for example in data:
        feature_seqs.append(
            [char2features(example, pos) for pos in range(len(example))]
        )
    return feature_seqs

def data2labels(data):
    """ Collect the label (second item) of every token for every example in a BIES data set. """
    label_seqs = []
    for example in data:
        label_seqs.append([tok[1] for tok in example])
    return label_seqs

# Rebuild the training and development feature/label sequences for pycrfsuite
# with the char2features defined in this cell (no test set is built here).
X_train = data2features(lez_tr_biolabs)
y_train = data2labels(lez_tr_biolabs)

X_dev = data2features(lez_dev_biolabs)
y_dev = data2labels(lez_dev_biolabs)

In [98]:
X_train[1]

[['char_at_0 = к',
  'substring_left=<BD>',
  'substring_right=ку',
  'trigram_right=кун',
  'trigram_left=<BD>',
  'bigram_right=ку',
  'bigram_left=<BD>',
  'right_dist=5',
  'left_dist=0',
  'is_alpha=True',
  'left_char=<BD>',
  'right_char=у'],
 ['char_at_1 = у',
  'substring_left=<BD>',
  'substring_right=ун',
  'trigram_right=уни',
  'trigram_left=<BD>',
  'bigram_right=ун',
  'bigram_left=ку',
  'right_dist=4',
  'left_dist=1',
  'is_alpha=True',
  'left_char=к',
  'right_char=н'],
 ['char_at_2 = н',
  'substring_left=кун',
  'substring_right=ни',
  'trigram_right=нил',
  'trigram_left=кун',
  'bigram_right=ни',
  'bigram_left=ун',
  'right_dist=3',
  'left_dist=2',
  'is_alpha=True',
  'left_char=у',
  'right_char=и'],
 ['char_at_3 = и',
  'substring_left=уни',
  'substring_right=ил',
  'trigram_right=ила',
  'trigram_left=уни',
  'bigram_right=ил',
  'bigram_left=ни',
  'right_dist=2',
  'left_dist=3',
  'is_alpha=True',
  'left_char=н',
  'right_char=л'],
 ['char_at_4 = л',


In [32]:
trainer = pycrfsuite.Trainer(verbose=True)

# Add every (features, labels) sequence to the trainer. Sequences that cannot
# be encoded are still skipped (best effort), but they are counted and
# reported instead of disappearing silently from the training data.
skipped = 0
for s_X, s_y in zip(X_train, y_train):
    try:
        trainer.append(s_X, s_y)
    except UnicodeEncodeError:
        skipped += 1

if skipped:
    print(f"Skipped {skipped} of {len(X_train)} training sequences (UnicodeEncodeError)")

trainer.set_params({
    "c1": 0.1,  # L1 regularisation coefficient
    "c2": 0.1,  # L2 regularisation coefficient
    "max_iterations": 50,
    # also generate transition features for label pairs unseen in training
    "feature.possible_transitions": True
})

# Writes the trained model to this file; the tagger cell loads the same path.
trainer.train("segmentation_crf.pycrfsuite")

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 47069
Seconds required: 0.177

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 135796.017135
Feature norm: 1.000000
Error norm: 24788.104079
Active features: 46953
Line search trials: 1
Line search step: 0.000040
Seconds required for this iteration: 5.218

***** Iteration #2 *****
Loss: 95663.281938
Feature norm: 11.486903
Error norm: 20777.000555
Active features: 46950
Line search trials: 4
Line search step: 0.125000
Seconds required for this iteration: 9.697

***** Iteration #3 *****
Loss: 62904.434672
Feature norm: 11.502086
Error norm: 12515.244670
Active features: 45881
Line search trials: 1
Line search step: 1.000000
Seconds required f

In [37]:

def segment(data,tagger):
    """Tag every example in ``data`` with ``tagger``.

    ``data`` is a list of examples, each a list of ``(char, label)`` pairs; the
    existing labels are ignored. Features are built with ``data2features`` and
    each feature sequence is passed to ``tagger.tag``.

    Returns one list of ``(char, predicted_tag)`` tuples per example.
    """
    tagged_data = []
    for example, features in zip(data, data2features(data)):
        predicted = tagger.tag(features)
        tagged_data.append(
            [(char, tag) for (char, _), tag in zip(example, predicted)]
        )
    return tagged_data

In [35]:
# Load the model file written by trainer.train(...) in the training cell.
tagger = pycrfsuite.Tagger()
tagger.open('segmentation_crf.pycrfsuite')



<contextlib.closing at 0x128a29390>

In [60]:
# Tag every dev word and keep (characters, predicted labels) pairs.
tagged_data = []
for i, ex in enumerate(data2features(lez_dev_biolabs)):
    tags = tagger.tag(ex)
    tagged_word = lez_dev_biolabs[i]
    # Keep only the characters; the gold labels of the dev word are dropped.
    word = [pair[0] for pair in tagged_word]
    tagged_data.append((word, tags))


In [68]:

# Show each dev word as (characters, position tags, gloss tags).
for chars, tags in tagged_data:
    morph = ""  # not used further in this cell

    # Every predicted label has the form "POSITION-GLOSS".
    parts = [tag.split("-") for tag in tags]
    bies = [part[0] for part in parts]
    gloss = [part[1] for part in parts]
    print((chars, bies, gloss))
    

(['"'], ['SINGLE'], ['STEM'])
(['з', 'у', 'н'], ['BEGIN', 'INSIDE', 'END'], ['STEM', 'STEM', 'STEM'])
(['"', ','], ['BEGIN', 'END'], ['STEM', 'STEM'])
(['л', 'а', 'г', 'ь', 'а', 'н', 'а'], ['BEGIN', 'INSIDE', 'INSIDE', 'INSIDE', 'END', 'BEGIN', 'END'], ['STEM', 'STEM', 'STEM', 'STEM', 'STEM', 'AOR', 'AOR'])
([','], ['SINGLE'], ['STEM'])
(['"'], ['SINGLE'], ['STEM'])
(['ф', 'е', 'н', 'а'], ['BEGIN', 'END', 'BEGIN', 'END'], ['STEM', 'STEM', 'AOR', 'AOR'])
(['и', 'н', 'с', 'а', 'н', 'д', 'и', 'н'], ['BEGIN', 'INSIDE', 'INSIDE', 'INSIDE', 'END', 'BEGIN', 'END', 'SINGLE'], ['STEM', 'STEM', 'STEM', 'STEM', 'STEM', 'ERG', 'ERG', 'GEN'])
(['а', 'р', 'а', 'д', 'а', 'е'], ['BEGIN', 'INSIDE', 'INSIDE', 'INSIDE', 'END', 'SINGLE'], ['STEM', 'STEM', 'STEM', 'STEM', 'STEM', 'INESS'])
(['г', 'ь', 'а', 'т', 'д', 'а', 'д', 'а'], ['BEGIN', 'INSIDE', 'INSIDE', 'INSIDE', 'INSIDE', 'END', 'BEGIN', 'END'], ['STEM', 'STEM', 'STEM', 'STEM', 'STEM', 'STEM', 'ENT', 'ENT'])
([','], ['SINGLE'], ['STEM'])
(['а', 'к

In [38]:
# Predicted (char, tag) pairs for every dev word, using the loaded CRF model.
supervised_tokenized_dev = segment(lez_dev_biolabs,tagger)

In [49]:
for word in supervised_tokenized_dev:
    mor

[[('"', 'SINGLE-STEM')],
 [('з', 'BEGIN-STEM'), ('у', 'INSIDE-STEM'), ('н', 'END-STEM')],
 [('"', 'BEGIN-STEM'), (',', 'END-STEM')],
 [('л', 'BEGIN-STEM'),
  ('а', 'INSIDE-STEM'),
  ('г', 'INSIDE-STEM'),
  ('ь', 'INSIDE-STEM'),
  ('а', 'END-STEM'),
  ('н', 'BEGIN-AOR'),
  ('а', 'END-AOR')],
 [(',', 'SINGLE-STEM')],
 [('"', 'SINGLE-STEM')],
 [('ф', 'BEGIN-STEM'),
  ('е', 'END-STEM'),
  ('н', 'BEGIN-AOR'),
  ('а', 'END-AOR')],
 [('и', 'BEGIN-STEM'),
  ('н', 'INSIDE-STEM'),
  ('с', 'INSIDE-STEM'),
  ('а', 'INSIDE-STEM'),
  ('н', 'END-STEM'),
  ('д', 'BEGIN-ERG'),
  ('и', 'END-ERG'),
  ('н', 'SINGLE-GEN')],
 [('а', 'BEGIN-STEM'),
  ('р', 'INSIDE-STEM'),
  ('а', 'INSIDE-STEM'),
  ('д', 'INSIDE-STEM'),
  ('а', 'END-STEM'),
  ('е', 'SINGLE-INESS')],
 [('г', 'BEGIN-STEM'),
  ('ь', 'INSIDE-STEM'),
  ('а', 'INSIDE-STEM'),
  ('т', 'INSIDE-STEM'),
  ('д', 'INSIDE-STEM'),
  ('а', 'END-STEM'),
  ('д', 'BEGIN-ENT'),
  ('а', 'END-ENT')],
 [(',', 'SINGLE-STEM')],
 [('а', 'BEGIN-STEM'),
  ('к', 'INSIDE-

In [46]:
lez_dev_biolabs[1]

[['з', 'BEGIN-STEM'], ['у', 'INSIDE-STEM'], ['н', 'END-STEM']]