In [1]:
# from ufal.udpipe import Model as udModel, Pipeline
import re
import alternation

In [2]:
# model_path = "finnish-tdt-ud-2.5-191206.udpipe"
# ud_model = udModel.load(model_path)

In [2]:
def process_psor(token, person, *, number={'Sing'}):
    """
    Strips a possessive suffix from a word form.

    token - string, word form
    person - set which contains string ('1', '2' or '3'), person of the possessor
    number - set which contains string 'Sing' or 'Plur', number of the possessor
             (keyword-only; the shared default set is never modified here)

    Returns the word form without the suffix; the form comes back unchanged
    when no rule applies or the expected suffix is not at the end of the word.
    """
    if '1' in person:
        suffix_by_number = (('Sing', r"ni$"), ('Plur', r"mme$"))
    elif '2' in person:
        suffix_by_number = (('Sing', r"si$"), ('Plur', r"nne$"))
    elif '3' in person:
        # The third-person pattern is applied without looking at the number.
        return re.sub(r"((ns[aä])|([aeiouäöy]n))$", "", token)
    else:
        return token

    # 'Sing' is tried before 'Plur'; only the first tag found in `number` is used.
    for number_tag, suffix in suffix_by_number:
        if number_tag in number:
            return re.sub(suffix, "", token)
    return token
            

def process_case(token, case, number={'Sing'}):
    """
    Strips the case ending from a word form.

    token - string, word form
    case - set which contains string (case tag such as 'Abl', 'Gen', 'Ill')
    number - set which contains string 'Sing' or 'Plur'
             (the shared default set is never modified here)

    Returns the word form without the ending; the form comes back unchanged
    when the case is not handled or the expected ending is absent.
    """
    # Cases whose ending is one fixed pattern regardless of number.
    plain_endings = {
        'Abl': r"lt[aä]$",
        'Ade': r"ll[aä]$",
        'All': r"lle$",
        'Ela': r"st[aä]$",
        'Ess': r"n[aä]$",
        'Ine': r"ss[aä]$",
        'Par': r"(t?[aä])$",
        'Tra': r"ksi$",
    }
    # Tags are tried in this fixed order; the first one present in `case` decides.
    priority = ('Abl', 'Ade', 'All', 'Ela', 'Ess', 'Gen',
                'Ill', 'Ine', 'Par', 'Tra', 'Nom')

    for tag in priority:
        if tag not in case:
            continue

        if tag == 'Gen':
            if 'Sing' in number:
                return re.sub(r"n?$", "", token)
            if 'Plur' in number:
                return re.sub(r"(tt|d|t)?en$", "", token)
            return token

        if tag == 'Ill':
            # Two passes: first the -seen ending after a long "ee",
            # then the generic (h)+vowel(+n) ending.
            without_seen = re.sub(r"(?<=ee)seen?$", "", token)
            return re.sub(r"h?[aeiouäöy]n?$", "", without_seen)

        if tag == 'Nom':
            # Only the nominative plural carries a marker to remove.
            if 'Plur' in number:
                return re.sub(r"t$", "", token)
            return token

        return re.sub(plain_endings[tag], "", token)

    return token

In [24]:
def process_number(word, case, number={'Sing'}):
    """
    Rewrites a stem into a candidate dictionary form.

    word - string; process_token passes the form left after process_case
    case - set which contains string (case tag such as 'Nom', 'Par', 'Ill')
    number - set which contains string 'Sing' or 'Plur'
             (the shared default set is never modified here)

    Returns a string. Words shorter than two characters and nominative
    singular forms are returned unchanged. Otherwise an ordered cascade of
    regex rewrites is applied; after most groups of rules the function returns
    as soon as the result differs from the input, so the ORDER of the
    statements is part of the behaviour.

    Several rules insert a '*' into the string right before calling
    alternation.alternation_forward / alternation_backward; presumably it marks
    the place where consonant gradation has to be applied - TODO confirm
    against the alternation module.
    """
    output = word
    if len(output) < 2:
        return output
    if 'Sing' in number or ('Nom' in case and 'Plur' in number):
        if 'Nom' in case and 'Sing' in number:
            return output

        if 'Par' in case:
            output = re.sub(r"(?<=i)s$", "nen", output)
            if output != word:
                return output
            output = re.sub(r"([hlrn])$", r"\1i", output)
            output = re.sub(r"([a-zäö]{2,}[uy])t$", r"\1s", output)
            output = re.sub(r"e$", r"i", output)
            output = re.sub(r"([a-zäö]{2,}e)t$", r"\1", output)
            output = re.sub(r"t$", "si", output)
            return output

        # stems with reverse alternation when the locative cases are formed
        output = re.sub(r"ttoma$", "ton", output)
        output = re.sub(r"ttömä$", "tön", output)
        output = re.sub(r"(e[nlr]|[aä]r)e$", r"*\1", output)
#         output = re.sub(r"ttar$", "tar", output)
#         output = re.sub(r"ttär$", "tär", output)
        output = re.sub(r"ime$", r"*in", output)
        output = re.sub(r"ii$", r"*is", output)
        output = re.sub(r"([a-z]{3,})aa$", r"\1*as", output)
        output = re.sub(r"([a-zäö]{3,})ää$", r"\1*äs", output)
        # vowel-harmony magic: -nut after the letters listed below, -nyt otherwise
        output = re.sub(r"([qwrtuopasdfghjklzxcvbnm]{3,})nee$", r"\1nut", output)
        # NOTE(review): unlike the line above this pattern has no '$' anchor, so it
        # also fires on "nee" in the middle of a word - confirm whether intended.
        output = re.sub(r"([a-zäö]{3,})nee", r"\1nyt", output)
        output = re.sub(r"ee$", r"*e", output)
        output = alternation.alternation_forward(output)
        if output != word:
            return output

        # various types ending in -s
        output = re.sub(r"kse$", "s", output)
        output = re.sub(r"([a-zäö]{2,}[aeiouyäö])de$", r"\1s", output)
        output = re.sub(r"([a-zäö]{2,}[aeiouyäö])te$", r"\1s", output) # Mihin
        if output != word:
            return output

        # NOTE(review): the group reads as "(2+ letters + u) OR (y)", so any word
        # in -ye matches; "[uy]" may have been meant - confirm before changing.
        output = re.sub(r"([a-zäö]{2,}u|y)e$", r"\1t", output)
        output = re.sub(r"se$", "nen", output)
        output = re.sub(r"de$", "si", output)
        output = re.sub(r"te$", "si", output)
        if output != word:
            return output

        if 'Nom' not in case and 'Par' not in case and 'Ill' not in case and 'Ess' not in case:
            output = re.sub(r"([a-zäö]{2,})([aeiouyäö])$", r"\1*\2", output)
            output = alternation.alternation_backward(output)
        output = re.sub(r"e$", "i", output)

    elif 'Plur' in number and 'Nom' not in case:

        # the genitive is troublesome - it has to be worked out separately

        # other cases
        output = re.sub(r"^planeetto(i$|j$)", "planeetta", output)
        # NOTE(review): this substitution replaces "lapsi" with itself, so it does
        # not stop the later rules from rewriting the word - confirm the intent.
        output = re.sub(r"^lapsi$", "lapsi", output)
        output = re.sub(r"^miehi$", "mies", output)

        # regular words that can reliably be derived from the plural stem
        # words with -ei- or -ej-
        output = re.sub(r"(?<=un)(ei|ej)$", "ut", output)
        output = re.sub(r"(?<=yn)(ei|ej)$", "yt", output)

        output = re.sub(r"(?<=(tt|lt|rk|mp|nk|kk|nt|pp|rt))(ei|ej)$", "e", output)
        output = alternation.alternation_forward(output)
        if output != word:
            return output
        # loanwords in -i-: unclear how to handle their alternations, see below
        output = re.sub(r"(ei|ej)$", "i", output)

        output = re.sub(r"(?<=i)si$", "nen", output)

        # words like uusi are already finished once the ending is dropped
        output = re.sub(r"ksi$", "s", output)
        output = re.sub(r"(?<=i)mi$", "n", output)
        # words like kieli, pieni etc. come out automatically once the case is cut off
        output = re.sub(r"ttomi$", "ton", output)
        output = re.sub(r"ttömi$", "tön", output)
        output = re.sub(r"ttari$", "tar", output)
        # Front-vowel twin of the rule above: -ttäri -> -tär (was mistyped as "tän").
        output = re.sub(r"ttäri$", "tär", output)
        output = re.sub(r"(yi|ui)$", r"\1t", output)
        if output != word:
            return output
        output = re.sub(r"(?<=(l|n|r|k))(oj|oi)$", "a", output)  # backward
        output = re.sub(r"(?<=(l|n|r|k))(öj|öi)$", "ä", output)  # backward
        output = alternation.alternation_backward(output)
        if output != word:
            return output

        output = re.sub(r"(?<=(ij|kk))(oj|oi)$", "a", output)  # backward
        output = re.sub(r"(?<=(ij|kk))(öj|öi)$", "ä", output)  # backward


        output = re.sub(r"([a-z]{3,})ai$", r"\1*as", output)
        output = re.sub(r"([a-zäö]{3,})äi$", r"\1*äs", output)
        output = alternation.alternation_forward(output)
        if output != word:
            return output

        output = re.sub(r"ii$", r"*is", output)

        output = re.sub(r"([a-z]{2,})o(j|i)$", r"\1a", output)
        output = re.sub(r"([a-zäö]{2,})ö(j|i)$", r"\1ä", output)

        output = re.sub(r"([a-z]{4,})i$", r"\1a", output)
        output = re.sub(r"([a-zäö]{4,})i$", r"\1ä", output)


        # all remaining cases with strong vowels
        output = re.sub(r"(i|j)$", r"", output)

    return output

In [4]:
def process_token(token):
    """
    Produces a lemma candidate for one token by peeling its endings off.

    token - object with attributes `form` (string) and `feats` (mapping from
            feature name to its values); the evaluation cell of this notebook
            passes pyconll tokens

    Steps, in order: drop '-' and ':' from the form, lower-case fully
    upper-case forms longer than 4 characters, strip a trailing "kin" when the
    token has a 'Clitic' feature, strip the possessive suffix, then strip the
    case ending and normalise the remaining stem.

    Returns a string.
    """
    stem = token.form.replace('-', '').replace(':', '')
    feats = token.feats

    if stem.isupper() and len(stem) > 4:
        stem = stem.lower()

    if 'Clitic' in feats:
        stem = re.sub(r"kin$", "", stem)

    if 'Person[psor]' in feats:
        # Pass the possessor number only when annotated, else the helper default applies.
        psor_options = {}
        if 'Number[psor]' in feats:
            psor_options['number'] = feats['Number[psor]']
        stem = process_psor(stem, feats['Person[psor]'], **psor_options)

    if 'Case' not in feats:
        return stem

    grammatical_case = feats['Case']
    # Same idea for the word's own number: omit it to fall back on the defaults.
    number_args = (feats['Number'],) if 'Number' in feats else ()
    stem = process_case(stem, grammatical_case, *number_args)
    return process_number(stem, grammatical_case, *number_args)

### Тест модели на заведомо корректных данных

In [25]:
import pyconll

# Evaluate process_token on the plural adjectives and nouns of the test treebank.
sents = pyconll.load_from_file("fi_tdt-ud-test.conllu")
answer, error = [], []   # matches as (form, predicted); misses as (form, gold, predicted)
nouns_all = nouns_correct = 0
for sentence in sents:
    for token in sentence:
        # Keep only adjectives and nouns ...
        if token.xpos not in ['A', 'N']:
            continue
        # ... that are annotated as plural.
        if 'Number' not in token.feats or 'Plur' not in token.feats['Number']:
            continue

        nouns_all += 1
        processed = process_token(token)
        # Gold lemmas contain '#' characters that are dropped before comparing.
        correct_ending = token.lemma.replace('#', '')
        if processed.lower() != correct_ending.lower():
            error.append((token.form, correct_ending, processed))
            continue
        nouns_correct += 1
        answer.append((token.form, processed))

nouns_all, nouns_correct

(1926, 1091)

In [26]:
# First 100 mismatches, each as (token.form, lemma with '#' removed, process_token output).
error[:100]

[('liput', 'lippu', 'lipu'),
 ('tupaantuliaiset', 'tupaantuliaiset', 'tupaantuliainen'),
 ('uusien', 'uusi', 'uus'),
 ('häissä', 'häät', 'hä'),
 ('hautajaisissa', 'hautajaiset', 'hautajainen'),
 ('Tupaantuliaisista', 'tupaantuliaiset', 'Tupaantuliainen'),
 ('ihmisten', 'ihminen', 'ihmis'),
 ('nimiä', 'nimi', 'nin'),
 ('ammatteja', 'ammatti', 'ammatte'),
 ('nimille', 'nimi', 'nin'),
 ('tutuille', 'tuttu', 'tutuit'),
 ('puhtain', 'puhdas', 'puhtain'),
 ('sydämin', 'sydän', 'sydämin'),
 ('parantamisvinkkeihin', 'parantamisvinkki', 'parantamisvinkke'),
 ('tietoja', 'tieto', 'tieta'),
 ('naisten', 'nainen', 'nais'),
 ('eroja', 'ero', 'era'),
 ('päällisin', 'päällinen', 'päällisin'),
 ('puolin', 'puoli', 'puolin'),
 ('parisuhteista', 'parisuhde', 'parisuhti'),
 ('nuoremmille', 'nuori', 'nuoremma'),
 ('toiveistaan', 'toive', 'toivi'),
 ('vuosia', 'vuosi', 'vuosa'),
 ('vuosikymmeniä', 'vuosikymmen', 'vuosikymmena'),
 ('toiveita', 'toive', 'toivi'),
 ('lähtökohdiltaan', 'lähtökohta', 'lähtökohd