In [67]:
import numpy as np
import os
from conllu import parse
import math

INIT TAG SETS

In [68]:
tags_g = ['ADJ','ADP','ADV','CCONJ','DET','INTJ','NOUN','NUM','PART','PRON','PUNCT','SCONJ','VERB','X'];
tags_l = ['ADJ','ADP','ADV','AUX','CCONJ','DET','NOUN','NUM','PART','PRON','PROPN','PUNCT','SCONJ','VERB','X']; 

COUNTING TAG AND WORD'S TAG

In [69]:
def count(word, tag, tagset, count_words, count_tag):
    index_of_tag = tagset.index(tag);
    totIndex = len(tagset);
    # update count of tag
    count_tag[index_of_tag] = count_tag[index_of_tag] + 1;
    # update count of word
    if(word in count_words.keys()):
        count_words[word][index_of_tag] = count_words[word][index_of_tag] + 1;
    else:
        word_row = np.zeros(len(tagset) + 1, dtype=int);
        word_row[index_of_tag] = 1;
        count_words[word] = word_row;
    # update total count of word
    count_words[word][totIndex] = count_words[word][totIndex] + 1;
    # update total count of tag
    count_tag[totIndex] = count_tag[totIndex] + 1;
    return count_words, count_tag;

CALCULATE EMISSION PROBABILITY

In [70]:
def calculateEmissionProbability(word, tag, tagset, count_words, count_tag ,probEmission):
    index_of_tag = tagset.index(tag);
    # index for total count
    totIndex = len(tagset);
    #if word exist then update count
    if(word in probEmission.keys()):
        probEmission[word][index_of_tag] = count_words[word][index_of_tag] / count_tag[index_of_tag];
    #if NTOT word exist then create row
    else:
        prob_row = np.zeros(len(tagset) + 1);
        prob_row[index_of_tag] = count_words[word][index_of_tag] / count_tag[index_of_tag];
        probEmission[word] = prob_row;
    return probEmission;

CALCULATE TRANSITION PROBABILITY

In [71]:
def calculateTransictionProbability(word, prev_tag, tag, tagset, transition, count_tag, nSentences, probTransition):
    #natural sequence when scan senteces
    trans_tag = "%s_%s" % (prev_tag,tag);
    #Sequence used to saved
    trans_tag_real = "%s_%s" % (tag,prev_tag);
    
    if(trans_tag in transition.keys()):
        transition[trans_tag] = transition[trans_tag] + 1;
    else:
        transition[trans_tag] = 1;

    #When tag is Q0 (start) the calculate probTransistion with number of sentences
    #Else use normal Transition probability
    if(trans_tag in probTransition.keys() and prev_tag != 'Q0'):
        index_of_tag = tagset.index(prev_tag);
        probTransition[trans_tag_real] = transition[trans_tag] /count_tag[index_of_tag];
    else:
        probTransition[trans_tag_real] = transition[trans_tag]/ nSentences;

    return tag, probTransition;

In [72]:
def readFile(language,fileType):
    nameFile = "./corpus/%s/data_%s.conllu" % (language,fileType);
    tsv_file = open(nameFile,"r",encoding="utf-8").read();
    sentences = parse(tsv_file)
    return sentences;

In [85]:
def train(language):
    #select tag sets of language choosen
    if language == "greek":
        tagset = tags_g;
    elif language == "latin":
        tagset = tags_l;
    else:
        raise Exception("Language not found!");

    # INIT DATA STRUCTURE
    count_words = dict();
    count_tag = np.zeros(len(tagset) + 1, dtype = int);
    probEmission = dict();
    probTransition = dict();
    transition = dict();

    #Count number of sentence for calculate probTransistio with tag start Q0
    nSentences = 0;

    sentences = readFile(language,"train")
    for sentence in sentences:
        prev_tag = 'Q0';
        nSentences = nSentences + 1;
        for token in sentence:
            word = token["form"];
            tag = token["upos"];
            count_words, count_tag = count(word, tag, tagset, count_words, count_tag);
            probEmission = calculateEmissionProbability(word,tag,tagset, count_words, count_tag, probEmission);
            prev_tag, probTransition = calculateTransictionProbability(word, prev_tag, tag, tagset, transition, count_tag, nSentences, probTransition)
    return sentences,tagset, probEmission, probTransition;

DECODING WITH VITERBI

In [74]:
def tokenize_sentence(sentence):
    words = [];
    for token in sentence:
        words.append(token["form"]);
    return words

In [75]:
def viterbi(sentence, tagset, probEmission, probTransition):
    words = tokenize_sentence(sentence);
    start_tag = "Q0";
    viterbi_matrix = np.zeros((len(tagset),len(words)));
    backtrace = np.zeros(len(words), dtype = int);
    probabilites = np.zeros(len(words));
    t = 0;
   
    for word in words:
        # Calculate viterbi column for every tag possible
        for tag in tagset:
            index_of_tag = tagset.index(tag);
            #Get Emission probabilty of word 
            probE = probEmission[word][index_of_tag];
            #Run first iteration of viterbi to initialize first column
            if t == 0:
                tran_tag = "%s_%s" % (tag,start_tag);   
                probT = 0
                if tran_tag in probTransition.keys():
                    probT = probTransition[tran_tag];
                viterbi_matrix[index_of_tag][t] = probE * probT;
            else:
                max_tmp = np.zeros(len(tagset));
                for i in range(0,len(tagset)):
                    tran_tag = "%s_%s" % (tag,tagset[i]);
                    probT = 0
                    if tran_tag in probTransition.keys():
                        probT = probTransition[tran_tag];

                    max_tmp[i] = viterbi_matrix[i,t-1] * probT;
                viterbi_matrix[index_of_tag,t] = np.amax(max_tmp) * probE;

        index_max_values = np.argmax(viterbi_matrix[:,t]);  
        backtrace[t] = index_max_values;
        probabilites[t] = viterbi_matrix[index_max_values,t];
        t= t +1;
    return backtrace,probabilites;

In [76]:
def printPosTag(sentence, tagset, backtrace , probabilities):
    i = 0;
    words = tokenize_sentence(sentence);
    for word in words:
        print("WORD -> " + word + " , TAG -> " + tagset[backtrace[i]] + " , WITH PROB -> " + str(probabilities[i]))
        i = i + 1;

TRAIN AND DECODING GREEK

In [87]:
sentences_greek, tagset_greek, probEmission_greek, probTransition_greek =  train("greek");


In [88]:
print("==================== TEST ON GREEK ================================== ")

for i in range(0,5):
    print("==================== new sentence ================================== ")
    backtrace_greek, probabilities_greek = viterbi(sentences_greek[i], tagset_greek, probEmission_greek, probTransition_greek);
    printPosTag(sentences_greek[i], tagset_greek, backtrace_greek, probabilities_greek);

WORD -> ἐρᾷ , TAG -> VERB , WITH PROB -> 9.435169962449398e-05
WORD -> μὲν , TAG -> PART , WITH PROB -> 3.0217439910802045e-07
WORD -> ἁγνὸς , TAG -> ADJ , WITH PROB -> 6.3622033860714765e-12
WORD -> οὐρανὸς , TAG -> NOUN , WITH PROB -> 4.647743328610378e-16
WORD -> τρῶσαι , TAG -> VERB , WITH PROB -> 6.075481475307685e-17
WORD -> χθόνα , TAG -> NOUN , WITH PROB -> 6.873014278609289e-21
WORD -> , , TAG -> PUNCT , WITH PROB -> 6.3728020354193885e-22
WORD -> ἔρως , TAG -> NOUN , WITH PROB -> 9.112531392721117e-27
WORD -> δὲ , TAG -> PART , WITH PROB -> 8.88091549905803e-29
WORD -> γαῖαν , TAG -> NOUN , WITH PROB -> 3.223942481827329e-32
WORD -> λαμβάνει , TAG -> VERB , WITH PROB -> 2.809535931875668e-33
WORD -> γάμου , TAG -> NOUN , WITH PROB -> 1.1672227212595416e-37
WORD -> τυχεῖν , TAG -> VERB , WITH PROB -> 9.961575636429551e-42
WORD -> · , TAG -> PUNCT , WITH PROB -> 7.0319855982484075e-43
WORD -> ὄμβρος , TAG -> NOUN , WITH PROB -> 2.559161907362648e-05
WORD -> δ̓ , TAG -> PART , W

TRAIN AND DECODING LATIN

In [80]:
sentences_latin, tagset_latin, probEmission_latin, probTransition_latin =  train("latin");


In [81]:
print("==================== TEST ON LATIN ================================== ")
backtrace_latin, probabilities_latin = viterbi(sentences_latin[0], tagset_latin, probEmission_latin, probTransition_latin);
printPosTag(sentences_latin[0], tagset_latin, backtrace_latin, probabilities_latin);

WORD -> + , TAG -> PUNCT , WITH PROB -> 0.0477907757517123
WORD -> In , TAG -> ADP , WITH PROB -> 7.95175730713278e-05
WORD -> Dei , TAG -> PROPN , WITH PROB -> 6.730600719867964e-07
WORD -> nomine , TAG -> NOUN , WITH PROB -> 3.957304376743928e-09
WORD -> regnante , TAG -> VERB , WITH PROB -> 9.154634916135536e-12
WORD -> domno , TAG -> NOUN , WITH PROB -> 1.1573354564099308e-14
WORD -> nostro , TAG -> DET , WITH PROB -> 6.188509865711393e-17
WORD -> Carulo , TAG -> PROPN , WITH PROB -> 5.469200602474012e-20
WORD -> rege , TAG -> NOUN , WITH PROB -> 7.500897305500222e-23
WORD -> Francorum , TAG -> NOUN , WITH PROB -> 2.1081007115471513e-26
WORD -> et , TAG -> CCONJ , WITH PROB -> 1.0864475686704603e-27
WORD -> Langobardorum , TAG -> NOUN , WITH PROB -> 6.444268799442303e-31
WORD -> , , TAG -> PUNCT , WITH PROB -> 7.124095018424678e-32
WORD -> anno , TAG -> NOUN , WITH PROB -> 2.2636869354322035e-34
WORD -> regni , TAG -> NOUN , WITH PROB -> 1.101864558019235e-37
WORD -> eius , TAG -> 

SMOOTHING