In [99]:
import numpy as np
import os
from conllu import parse
import math

INIT TAG SETS

In [100]:
tags_g = ['ADJ','ADP','ADV','CCONJ','DET','INTJ','NOUN','NUM','PART','PRON','PUNCT','SCONJ','VERB','X'];
tags_l = ['ADJ','ADP','ADV','AUX','CCONJ','DET','NOUN','NUM','PART','PRON','PROPN','PUNCT','SCONJ','VERB','X'];

COUNTING TAG AND WORD'S TAG

In [101]:
def count(word, tag, tagset, count_words, count_tag):
    index_of_tag = tagset.index(tag);
    totIndex = len(tagset);
    # update count of tag
    count_tag[index_of_tag] = count_tag[index_of_tag] + 1;
    # update count of word
    if(word in count_words.keys()):
        count_words[word][index_of_tag] = count_words[word][index_of_tag] + 1;
    else:
        word_row = np.zeros(len(tagset) + 1, dtype=int);
        word_row[index_of_tag] = 1;
        count_words[word] = word_row;
    # update total count of word
    count_words[word][totIndex] = count_words[word][totIndex] + 1;
    # update total count of tag
    count_tag[totIndex] = count_tag[totIndex] + 1;
    return count_words, count_tag;

CALCULATE EMISSION PROBABILITY

In [102]:
def calculateEmissionProbability(word, tag, tagset, count_words, count_tag ,probEmission):
    index_of_tag = tagset.index(tag);
    # index for total count
    totIndex = len(tagset);
    #if word exist then update count
    if(word in probEmission.keys()):
        probEmission[word][index_of_tag] = count_words[word][index_of_tag] / count_tag[index_of_tag];
    #if NTOT word exist then create row
    else:
        prob_row = np.zeros(len(tagset) + 1);
        prob_row[index_of_tag] = count_words[word][index_of_tag] / count_tag[index_of_tag];
        probEmission[word] = prob_row;
    return probEmission;

CALCULATE TRANSITION PROBABILITY

In [103]:
def calculateTransictionProbability(word, prev_tag, tag, tagset, transition, count_tag, nSentences, probTransition):
    #natural sequence when scan senteces
    trans_tag = "%s_%s" % (prev_tag,tag);
    #Sequence used to saved
    trans_tag_real = "%s_%s" % (tag,prev_tag);
    
    if(trans_tag in transition.keys()):
        transition[trans_tag] = transition[trans_tag] + 1;
    else:
        transition[trans_tag] = 1;

    #When tag is Q0 (start) the calculate probTransistion with number of sentences
    #Else use normal Transition probability
    if(trans_tag in probTransition.keys() and prev_tag != 'Q0'):
        index_of_tag = tagset.index(prev_tag);
        probTransition[trans_tag_real] = transition[trans_tag] /count_tag[index_of_tag];
    else:
        probTransition[trans_tag_real] = transition[trans_tag]/ nSentences;

    return tag, probTransition;

In [104]:
def readFile(language,fileType):
    nameFile = "./corpus/%s/data_%s.conllu" % (language,fileType);
    tsv_file = open(nameFile,"r",encoding="utf-8").read();
    sentences = parse(tsv_file)
    return sentences;

In [105]:
def train(language,statisticsIsEnabled):
    #select tag sets of language choosen
    if language == "greek":
        tagset = tags_g;
    elif language == "latin":
        tagset = tags_l;
    else:
        raise Exception("Language not found!");

    # INIT DATA STRUCTURE
    count_words = dict();
    count_tag = np.zeros(len(tagset) + 1, dtype = int);
    probEmission = dict();
    probTransition = dict();
    transition = dict();
    statistics = np.zeros(len(tagset));

    if statisticsIsEnabled == True:
        statistics = calculateStatisticPosTagging(language)

    #Count number of sentence for calculate probTransistio with tag start Q0
    nSentences = 0;

    sentences = readFile(language,"train")
    for sentence in sentences:
        prev_tag = 'Q0';
        nSentences = nSentences + 1;
        for token in sentence:
            word = token["form"];
            tag = token["upos"];
            count_words, count_tag = count(word, tag, tagset, count_words, count_tag);
            probEmission = calculateEmissionProbability(word,tag,tagset, count_words, count_tag, probEmission);
            prev_tag, probTransition = calculateTransictionProbability(word, prev_tag, tag, tagset, transition, count_tag, nSentences, probTransition)
    return sentences,tagset, probEmission, probTransition,statistics;

In [106]:
def tokenize_sentence(sentence):
    words = [];
    for token in sentence:
        words.append(token["form"]);
    return words

CALCULATE STATISTIC POS TAGGIN

In [107]:
def calculateStatisticPosTagging(language):
    #select tag sets of language choosen
    if language == "greek":
        tagset = tags_g;
    elif language == "latin":
        tagset = tags_l;
    else:
        raise Exception("Language not found!");

    count_words = dict();
    count_tag = np.zeros(len(tagset) + 1, dtype = int);
    count_tag_one_occured = np.zeros(len(tagset), dtype = int);
    count_tag_one_occured_total = 0;

    statistics = np.zeros(len(tagset));
    

    sentences = readFile(language,"dev");
    for sentence in sentences:
        for token in sentence:
            word = token["form"];
            tag = token["upos"];
            count_words, count_tag = count(word, tag, tagset, count_words, count_tag);

    totIndex = len(tagset);
    for word in count_words:
        if count_words[word][totIndex] == 1:
            index_of_tag = np.argmax(count_words[word]);
            count_tag_one_occured[index_of_tag] = count_tag_one_occured[index_of_tag] + 1;
            count_tag_one_occured_total =  count_tag_one_occured_total + 1;

    for tag in tagset:
        index_of_tag = tagset.index(tag);
        statistics[index_of_tag] = count_tag_one_occured[index_of_tag] / count_tag_one_occured_total;
    return statistics;

IMPLEMENT SMOOTHING

In [108]:
def selectSmoothing(type, probEmission, word, index_of_tag, tagset, statistics):
    # NO SMOOTHING
    if word in probEmission.keys():
       return probEmission[word][index_of_tag]
    elif type == 0 and word not in probEmission.keys():
        return 0.00001;
    # IF WORD NOT EXIST THEN SET NOUN PROBABILITY 1
    elif type == 1 and word not in probEmission.keys():
        prob_row = np.zeros(len(tagset) + 1);
        index_of_noun = tagset.index("NOUN");
        prob_row[index_of_noun] = 1;
        return prob_row[index_of_tag];
    elif type == 2 and word not in probEmission.keys():
        prob_row = np.zeros(len(tagset) + 1);
        index_of_noun = tagset.index("NOUN");
        index_of_verb = tagset.index("VERB");
        prob_row[index_of_noun] = 0.5;
        prob_row[index_of_verb] = 0.5;
        return prob_row[index_of_tag];
    elif type == 3 and word not in probEmission.keys():
        unk_prob = 1 / len(tagset);
        prob_row = np.full(len(tagset) + 1, unk_prob);
        return prob_row[index_of_tag];
    elif type == 4 and word not in probEmission.keys():
        return statistics[index_of_tag];
    else:
        return 0.00001;

DECODING WITH VITERBI

In [109]:
def viterbi(sentence, tagset, probEmission, probTransition, smoothingType, statistics):
    words = tokenize_sentence(sentence);
    start_tag = "Q0";
    viterbi_matrix = np.zeros((len(tagset),len(words)));
    backtrace = np.zeros(len(words), dtype = int);
    probabilites = np.zeros(len(words));
    t = 0;
   
    for word in words:
        # Calculate viterbi column for every tag possible
        for tag in tagset:
            index_of_tag = tagset.index(tag);
            #Get Emission probabilty of word ( HERE WHEN CAN APPLY SMOOTHING)
            probE = selectSmoothing(smoothingType, probEmission, word, index_of_tag, tagset, statistics);

            #Run first iteration of viterbi to initialize first column
            if t == 0:
                tran_tag = "%s_%s" % (tag,start_tag);   
                probT = 0.00001;
                if tran_tag in probTransition.keys():
                    probT = probTransition[tran_tag];
                viterbi_matrix[index_of_tag][t] = probE * probT;
            else:
                max_tmp = np.zeros(len(tagset));
                for i in range(0,len(tagset)):
                    tran_tag = "%s_%s" % (tag,tagset[i]);
                    probT = 0.00001;
                    if tran_tag in probTransition.keys():
                        probT = probTransition[tran_tag];

                    max_tmp[i] = viterbi_matrix[i,t-1] * probT;
                viterbi_matrix[index_of_tag,t] = np.amax(max_tmp) * probE;

        index_max_values = np.argmax(viterbi_matrix[:,t]);  
        backtrace[t] = index_max_values;
        probabilites[t] = viterbi_matrix[index_max_values,t];
        t= t +1;
    return backtrace,probabilites;

In [110]:
def printPosTag(sentence, tagset, backtrace , probabilities):
    i = 0;
    words = tokenize_sentence(sentence);
    for word in words:
        print("WORD_ROW -> " + word + " " + tagset[backtrace[i]] + "     prob -> " + str(probabilities[i]))
        i = i + 1;

TRAIN AND DECODING GREEK

In [111]:
sentences_greek, tagset_greek, probEmission_greek, probTransition_greek, statistics_greek  =  train("greek",False);


In [112]:
print("==================== TEST ON GREEK ================================== ")

for i in range(0,5):
    print("==================== new sentence ================================== ")
    backtrace_greek, probabilities_greek = viterbi(sentences_greek[i], tagset_greek, probEmission_greek, probTransition_greek, 0, statistics_greek);
    printPosTag(sentences_greek[i], tagset_greek, backtrace_greek, probabilities_greek);

WORD_ROW -> ἐρᾷ VERB     prob -> 9.435169962449398e-05
WORD_ROW -> μὲν PART     prob -> 3.0217439910802045e-07
WORD_ROW -> ἁγνὸς ADJ     prob -> 6.3622033860714765e-12
WORD_ROW -> οὐρανὸς NOUN     prob -> 4.647743328610378e-16
WORD_ROW -> τρῶσαι VERB     prob -> 6.075481475307685e-17
WORD_ROW -> χθόνα NOUN     prob -> 6.873014278609289e-21
WORD_ROW -> , PUNCT     prob -> 6.3728020354193885e-22
WORD_ROW -> ἔρως NOUN     prob -> 9.112531392721117e-27
WORD_ROW -> δὲ PART     prob -> 8.88091549905803e-29
WORD_ROW -> γαῖαν NOUN     prob -> 3.223942481827329e-32
WORD_ROW -> λαμβάνει VERB     prob -> 2.809535931875668e-33
WORD_ROW -> γάμου NOUN     prob -> 1.1672227212595416e-37
WORD_ROW -> τυχεῖν VERB     prob -> 9.961575636429551e-42
WORD_ROW -> · PUNCT     prob -> 7.0319855982484075e-43
WORD_ROW -> ὄμβρος NOUN     prob -> 2.559161907362648e-05
WORD_ROW -> δ̓ PART     prob -> 4.6325644254302794e-07
WORD_ROW -> ἀπ̓ ADP     prob -> 4.082108893839203e-10
WORD_ROW -> εὐνάοντος ADJ     prob -> 2

TRAIN AND DECODING LATIN

In [113]:
sentences_latin, tagset_latin, probEmission_latin, probTransition_latin, statistics_latin = train("latin",True);


In [114]:
print("==================== TEST ON LATIN ================================== ")
backtrace_latin, probabilities_latin = viterbi(sentences_latin[0], tagset_latin, probEmission_latin, probTransition_latin, 4, statistics_latin);
printPosTag(sentences_latin[0], tagset_latin, backtrace_latin, probabilities_latin);

WORD_ROW -> + PUNCT     prob -> 0.0477907757517123
WORD_ROW -> In ADP     prob -> 7.95175730713278e-05
WORD_ROW -> Dei PROPN     prob -> 6.730600719867964e-07
WORD_ROW -> nomine NOUN     prob -> 3.957304376743928e-09
WORD_ROW -> regnante VERB     prob -> 9.154634916135536e-12
WORD_ROW -> domno NOUN     prob -> 1.1573354564099308e-14
WORD_ROW -> nostro DET     prob -> 6.188509865711393e-17
WORD_ROW -> Carulo PROPN     prob -> 5.469200602474012e-20
WORD_ROW -> rege NOUN     prob -> 7.500897305500222e-23
WORD_ROW -> Francorum NOUN     prob -> 2.1081007115471513e-26
WORD_ROW -> et CCONJ     prob -> 1.0864475686704603e-27
WORD_ROW -> Langobardorum NOUN     prob -> 6.444268799442303e-31
WORD_ROW -> , PUNCT     prob -> 7.124095018424678e-32
WORD_ROW -> anno NOUN     prob -> 2.2636869354322035e-34
WORD_ROW -> regni NOUN     prob -> 1.101864558019235e-37
WORD_ROW -> eius DET     prob -> 9.125503370060112e-40
WORD_ROW -> quo PRON     prob -> 2.715982358959434e-43
WORD_ROW -> coepit VERB     prob

In [115]:
def calculateAccuracy(language, tagset, probEmission, probTransition, smoothingType, statistics):
    sentences_test = readFile(language,"test");
    count_tag_correct = 0;
    count_total_tag = 0;
    for sentence in sentences_test:
        backtrace_test, probabilities_test = viterbi(sentence, tagset, probEmission, probTransition, smoothingType, statistics);
         #Get real tag
        i = 0;
        for token in sentence:
            real_tag = token["upos"];
            if tagset.index(real_tag) == backtrace_test[i]:
                count_tag_correct = count_tag_correct + 1;
            count_total_tag = count_total_tag + 1;
            i = i + 1;
    return count_tag_correct/count_total_tag;

CALCULATE ACCURACY ON TEST SET OF GREEK

In [116]:
accuracy_greek = calculateAccuracy("greek", tagset_greek, probEmission_greek, probTransition_greek, 1, statistics_greek);
print("ACCURACY GREEK  " + str(accuracy_greek))


ACCURACY GREEK  0.20745264564149052


CALCULATE ACCURACY ON TEST SET OF LATIN

In [117]:
accuracy_latin = calculateAccuracy("latin", tagset_latin, probEmission_latin, probTransition_latin, 4, statistics_latin);
print("ACCURACY LATIN " + str(accuracy_latin))

ACCURACY LATIN 0.9436023090659911
