# IMPLEMENT HIDDEN MARKOW MODEL FOR GREEK AND LATIN POS TAGGER

In [740]:
import numpy as np
import os
from conllu import parse
import math
import nltk

### INIT TAG SETS

In [741]:
tags_g = ['ADJ','ADP','ADV','CCONJ','DET','INTJ','NOUN','NUM','PART','PRON','PUNCT','SCONJ','VERB','X'];
tags_l = ['ADJ','ADP','ADV','AUX','CCONJ','DET','NOUN','NUM','PART','PRON','PROPN','PUNCT','SCONJ','VERB','X'];

## FUNCTION TO CALCULATE OCCURENCE OF TAG AND WORD


In [742]:
def count(word, tag, tagset, count_words, count_tag):
    index_of_tag = tagset.index(tag);
    totIndex = len(tagset);
    # update count of tag
    count_tag[index_of_tag] = count_tag[index_of_tag] + 1;
    # update count of word
    if(word in count_words.keys()):
        count_words[word][index_of_tag] = count_words[word][index_of_tag] + 1;
    else:
        word_row = np.zeros(len(tagset) + 1, dtype=int);
        word_row[index_of_tag] = 1;
        count_words[word] = word_row;
    # update total count of word
    count_words[word][totIndex] = count_words[word][totIndex] + 1;
    # update total count of tag
    count_tag[totIndex] = count_tag[totIndex] + 1;
    return count_words, count_tag;

## CALCULATE EMISSION PROBABILITY

In [743]:
def calculateEmissionProbability(word, tag, tagset, count_words, count_tag ,probEmission):
    index_of_tag = tagset.index(tag);
    # index for total count
    totIndex = len(tagset);
    #if word exist then update count
    if(word in probEmission.keys()):
        probEmission[word][index_of_tag] = count_words[word][index_of_tag] / count_tag[index_of_tag];

    #if NTOT word exist then create row
    else:
        prob_row = np.zeros(len(tagset) + 1);
        prob_row[index_of_tag] = count_words[word][index_of_tag] / count_tag[index_of_tag];
        probEmission[word] = prob_row;
    return probEmission;

## CALCULATE TRANSITION PROBABILITY

In [744]:
def calculateTransictionProbability(word, prev_tag, tag, tagset, transition, count_tag, nSentences, probTransition):
    #natural sequence when scan senteces
    trans_tag = "%s_%s" % (prev_tag,tag);
    #Sequence used to saved
    trans_tag_real = "%s_%s" % (tag,prev_tag);
    
    if(trans_tag in transition.keys()):
        transition[trans_tag] = transition[trans_tag] + 1;
    else:
        transition[trans_tag] = 1;

    #When tag is Q0 (start) the calculate probTransistion with number of sentences
    #Else use normal Transition probability
    if(trans_tag in probTransition.keys() and prev_tag != 'Q0'):
        index_of_tag = tagset.index(prev_tag);
        probTransition[trans_tag_real] = transition[trans_tag] /count_tag[index_of_tag];
    else:
        probTransition[trans_tag_real] = transition[trans_tag]/ nSentences;

    return tag, probTransition;

In [745]:
def readFile(language,fileType):
    nameFile = "./corpus/%s/data_%s.conllu" % (language,fileType);
    tsv_file = open(nameFile,"r",encoding="utf-8").read();
    sentences = parse(tsv_file)
    return sentences;

# FUNCTION TO TRAIN

In [746]:
def train(language,statisticsIsEnabled,smoothLemmaIsEnabled):
    #select tag sets of language choosen
    if language == "greek":
        tagset = tags_g;
    elif language == "latin":
        tagset = tags_l;
    else:
        raise Exception("Language not found!");

    # INIT DATA STRUCTURE
    count_words = dict();
    count_tag = np.zeros(len(tagset) + 1, dtype = int);
    probEmission = dict();
    probTransition = dict();
    transition = dict();
    statistics = np.zeros(len(tagset));

    # DATAS FOR USE LEMMA
    count_words_lemmas = dict();
    count_tag_lemmas = np.zeros(len(tagset) + 1, dtype = int);
    probEmissionLemmas = dict();

    #load statistics
    if statisticsIsEnabled == True:
        statistics = calculateStatisticPosTagging(language)
    
    #Count number of sentence for calculate probTransistio with tag start Q0
    nSentences = 0;

    sentences = readFile(language,"train")
    for sentence in sentences:
        prev_tag = 'Q0';
        nSentences = nSentences + 1;
        for token in sentence:
            word = token["form"];
            tag = token["upos"];
            count_words, count_tag = count(word, tag, tagset, count_words, count_tag);
            probEmission = calculateEmissionProbability(word,tag,tagset, count_words, count_tag, probEmission);
            if smoothLemmaIsEnabled == True:
                lemma = token["lemma"]
                count_words_lemmas, count_tag_lemmas = count(lemma, tag, tagset, count_words_lemmas, count_tag_lemmas);
                probEmissionLemmas = calculateEmissionProbability(lemma,tag,tagset, count_words_lemmas, count_tag_lemmas, probEmissionLemmas);

            prev_tag, probTransition = calculateTransictionProbability(word, prev_tag, tag, tagset, transition, count_tag, nSentences, probTransition)
    return sentences,tagset, probEmission, probTransition,statistics,probEmissionLemmas;

In [747]:
def tokenize_sentence(sentence):
    words = [];
    for token in sentence:
        words.append(token["form"]);
    return words

### CALCULATE STATISTIC POS TAGGIN (SMOOTHING)

In [748]:
def calculateStatisticPosTagging(language):
    #select tag sets of language choosen
    if language == "greek":
        tagset = tags_g;
    elif language == "latin":
        tagset = tags_l;
    else:
        raise Exception("Language not found!");

    count_words = dict();
    count_tag = np.zeros(len(tagset) + 1, dtype = int);
    count_tag_one_occured = np.zeros(len(tagset), dtype = int);
    count_tag_one_occured_total = 0;

    statistics = np.zeros(len(tagset));
    

    sentences = readFile(language,"dev");
    for sentence in sentences:
        for token in sentence:
            word = token["form"];
            tag = token["upos"];
            count_words, count_tag = count(word, tag, tagset, count_words, count_tag);

    totIndex = len(tagset);
    for word in count_words:
        if count_words[word][totIndex] == 1:
            index_of_tag = np.argmax(count_words[word]);
            count_tag_one_occured[index_of_tag] = count_tag_one_occured[index_of_tag] + 1;
            count_tag_one_occured_total =  count_tag_one_occured_total + 1;

    for tag in tagset:
        index_of_tag = tagset.index(tag);
        statistics[index_of_tag] = count_tag_one_occured[index_of_tag] / count_tag_one_occured_total;
    return statistics;

## IMPLEMENT SMOOTHING

In [749]:

def selectSmoothing(type, probEmission, word, index_of_tag, tagset, statistics, probEmissionLemma):
    # NO SMOOTHING
    if word in probEmission.keys():
       return probEmission[word][index_of_tag]
    elif type == 0 and word not in probEmission.keys():
        return 0.00001;
    # IF WORD NOT EXIST THEN SET NOUN PROBABILITY 1
    elif type == 1 and word not in probEmission.keys():
        prob_row = np.zeros(len(tagset) + 1);
        index_of_noun = tagset.index("NOUN");
        prob_row[index_of_noun] = 1;
        return prob_row[index_of_tag];
    # IF WORD NOT EXIST THEN SET NOUN AND VERB EQUI PROBABILITY (0.5)
    elif type == 2 and word not in probEmission.keys():
        prob_row = np.zeros(len(tagset) + 1);
        index_of_noun = tagset.index("NOUN");
        index_of_verb = tagset.index("VERB");
        prob_row[index_of_noun] = 0.5;
        prob_row[index_of_verb] = 0.5;
        return prob_row[index_of_tag];
    # IF WORD NOT EXIST THEN EVERY TAG HAS PROBABILITY 1/#(pos_tags)
    elif type == 3 and word not in probEmission.keys():
        unk_prob = 1 / len(tagset);
        prob_row = np.full(len(tagset) + 1, unk_prob);
        return prob_row[index_of_tag];
    # IF WORD NOT EXIST THEN EVERY TAG HAS POS TAGGER STATISTICS FOR EVERY WORD IN DEV THAT HAS 1 OCCURENCE
    elif type == 4 and word not in probEmission.keys():
        return statistics[index_of_tag];
    elif type == 5 and word in probEmissionLemma.keys():
        word = ntlk.lemmatize(word);
        return probEmissionLemma[word][index_of_tag]
    else:
        return 0.00001;

# DECODING WITH VITERBI

In [750]:
def viterbi(sentence, tagset, probEmission, probTransition, smoothingType, statistics, probEmissionLemmas):
    words = tokenize_sentence(sentence);
    start_tag = "Q0";
    viterbi_matrix = np.zeros((len(tagset),len(words)));
    backtrace = np.zeros(len(words), dtype = int);
    probabilites = np.zeros(len(words));
    t = 0;
   
    for word in words:
        # Calculate viterbi column for every tag possible
        for tag in tagset:
            index_of_tag = tagset.index(tag);
            #Get Emission probabilty of word ( HERE WHEN CAN APPLY SMOOTHING)
            probE = selectSmoothing(smoothingType, probEmission, word, index_of_tag, tagset, statistics, probEmissionLemmas);

            if probE == 0:
                probE = np.log(0.00001)
            else:
                probE = np.log(probE)

            #Run first iteration of viterbi to initialize first column
            if t == 0:
                tran_tag = "%s_%s" % (tag,start_tag);   
                probT = np.log(0.00001);
                if tran_tag in probTransition.keys():
                    probT = np.log(probTransition[tran_tag]);
                viterbi_matrix[index_of_tag][t] = probE + probT;
            else:
                max_tmp = np.zeros(len(tagset));
                for i in range(0,len(tagset)):
                    tran_tag = "%s_%s" % (tag,tagset[i]);
                    probT = np.log(0.00001);
                    if tran_tag in probTransition.keys():
                        probT = np.log(probTransition[tran_tag]);

                    max_tmp[i] = viterbi_matrix[i,t-1] + probT;
                viterbi_matrix[index_of_tag,t] = np.amax(max_tmp) + probE;

        index_max_values = np.argmax(viterbi_matrix[:,t]);  
        backtrace[t] = index_max_values;
        probabilites[t] = viterbi_matrix[index_max_values,t];
        t= t +1;
    return backtrace,probabilites;

In [751]:
def printPosTag(sentence, tagset, backtrace , probabilities):
    i = 0;
    words = tokenize_sentence(sentence);
    for word in words:
        print("WORD_ROW -> " + word + " " + tagset[backtrace[i]] + "     prob -> " + str(probabilities[i]))
        i = i + 1;

## TRAIN AND DECODING GREEK

In [752]:
SMOOTHING_TYPE = 0;
LOAD_STATISTIC_POS = False;
LOAD_EMISSION_PROB_LEMMAS = False;

sentences_greek, tagset_greek, probEmission_greek, probTransition_greek, statistics_greek, probEmissionLemmas_greek  =  train("greek", LOAD_STATISTIC_POS, LOAD_EMISSION_PROB_LEMMAS);

In [753]:
print("==================== TEST ON GREEK ================================== ")
backtrace_greek, probabilities_greek = viterbi(sentences_greek[0], tagset_greek, probEmission_greek, probTransition_greek, SMOOTHING_TYPE, statistics_greek, probEmissionLemmas_greek);
printPosTag(sentences_greek[0], tagset_greek, backtrace_greek, probabilities_greek);

WORD_ROW -> ἐρᾷ VERB     prob -> -9.268481272273874
WORD_ROW -> μὲν PART     prob -> -15.012261505759101
WORD_ROW -> ἁγνὸς ADJ     prob -> -25.780646354223645
WORD_ROW -> οὐρανὸς NOUN     prob -> -35.304979691779565
WORD_ROW -> τρῶσαι VERB     prob -> -37.33968533961801
WORD_ROW -> χθόνα NOUN     prob -> -46.42668418324466
WORD_ROW -> , PUNCT     prob -> -48.80483279303211
WORD_ROW -> ἔρως NOUN     prob -> -59.96014696849298
WORD_ROW -> δὲ PART     prob -> -64.59106304839658
WORD_ROW -> γαῖαν NOUN     prob -> -72.5121179919485
WORD_ROW -> λαμβάνει VERB     prob -> -74.95228874789511
WORD_ROW -> γάμου NOUN     prob -> -85.04102125627635
WORD_ROW -> τυχεῖν VERB     prob -> -94.40983865023648
WORD_ROW -> · PUNCT     prob -> -97.06068988639385


## TRAIN AND DECODING LATIN

In [754]:
sentences_latin, tagset_latin, probEmission_latin, probTransition_latin, statistics_latin,probEmissionLemmas_latin = train("latin",LOAD_STATISTIC_POS, LOAD_EMISSION_PROB_LEMMAS);

In [755]:
print("==================== TEST ON LATIN ================================== ")
backtrace_latin, probabilities_latin = viterbi(sentences_latin[0], tagset_latin, probEmission_latin, probTransition_latin, SMOOTHING_TYPE, statistics_latin,probEmissionLemmas_latin);
printPosTag(sentences_latin[0], tagset_latin, backtrace_latin, probabilities_latin);

WORD_ROW -> + PUNCT     prob -> -3.0409226340125732
WORD_ROW -> In ADP     prob -> -9.439532515808455
WORD_ROW -> Dei PROPN     prob -> -14.211431251275023
WORD_ROW -> nomine NOUN     prob -> -19.347702756413725
WORD_ROW -> regnante VERB     prob -> -25.41676081668182
WORD_ROW -> domno NOUN     prob -> -32.09007095933274
WORD_ROW -> nostro DET     prob -> -37.32125225570135
WORD_ROW -> Carulo PROPN     prob -> -44.352569396270525
WORD_ROW -> rege NOUN     prob -> -50.94443448474381
WORD_ROW -> Francorum NOUN     prob -> -59.12142501243105
WORD_ROW -> et CCONJ     prob -> -62.08688424838046
WORD_ROW -> Langobardorum NOUN     prob -> -69.5169467052356
WORD_ROW -> , PUNCT     prob -> -71.71924027267977
WORD_ROW -> anno NOUN     prob -> -77.47089829042083
WORD_ROW -> regni NOUN     prob -> -85.09864464321117
WORD_ROW -> eius DET     prob -> -89.89233065797255
WORD_ROW -> quo PRON     prob -> -98.0120052842971
WORD_ROW -> coepit VERB     prob -> -103.88455098746397
WORD_ROW -> Langobardiam 

In [756]:
def calculateAccuracy(language, tagset, probEmission, probTransition, smoothingType, statistics):
    probEmissionLemmas = dict();
    accuracy = 0;
    sentences_test = readFile(language,"test");
    count_tag_correct = 0;
    count_total_tag = 0;
    errors = dict();
    for sentence in sentences_test:
        backtrace_test, probabilities_test = viterbi(sentence, tagset, probEmission, probTransition, smoothingType, statistics, probEmissionLemmas);
         #Get real tag
        i = 0;
        for token in sentence:
            real_tag = token["upos"];
            if tagset.index(real_tag) == backtrace_test[i]:
                count_tag_correct = count_tag_correct + 1;
            else:
                errors[token["form"]] = [real_tag,tagset[backtrace_test[i]]]
            count_total_tag = count_total_tag + 1;
            i = i + 1;
    accuracy = count_tag_correct/count_total_tag
    return accuracy,errors;

### CALCULATE ACCURACY ON TEST SET OF GREEK

In [757]:
accuracy_greek,errors_greek = calculateAccuracy("greek", tagset_greek, probEmission_greek, probTransition_greek, SMOOTHING_TYPE, statistics_greek);
print("ACCURACY GREEK  " + str(accuracy_greek))


ACCURACY GREEK  0.729090128345818


### CALCULATE ACCURACY ON TEST SET OF LATIN

In [758]:
accuracy_latin,errors_latin = calculateAccuracy("latin", tagset_latin, probEmission_latin, probTransition_latin, SMOOTHING_TYPE, statistics_latin);
print("ACCURACY LATIN " + str(accuracy_latin))

ACCURACY LATIN 0.9552722289131609


In [759]:
def analizeErrors(errors,tagset):
    probabilites_error = dict();
    for word in errors:
        real_tag = errors[word][0]
        estimate_tag = errors[word][1]
        index_estimated_tag = tagset.index(estimate_tag);
        if real_tag in probabilites_error.keys():
            probabilites_error[real_tag][index_estimated_tag] = probabilites_error[real_tag][index_estimated_tag] + 1;
        else:
            prob_error = np.zeros(len(tagset), dtype = int);
            prob_error[index_estimated_tag] = 1;
            probabilites_error[real_tag] = prob_error;
    return probabilites_error

### Analize erros

In [760]:
analizeErrors(errors_greek,tagset_greek)

{'VERB': array([  1,   0, 123,   0,   0,   0, 413,   0, 124,   0, 296,   0,   0,
          0]),
 'ADV': array([ 3,  4,  0, 11,  0,  0, 10,  0, 37,  2, 17,  3, 44,  0]),
 'PRON': array([99,  0,  7,  0,  0,  0, 13,  0,  2,  0,  6,  0, 11,  1]),
 'NOUN': array([ 33,   1, 120,   0,   0,   0,   0,   0,  65,   1, 240,   0, 549,
          0]),
 'ADJ': array([  0,   0,  33,   0,   0,   0, 329,  13,  30,  25, 131,   0, 238,
          0]),
 'DET': array([ 0,  0,  0,  0,  0,  0,  0,  0,  0, 20,  0,  0,  1,  1]),
 'ADP': array([0, 0, 5, 0, 0, 0, 1, 0, 1, 0, 1, 0, 2, 0]),
 'SCONJ': array([ 0,  1, 11,  1,  0,  0,  0,  0,  1,  0,  0,  0,  1,  0]),
 'NUM': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 'CCONJ': array([0, 0, 6, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]),
 'PUNCT': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0]),
 'INTJ': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0]),
 'X': array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])}

In [761]:
analizeErrors(errors_latin,tagset_latin)

{'VERB': array([ 0,  2,  0,  7,  0,  0, 47,  0,  0,  0,  5, 40,  0,  0,  0]),
 'PROPN': array([  5,   0,   0,   0,   0,   0, 120,   0,   0,   0,   0, 107,   0,
          8,   1]),
 'ADV': array([0, 1, 0, 0, 4, 0, 3, 1, 0, 0, 0, 1, 1, 0, 0]),
 'DET': array([0, 2, 0, 0, 0, 0, 4, 0, 0, 6, 2, 0, 0, 0, 0]),
 'NOUN': array([ 7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  4, 17,  0,  7,  0]),
 'ADP': array([1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0]),
 'CCONJ': array([0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 2, 0, 0]),
 'ADJ': array([ 0,  1,  0,  0,  0,  0, 27,  0,  0,  0,  1, 15,  0,  2,  0]),
 'NUM': array([0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0]),
 'AUX': array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 4, 0]),
 'SCONJ': array([0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]),
 'PRON': array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])}