## Carga del modelo HMM  previamente entrenado

In [1]:
# Cargamos las probabilidades del modelo HMM
import numpy as np
transitionProdbict = np.load('transitionHMM.npy', allow_pickle='True').item()
emissionProbdict = np.load('emissionHMM.npy', allow_pickle='True').item()


In [2]:
transitionProdbict

{'NOUN|DET': 0.7051234338171443,
 'PUNCT|NOUN': 0.21745022412175544,
 'ADP|PUNCT': 0.10090433127082342,
 'ADJ|ADP': 0.012436935351402088,
 'NOUN|ADJ': 0.21575826037842416,
 'ADP|NOUN': 0.32356926925883456,
 'SCONJ|ADP': 0.017834095975595446,
 'VERB|SCONJ': 0.09595070422535211,
 'VERB|VERB': 0.029147490685952224,
 'NUM|VERB': 0.020600482138943676,
 'NOUN|NUM': 0.5239130434782608,
 '_|NOUN': 0.05962681121651204,
 'ADP|_': 0.8613312202852615,
 'DET|ADP': 0.5141382142438109,
 'VERB|ADP': 0.057022175290390706,
 '_|VERB': 0.03550295857988166,
 'DET|NOUN': 0.014177004065464401,
 'CCONJ|NOUN': 0.04951527155217346,
 'ADJ|CCONJ': 0.08459422283356259,
 'ADP|ADJ': 0.24936458627506355,
 'NOUN|ADP': 0.19464977120732135,
 'ADJ|NOUN': 0.17043677681642866,
 'PUNCT|ADJ': 0.24795255577520475,
 'VERB|PUNCT': 0.06964937331429479,
 'ADP|VERB': 0.24479509094893712,
 'PRON|DET': 0.01786378861183476,
 'DET|PRON': 0.024761904761904763,
 'PROPN|DET': 0.12095273539263118,
 'PROPN|PROPN': 0.22665345553628932,
 'AD

In [3]:
stateSet = set([w.split('|')[1] for w in list(emissionProbdict.keys())])
stateSet

{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'INTJ',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'SYM',
 'VERB',
 '_'}

In [4]:
tagStateDict = {}
for i, state in enumerate(stateSet):
    tagStateDict[state] = i
    
tagStateDict

{'ADJ': 0,
 'NUM': 1,
 'PUNCT': 2,
 '_': 3,
 'PRON': 4,
 'PART': 5,
 'SYM': 6,
 'DET': 7,
 'ADP': 8,
 'SCONJ': 9,
 'INTJ': 10,
 'ADV': 11,
 'NOUN': 12,
 'CCONJ': 13,
 'PROPN': 14,
 'AUX': 15,
 'VERB': 16}

## Distribucion inicial de estados latentes

In [5]:
initTagStateProb = {} # \rho_i^{(0)}
from conllu import parse_incr 
wordList = []
data_file = open("UD_Spanish-AnCora/es_ancora-ud-dev.conllu", "r", encoding="utf-8")
count = 0 # cuenta la longitud del corpus
for tokenlist in parse_incr(data_file):
  count += 1
  tag = tokenlist[0]['upos']
  if tag in initTagStateProb.keys():
    initTagStateProb[tag] += 1
  else:
    initTagStateProb[tag] = 1

for key in initTagStateProb.keys():
  initTagStateProb[key] /= count

initTagStateProb

{'DET': 0.36275695284159615,
 'PROPN': 0.1124546553808948,
 'ADP': 0.15538089480048367,
 'PRON': 0.06348246674727932,
 'SCONJ': 0.02418379685610641,
 'ADV': 0.056831922611850064,
 'PUNCT': 0.08222490931076179,
 'VERB': 0.02418379685610641,
 'ADJ': 0.010882708585247884,
 'CCONJ': 0.032648125755743655,
 'NOUN': 0.02720677146311971,
 '_': 0.009068923821039904,
 'INTJ': 0.0006045949214026602,
 'AUX': 0.016324062877871828,
 'NUM': 0.01995163240628779,
 'PART': 0.0018137847642079807}

In [6]:
np.array([initTagStateProb[k] for k in initTagStateProb.keys()]).sum()

np.float64(1.0)

## Construccion del modelo de viterbi

In [7]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
def ViterbiMatrix(secuencia, transitionProdbict = transitionProdbict, emissionProbdict = emissionProbdict, tagStateDict = tagStateDict, initTagStateProb = initTagStateProb):
    
    seq = word_tokenize(secuencia)
    viterbiProb = np.zeros((17, len(seq))) # matriz de probabilidades
    
    for key in tagStateDict.keys():
        tag_row = tagStateDict[key]
        word_tag = seq[0].lower() + '|' + key
        if word_tag in emissionProbdict.keys():
            viterbiProb[tag_row][0] = initTagStateProb[key] * emissionProbdict[word_tag]
            
    # computo prob. siguientes columnas
    for col in range(1, len(seq)):
        for key in tagStateDict.keys():
            tag_row = tagStateDict[key]
            word_tag = seq[col].lower() + '|' + key   
            if word_tag in emissionProbdict.keys():
                possible_probs = []
                for key2 in tagStateDict.keys():
                    tag_row2 = tagStateDict[key2]
                    tag_prevtag = key + '|' + key2
                    if tag_prevtag in transitionProdbict.keys():
                        if viterbiProb[tag_row2][col-1] > 0:
                            possible_probs.append(viterbiProb[tag_row2, col-1]*transitionProdbict[tag_prevtag]*emissionProbdict[word_tag])
                viterbiProb[tag_row][col] = max(possible_probs)          
            
    return viterbiProb

matrix = ViterbiMatrix("el mundo es peque√±o")
matrix

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.48896690e-10],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.24339097e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.01067137e-04, 3.84542966e-10, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 6.84724476e