Exercise 8.1: HMM modeling of sentences.

In [1]:
import numpy as np
import nltk
import hmmlearn, hmmlearn.hmm

In [2]:
temp_file = open('hmm_sentences.txt', 'r', encoding='utf-8', errors='ignore')
temp = temp_file.readlines()

In [3]:
# delete the \n at the end of each sentence
sentences = []
for i in range(len(temp)):
    sentences.append(temp[i][:-1])

In [4]:
mycrawled_nltktexts=[]
for k in range(len(sentences)):    
    temp_tokenizedtext=nltk.word_tokenize(sentences[k])
    temp_nltktext=nltk.Text(temp_tokenizedtext)
    mycrawled_nltktexts.append(temp_nltktext)


In [5]:
#%% Make all crawled texts lowercase
mycrawled_lowercasetexts=[]
for k in range(len(mycrawled_nltktexts)):    
    temp_lowercasetext=[]
    for l in range(len(mycrawled_nltktexts[k])):
        lowercaseword=mycrawled_nltktexts[k][l].lower()
        temp_lowercasetext.append(lowercaseword)
    temp_lowercasetest=nltk.Text(temp_lowercasetext)
    mycrawled_lowercasetexts.append(temp_lowercasetext)

In [6]:
#%% Convert a POS tag for WordNet
def tagtowordnet(postag):
    wordnettag=-1
    if postag[0]=='N':
        wordnettag='n'
    elif postag[0]=='V':
        wordnettag='v'
    elif postag[0]=='J':
        wordnettag='a'
    elif postag[0]=='R':
        wordnettag='r'
    return(wordnettag)

In [7]:
#%% POS tag and lemmatize the loaded texts
# Download tagger and wordnet resources if you do not have them already
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
lemmatizer=nltk.stem.WordNetLemmatizer()

def lemmatizetext(nltktexttolemmatize):
    # Tag the text with POS tags
    taggedtext=nltk.pos_tag(nltktexttolemmatize)
    # Lemmatize each word text
    lemmatizedtext=[]
    for l in range(len(taggedtext)):
        # Lemmatize a word using the WordNet converted POS tag
        wordtolemmatize=taggedtext[l][0]
        wordnettag=tagtowordnet(taggedtext[l][1])
        if wordnettag!=-1:
            lemmatizedword=lemmatizer.lemmatize(wordtolemmatize,wordnettag)
        else:
            lemmatizedword=wordtolemmatize
        # Store the lemmatized word
        lemmatizedtext.append(lemmatizedword)
    return(lemmatizedtext)

mycrawled_lemmatizedtexts=[]
for k in range(len(mycrawled_lowercasetexts)):
    lemmatizedtext=lemmatizetext(mycrawled_lowercasetexts[k])
    lemmatizedtext=nltk.Text(lemmatizedtext)
    mycrawled_lemmatizedtexts.append(lemmatizedtext)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
#%% Find the vocabulary, in a distributed fashion
import numpy
myvocabularies=[]
myindices_in_vocabularies=[]
# Find the vocabulary of each document
for k in range(len(mycrawled_lemmatizedtexts)):
    # Get unique words and where they occur
    temptext=mycrawled_lemmatizedtexts[k]
    uniqueresults=numpy.unique(temptext,return_inverse=True)
    uniquewords=uniqueresults[0]
    wordindices=uniqueresults[1]
    # Store the vocabulary and indices of document words in it
    myvocabularies.append(uniquewords)
    myindices_in_vocabularies.append(wordindices)

In [9]:
# Unify the vocabularies.
# First concatenate all vocabularies
tempvocabulary=[]  
for k in range(len(mycrawled_lemmatizedtexts)):
    tempvocabulary.extend(myvocabularies[k])
# Find the unique elements among all vocabularies
uniqueresults=numpy.unique(tempvocabulary,return_inverse=True)
unifiedvocabulary=uniqueresults[0]
wordindices=uniqueresults[1]
# Translate previous indices to the unified vocabulary.
# Must keep track where each vocabulary started in 
# the concatenated one.
vocabularystart=0
myindices_in_unifiedvocabulary=[]
for k in range(len(mycrawled_lemmatizedtexts)):
    # In order to shift word indices, we must temporarily
    # change their data type to a Numpy array
    tempindices=numpy.array(myindices_in_vocabularies[k])
    tempindices=tempindices+vocabularystart
    tempindices=wordindices[tempindices]
    myindices_in_unifiedvocabulary.append(tempindices)
    vocabularystart=vocabularystart+len(myvocabularies[k])

In [10]:
# HMMlearn expects the data to be provided 
# as a (nsamples,1) 2D-array, where the 2nd dimension has just
# one element, containing list of indices into a vocabulary, 
# all documents concatenated together, and separately a 
# list of lenghts of the individual documents.
# Create concatenated index list from previously
# crawled and processed documents
concatenated_data=[]
documentlengths=[]
for k in range(len(myindices_in_unifiedvocabulary)):
    concatenated_data.extend(myindices_in_unifiedvocabulary[k])
    documentlengths.append(len(myindices_in_unifiedvocabulary[k]))
concatenated_data=numpy.matrix(concatenated_data).T


In [11]:
# Fit the model
myhmm = hmmlearn.hmm.MultinomialHMM(n_components=5, n_iter=100, verbose=True)
myhmm_fitted=myhmm.fit(concatenated_data,lengths=documentlengths)


         1      -67782.7098             +nan
         2      -60292.7918       +7489.9180
         3      -60247.3989         +45.3928
         4      -60179.1843         +68.2146
         5      -60074.1303        +105.0540
         6      -59923.7325        +150.3978
         7      -59727.5595        +196.1730
         8      -59481.0719        +246.4876
         9      -59153.9201        +327.1518
        10      -58682.3186        +471.6014
        11      -57996.1625        +686.1561
        12      -57081.3557        +914.8069
        13      -55967.5975       +1113.7582
        14      -54613.9714       +1353.6261
        15      -53179.1325       +1434.8390
        16      -52025.4411       +1153.6913
        17      -51097.6607        +927.7804
        18      -50331.6257        +766.0350
        19      -49712.3941        +619.2316
        20      -49235.2382        +477.1559
        21      -48963.8771        +271.3610
        22      -48855.2990        +108.5782
        23

In [12]:
# Inspect start, transition, and emission probabilities
myhmm_fitted.startprob_

array([1.58298330e-231, 1.03045362e-116, 1.10975106e-047, 1.00000000e+000,
       0.00000000e+000])

In [13]:
myhmm_fitted.emissionprob_

array([[0.00000000e+000, 3.16230537e-071, 8.59669574e-047,
        0.00000000e+000, 3.21227272e-059, 3.66364105e-177,
        2.60590869e-184, 6.43284276e-014, 5.74703531e-156,
        4.95392442e-001, 3.11665218e-224, 9.45052505e-151,
        3.43117982e-168, 4.55711753e-154, 6.57243970e-176,
        8.85430687e-179, 1.57565813e-160, 3.67327542e-155,
        4.62774316e-173, 1.82866865e-277, 2.70163457e-173,
        2.10805294e-002, 1.77866966e-002, 0.00000000e+000,
        3.00397544e-001, 2.79824285e-291, 8.25528004e-169,
        1.22468639e-260, 3.55733706e-002, 9.71418258e-157,
        4.68337098e-279, 3.72156853e-169, 3.35893403e-171,
        0.00000000e+000, 1.97553939e-002, 1.10014013e-001,
        6.44963235e-283, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 1.56868344e-216, 5.41001669e-173,
        1.07016626e-008],
       [4.26936750e-200, 1.98038784e-023, 3.02420346e-015,
        1.45578564e-029, 2.87369777e-040, 6.77135207e-002,
        3.89802733e-002, 6.125

In [14]:
myhmm_fitted.transmat_

array([[3.62103648e-029, 2.95608160e-096, 1.48391854e-035,
        3.68909265e-001, 6.31090735e-001],
       [3.12310956e-002, 1.01540558e-001, 1.69066158e-001,
        1.71529079e-016, 6.98162189e-001],
       [2.08789934e-165, 7.77757317e-001, 2.22242683e-001,
        2.64112832e-020, 5.40701830e-017],
       [0.00000000e+000, 8.51201896e-030, 1.00000000e+000,
        1.30860756e-024, 1.44804979e-070],
       [4.70325988e-001, 1.80864491e-083, 1.48377176e-043,
        5.29674012e-001, 2.00633599e-040]])

In [15]:
np.sum(myhmm_fitted.transmat_, axis=1)

array([1., 1., 1., 1., 1.])

Yes, the states seem to correspond to meaningful properties of the
simplified language since the probability of the states seems to be distributed
uniformly.



Exercise 9.1: Inside-outside algorithm.
Use the inside-outside algorithm for the Chomsky normal form grammar to calculate the probability of the sentence "a wise fox can help the friendly insightful cat".
Report your computation and the resulting probability.

In [16]:
# Lowercase grammar rules
grammar = {
    's': [('stmany', 1.0)],
    'stmany': [('s1 .', 0.6), ('s1 , but stmany', 0.4)],
    's1': [('subj qverb1 qverb2 obj', 1.0)],
    'subj': [('article desc noun', 1.0)],
    'desc': [('adjective', 0.7), ('adjective desc', 0.3)],
    'obj': [('article desc noun', 1.0)],
    'qverb1': [('can', 0.2), ('will', 0.5), ('may', 0.3)],
    'article': [('a', 0.6), ('the', 0.4)],
    'qverb2': [('explain', 0.4), ('help', 0.2), ('answer', 0.4)],
    'adjective': [('wise', 0.3), ('friendly', 0.5), ('insightful', 0.2)],
    'noun': [('cat', 0.7), ('dog', 0.2), ('fox', 0.1)]
}

# Sentence
sentence = 'a wise fox can help the friendly insightful cat'

In [17]:
# Convert sentence to list of words
words = sentence.split()
print(words)
# Initialize inside and outside tables
inside = np.zeros((len(words), len(words)))
outside = np.zeros((len(words), len(words)))

['a', 'wise', 'fox', 'can', 'help', 'the', 'friendly', 'insightful', 'cat']


In [18]:
# Inside algorithm
for i in range(len(words)):
    for j in range(i, -1, -1):
        for rule in grammar:
            for expansion in grammar[rule]:
                if expansion[0] == words[j]:
                    inside[j][i] += expansion[1]
                elif len(expansion[0].split()) == 2:
                    for k in range(j, i):
                        inside[j][i] += expansion[1] * inside[j][k] * inside[k+1][i]

In [19]:
# Outside algorithm
outside[0][len(words)-1] = 1
for i in range(len(words)):
    for j in range(i, -1, -1):
        for rule in grammar:
            for expansion in grammar[rule]:
                if expansion[0] == words[j]:
                    outside[j][i] += expansion[1]
                elif len(expansion[0].split()) == 2:
                    for k in range(j, i):
                        outside[j][i] += expansion[1] * outside[j][k] * inside[k+1][i]

In [21]:
# Calculate probabilities and normalize
probabilities = np.zeros((len(words), len(words)))
for i in range(len(words)):
    for j in range(i, -1, -1):
        probabilities[j][i] = inside[j][i] * outside[j][i]
    total_prob = sum(probabilities[j][i] for j in range(i + 1) for i in range(len(words)))
    probabilities /= total_prob

print('P(' + ' '.join(words) + ') = ' + str(probabilities[0][len(words)-1]))

P(a wise fox can help the friendly insightful cat) = 0.8874280164911058
