# Reading the Training File

In [76]:
train_file_path = '/home/parsa/Development/POS-tagger/POS_CORPUS_FOR_STUDENTS/POS_train.pos'
dev_file_path = '/home/parsa/Development/POS-tagger/POS_CORPUS_FOR_STUDENTS/POS_dev.words'
test_file_path = '/home/parsa/Development/POS-tagger/POS_CORPUS_FOR_STUDENTS/'

### Calculating Emission Probabilities

In this section we create a dictionary with all of the words in the training set as keys and a counter of every POS associated with each word as values.

In [None]:
from collections import Counter
import numpy as np


In [58]:
word_pos_dict = {}
pos_set = set()
with open(train_file_path, 'r') as file:
    for line in file:
        line = line.strip()
        if line:
            word, tag = line.split('\t')
            pos_set.add(tag)
            if word in word_pos_dict:
                if tag in word_pos_dict[word]:
                    word_pos_dict[word][tag] += 1
                else:
                    word_pos_dict[word][tag] = 1
            else:
                c = Counter()
                c[tag] = 1
                word_pos_dict[word] = c
                
                
pos_set = list(pos_set)
word_set = list(word_pos_dict)

In [59]:
word_pos_dict

{'In': Counter({'IN': 1735, 'RB': 1, 'NNP': 3, 'RBR': 1}),
 'an': Counter({'DT': 3142, ',': 1}),
 'Oct.': Counter({'NNP': 317, 'NN': 1}),
 '19': Counter({'CD': 100}),
 'review': Counter({'NN': 36, 'VB': 21, 'VBP': 1}),
 'of': Counter({'IN': 22925, 'RP': 2, 'RB': 2}),
 '``': Counter({'``': 6967}),
 'The': Counter({'DT': 6795, 'NNP': 37, 'VB': 1}),
 'Misanthrope': Counter({'NN': 3}),
 "''": Counter({"''": 6787}),
 'at': Counter({'IN': 4361, 'RP': 1}),
 'Chicago': Counter({'NNP': 197}),
 "'s": Counter({'POS': 8079, 'VBZ': 1222, 'PRP': 8, 'NNP': 1, 'NNS': 1}),
 'Goodman': Counter({'NNP': 7}),
 'Theatre': Counter({'NNP': 5}),
 '(': Counter({'(': 1153}),
 'Revitalized': Counter({'VBN': 1}),
 'Classics': Counter({'NNS': 1}),
 'Take': Counter({'VBP': 1, 'VB': 8}),
 'the': Counter({'DT': 41098, 'VBP': 1, 'NNP': 5, 'NN': 1, 'JJ': 2}),
 'Stage': Counter({'NN': 1, 'NNP': 2}),
 'in': Counter({'IN': 14957, 'RP': 173, 'RB': 53, 'FW': 2, 'RBR': 1}),
 'Windy': Counter({'NNP': 1}),
 'City': Counter({'NN

In [71]:
num_words = len(word_set)
num_pos = len(pos_set)

emission_probs = np.ndarray(shape=(num_pos,num_words))

for pi in range(num_pos):
    for wi in range(num_words):
        
        word = word_set[wi]
        pos = pos_set[pi]

        word_poses = word_pos_dict[word]

        total = sum(word_poses.values())
            
        if pos in word_poses:
            emission_probs[pi,wi] = word_poses[pos]/total
        else:
            emission_probs[pi,wi] = 0
            
            
        

In [72]:
emission_probs

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [73]:
lines = []
with open(train_file_path, 'r') as file:
    line_buf = []
    for line in file:
        line = line.strip()
        if line:
            _, tag = line.split('\t')
            line_buf.append(tag)

        else:
            lines.append(line_buf.copy())
            line_buf = []


pos_successor_dict = {}
for line in lines:
    for i in range(len(line)):
        if i < len(line)-1:
            next_pos = line[i+1]
            pos = line[i]
            if pos in pos_successor_dict:
                c = pos_successor_dict[pos]
                if next_pos in d:
                    c[next_pos] += 1
                else:
                    c[next_pos] = 1
            else:
                c = Counter()
                c[next_pos] = 1
                pos_successor_dict[pos] = c

                
transiton_probs = np.ndarray(shape=(num_pos, num_pos))
for i in range(num_pos):
    
    pos = pos_set[i]
    next_poses = pos_successor_dict[pos]
    total = sum(next_poses.values())
    
    for j in range(num_pos):
        
        next_pos = pos_set[j]
        if next_pos in next_poses:
            transiton_probs[i,j] = next_poses[next_pos]/total
        else:
            transiton_probs[i,j] = 0

In [74]:
transiton_probs

array([[0.01034483, 0.        , 0.00193966, ..., 0.00474138, 0.        ,
        0.0075431 ],
       [0.        , 0.        , 0.00443459, ..., 0.        , 0.        ,
        0.        ],
       [0.00238347, 0.000227  , 0.00491828, ..., 0.00533444, 0.0001135 ,
        0.000454  ],
       ...,
       [0.        , 0.        , 0.00186654, ..., 0.00046664, 0.        ,
        0.00046664],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00814996, ..., 0.        , 0.        ,
        0.        ]])

In [84]:
def viterbi(y, A, B, Pi=None):
    """
    Return the MAP estimate of state trajectory of Hidden Markov Model.

    Parameters
    ----------
    y : array (T,)
        Observation state sequence. int dtype.
    A : array (K, K)
        State transition matrix. See HiddenMarkovModel.state_transition  for
        details.
    B : array (K, M)
        Emission matrix. See HiddenMarkovModel.emission for details.
    Pi: optional, (K,)
        Initial state probabilities: Pi[i] is the probability x[0] == i. If
        None, uniform initial distribution is assumed (Pi[:] == 1/K).

    Returns
    -------
    x : array (T,)
        Maximum a posteriori probability estimate of hidden state trajectory,
        conditioned on observation sequence y under the model parameters A, B,
        Pi.
    T1: array (K, T)
        the probability of the most likely path so far
    T2: array (K, T)
        the x_j-1 of the most likely path so far
    """
    # Cardinality of the state space
    K = A.shape[0]
    # Initialize the priors with default (uniform dist) if not given by caller
    Pi = Pi if Pi is not None else np.full(K, 1 / K)
    T = len(y)
    T1 = np.empty((K, T), 'd')
    T2 = np.empty((K, T), 'B')

    # Initilaize the tracking tables from first observation
    T1[:, 0] = Pi * B[:, y[0]]
    T2[:, 0] = 0

    # Iterate throught the observations updating the tracking tables
    for i in range(1, T):
        T1[:, i] = np.max(T1[:, i - 1] * A.T * B[np.newaxis, :, y[i]].T, 1)
        T2[:, i] = np.argmax(T1[:, i - 1] * A.T, 1)

    # Build the output, optimal model trajectory
    x = np.empty(T, 'B')
    x[-1] = np.argmax(T1[:, T - 1])
    for i in reversed(range(1, T)):
        x[i - 1] = T2[x[i], i]

    return x, T1, T2

In [82]:
lines = []
words = word_set.copy()
emissions = emission_probs.copy()
with open(dev_file_path, 'r') as file:
    line_buf = []
    for line in file:
        line = line.strip()
        if line:
            if line in word_set:
                line_buf.append(words.index(line))
            else:
                words.append(line)
                emissions
        else:
            lines.append(np.array(line_buf))
            line_buf = []

In [86]:
path, _, _ = viterbi(lines[0], transiton_probs, emission_probs)

In [87]:
[pos_set[i] for i in path]

['DT',
 'NN',
 'POS',
 'NN',
 'MD',
 'VB',
 'VBN',
 'IN',
 'JJ',
 'NN',
 'NNS',
 'DT',
 'NN',
 ',',
 'IN',
 'NNS',
 'IN',
 'NN',
 ',',
 'NN',
 ',',
 'NN',
 'CC',
 'NN',
 '.']

In [89]:
[word_set[i] for i in lines[0]]

['The',
 'economy',
 "'s",
 'temperature',
 'will',
 'be',
 'taken',
 'from',
 'several',
 'vantage',
 'points',
 'this',
 'week',
 ',',
 'with',
 'readings',
 'on',
 'trade',
 ',',
 'output',
 ',',
 'housing',
 'and',
 'inflation',
 '.']