In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wall-street-journal/WSJ_02-21.pos


In [2]:
import string
from collections import defaultdict

**Reading Data**

In [3]:
file = '/kaggle/input/wall-street-journal/WSJ_02-21.pos'

In [4]:
with open(file,'r') as f:
    lines = f.readlines()

In [5]:
lines[:5]

['In\tIN\n', 'an\tDT\n', 'Oct.\tNNP\n', '19\tCD\n', 'review\tNN\n']

In [6]:
print("\t\tWord", "\tTag\n")
for i in range(5):
    print(f'line number {i+1}: {lines[i]}')

		Word 	Tag

line number 1: In	IN

line number 2: an	DT

line number 3: Oct.	NNP

line number 4: 19	CD

line number 5: review	NN



In [7]:
words = [line.split('\t')[0] for line in lines]
words[0]

'In'

In [8]:
freq = defaultdict(int)
for word in words:
    freq[word] += 1

In [9]:
vocab = [k for k,v in freq.items() if (v>1 and k!='\n')]

In [10]:
vocab.sort()

In [11]:
def assign_unk(word):
    punct = string.punctuation
    noun_suffs = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
    verb_suffs = ["ate", "ify", "ise", "ize"]
    adj_suffs = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
    adv_suffs = ["ward", "wards", "wise"]
    
    if any(char.isdigit() for char in word):
        return "--unk_digit--"

    elif any(char in punct for char in word):
        return "--unk_punct--"

    elif any(char.isupper() for char in word):
        return "--unk_upper--"

    elif any(word.endswith(suff) for suff in noun_suffs):
        return "--unk_noun--"

    elif any(word.endswith(suff) for suff in verb_suffs):
        return "--unk_verb"

    elif any(word.endswith(suff) for suff in adj_suffs):
        return "--unk_adj--"

    elif any(word.endswith(suff) for suff in adv_suffs):
        return "--unk_adv--"

    return "unk--"
    

In [12]:
def get_word_tag(line, vocab):
    if not(line.split()):
        word = "--n--"
        tag = "--s--"
    else:
        word,tag = line.split()
        if word not in vocab:
            word = assign_unk(word)
    return word, tag

In [13]:
get_word_tag('\n', vocab)

('--n--', '--s--')

In [14]:
get_word_tag('In\tIN\n',vocab)

('In', 'IN')

In [15]:
get_word_tag('tardigrade\tNN\n', vocab)

('unk--', 'NN')

In [16]:
get_word_tag('scrutinize\tVB\n', vocab)

('--unk_verb', 'VB')

In [17]:
tags = ['RB','NN','TO']
tags

['RB', 'NN', 'TO']

In [18]:
trans_counts = {
    ('NN', 'NN'): 16241,
    ('RB', 'RB'): 2263,
    ('TO', 'TO'): 2,
    ('NN', 'TO'): 5256,
    ('RB', 'TO'): 855,
    ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431,
    ('RB', 'NN'): 358,
    ('TO', 'RB'): 200
}

In [19]:
num_tags = len(tags)
trans_matrix = np.zeros((num_tags, num_tags))
trans_matrix

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [20]:
tags.sort()
tags

['NN', 'RB', 'TO']

In [21]:
for i in range(num_tags):
    for j in range(num_tags):
        tag_tuple = (tags[i],tags[j])
        trans_matrix[i,j] = trans_counts.get(tag_tuple)

trans_matrix

array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

In [22]:
def print_mat(matrix):
    print(pd.DataFrame(matrix,index=tags,columns=tags))

In [23]:
print_mat(trans_matrix)

         NN      RB      TO
NN  16241.0  2431.0  5256.0
RB    358.0  2263.0   855.0
TO    734.0   200.0     2.0


In [24]:
rows_sum = trans_matrix.sum(axis=1,keepdims=True)
trans_matrix/=rows_sum
print_mat(trans_matrix)

          NN        RB        TO
NN  0.678745  0.101596  0.219659
RB  0.102992  0.651036  0.245972
TO  0.784188  0.213675  0.002137


In [25]:
import math
trans_mat = np.copy(trans_matrix)
d = np.diag(trans_mat)
d = np.reshape(d,(3,1))
d

array([[0.67874457],
       [0.65103567],
       [0.00213675]])

In [26]:
# d += np.vectorize(np.log)(rows_sum)
d = d + np.log(rows_sum)
np.fill_diagonal(trans_mat, d)
print_mat(trans_mat)

           NN        RB        TO
NN  10.761549  0.101596  0.219659
RB   0.102992  8.804673  0.245972
TO   0.784188  0.213675  6.843752
