In [3]:
# Goals: deal with words that aren't present in the vocabulary when working with other text sources
# Also, read text files, work with defaultdict, and work with string data

In [8]:
import string
from collections import defaultdict

In [9]:
# Read the tagged corpus into memory: one string per line, trailing newline kept
with open('WSJ_02-21.pos', 'r') as f:
    lines = list(f)


In [10]:
# Header naming the two tab-separated columns
print("\t\tWord", "\tTag\n")

# Preview the first five entries, numbering them from 1
for i in range(5):
    line_number = i + 1
    print(f'line number {line_number}: {lines[i]}')

		Word 	Tag

line number 1: In	IN

line number 2: an	DT

line number 3: Oct.	NNP

line number 4: 19	CD

line number 5: review	NN



In [12]:
lines[0]

'In\tIN\n'

In [23]:
# Build the vocabulary: every word that occurs at least twice in the dataset

# The word is the first tab-separated field of each line
words = [entry.split('\t')[0] for entry in lines]

# Tally occurrences; defaultdict(int) starts every unseen key at 0
freq = defaultdict(int)
for word in words:
    freq[word] += 1

# Keep words seen more than once, skip the bare '\n' entry, and sort the result
vocab = sorted(token for token, count in freq.items() if count > 1 and token != '\n')

# Show a small slice of the sorted vocabulary
for i in range(4000, 4005):
    print(vocab[i])

# Generally would write the vocabulary to disk

Early
Earnings
Earth
Earthquake
East


In [28]:
# Apply the vocab to a new corpus. Some of its words will be missing from the vocab. Classify the type of each unknown word and assign it a corresponding `unknown token`

def assign_unk(word):
    """Return the unknown-word token that best describes `word`.

    The checks run in a fixed priority order and the first match wins:
    any digit, any punctuation character, any upper-case character, then a
    noun, verb, adjective or adverb suffix. A word matching none of them
    gets the plain "--unk--" token.
    """
    punct = set(string.punctuation)

    # (token, suffixes) pairs, listed in the order they must be tried
    suffix_rules = (
        ("--unk_noun--", ("action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty")),
        ("--unk_verb--", ("ate", "ify", "ise", "ize")),
        ("--unk_adj--", ("able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous")),
        ("--unk_adv--", ("ward", "wards", "wise")),
    )

    # Character-level checks come before any suffix check
    if any(ch.isdigit() for ch in word):
        return "--unk_digit--"
    if any(ch in punct for ch in word):
        return "--unk_punct--"
    if any(ch.isupper() for ch in word):
        return "--unk_upper--"

    # str.endswith accepts a tuple and is true if any one of the suffixes matches
    for token, suffixes in suffix_rules:
        if word.endswith(suffixes):
            return token

    # Nothing matched: plain unknown
    return "--unk--"

# By augmenting the dataset to include these unknown word tokens you are helping the tagger have a better idea of the appropriate tag for these words


In [33]:
# Getting the correct tag for a word
def get_word_tag(line, vocab):
    """Split one dataset line into a (word, tag) pair.

    A line with no non-whitespace content yields ('--n--', '--s--').
    Otherwise the line must contain exactly two whitespace-separated fields
    (unpacking raises ValueError if it does not). A word that is not in
    `vocab` is replaced by the token returned from assign_unk.
    """
    fields = line.split()

    # Empty line: return the placeholder word and tag
    if not fields:
        return '--n--', '--s--'

    word, tag = fields
    if word not in vocab:
        word = assign_unk(word)
    return word, tag

In [34]:
get_word_tag('\n', vocab)

('--n--', '--s--')

In [35]:
get_word_tag('In\tIN\n', vocab)

('In', 'IN')

In [36]:
get_word_tag('tardigrade\tNN\n', vocab)

('--unk--', 'NN')

In [37]:
get_word_tag('scrutinize\tVB\n', vocab)

('--unk_verb--', 'VB')

# NLP_C2_W2_lecture_notebook_numpy.ipynb

In [41]:
# Create a matrix using some tag information and then modify it
import numpy as np
import pandas as pd

In [42]:
# Just use a subset of tags
tags = ['RB', 'NN', 'TO']

# Transition counts: the number of transitions from one tag to another. Use these when working with tags only.

# Emission counts: the number of times a particular (tag, word) pair appeared in the training dataset. Use these when working with both tags and words.

In [43]:
# Counts for each ordered pair of tags: keys are 2-tuples of tag strings,
# values are integer counts.
# presumably the key order is (previous tag, next tag) — TODO confirm against the data source
transition_counts = {
    ('NN', 'NN'): 16241,
    ('RB', 'RB'): 2263,
    ('TO', 'TO'): 2,
    ('NN', 'TO'): 5256,
    ('RB', 'TO'): 855,
    ('TO', 'NN'): 734,
    ('NN', 'RB'): 2431,
    ('RB', 'NN'): 358,
    ('TO', 'RB'): 200
}

In [44]:
# Allocate a square float matrix of zeros with one row and one column per tag;
# the counts are copied into it in a later cell
num_tags = len(tags)
matrix_shape = (num_tags, num_tags)
transition_matrix = np.zeros(matrix_shape)

transition_matrix

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [45]:
transition_matrix.shape

(3, 3)

In [48]:
sorted_tags = sorted(tags)

sorted_tags

['NN', 'RB', 'TO']

In [51]:
# Fill the matrix: row i / column j holds the count for (sorted_tags[i], sorted_tags[j])
for i in range(num_tags):
    for j in range(num_tags):
        tag_tuple = (sorted_tags[i], sorted_tags[j])
        # Default to 0 for a pair that has no entry. Without the default, .get()
        # returns None, which NumPy stores as NaN in a float array and which
        # would then spread through any later row sum or normalisation.
        transition_matrix[i, j] = transition_counts.get(tag_tuple, 0)

transition_matrix

array([[1.6241e+04, 2.4310e+03, 5.2560e+03],
       [3.5800e+02, 2.2630e+03, 8.5500e+02],
       [7.3400e+02, 2.0000e+02, 2.0000e+00]])

In [54]:
def print_matrix(matrix):
    """Print `matrix` as a table whose rows and columns are labelled with sorted_tags."""
    labelled = pd.DataFrame(matrix, index=sorted_tags, columns=sorted_tags)
    print(labelled)

print_matrix(transition_matrix)

         NN      RB      TO
NN  16241.0  2431.0  5256.0
RB    358.0  2263.0   855.0
TO    734.0   200.0     2.0


In [55]:
# Divide every entry by 10 (element-wise); the result replaces transition_matrix.
# NOTE: this cell reads and reassigns the same name, so running it again divides by 10 again.
transition_matrix = transition_matrix/10

print_matrix(transition_matrix)

        NN     RB     TO
NN  1624.1  243.1  525.6
RB    35.8  226.3   85.5
TO    73.4   20.0    0.2


In [57]:
# Sum along each row; keepdims=True keeps the result 2-D (a single column),
# so the division below broadcasts it across the columns of each row
rows_sum = transition_matrix.sum(axis=1, keepdims=True)
# Divide each row by its own sum
transition_matrix = transition_matrix / rows_sum
print_matrix(transition_matrix)

          NN        RB        TO
NN  0.678745  0.101596  0.219659
RB  0.102992  0.651036  0.245972
TO  0.784188  0.213675  0.002137


In [58]:
transition_matrix.sum(axis=1, keepdims=True)

array([[1.],
       [1.],
       [1.]])

In [60]:
import math

# Copy transition matrix for for-loop example
t_matrix_for = np.copy(transition_matrix)

# Copy transition matrix for numpy functions example
t_matrix_np = np.copy(transition_matrix)

# Add log(row sum) to each diagonal entry, one row at a time
for i in range(num_tags):
    # rows_sum was built with keepdims=True, so rows_sum[i] is a 1-element array.
    # Index the scalar explicitly: passing the array to math.log relies on an
    # implicit array-to-scalar conversion that newer NumPy releases deprecate.
    t_matrix_for[i, i] = t_matrix_for[i, i] + math.log(rows_sum[i, 0])

# Print matrix
print_matrix(t_matrix_for)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


In [68]:
# Same diagonal update as the loop version, done with array operations.
# NOTE: this cell modifies t_matrix_np in place, so running it again adds the log term again.
d = np.diag(t_matrix_np)
# Turn the diagonal into a column so it adds element-wise to rows_sum;
# -1 lets NumPy infer the length instead of hard-coding the number of tags
d = np.reshape(d, (-1, 1))
# math.log is kept so the values come from the same function the loop version uses
d = d + np.vectorize(math.log)(rows_sum)

# fill_diagonal writes the flattened values of d along the main diagonal
np.fill_diagonal(t_matrix_np, d)

print_matrix(t_matrix_np)

          NN        RB        TO
NN  8.458964  0.101596  0.219659
RB  0.102992  6.502088  0.245972
TO  0.784188  0.213675  4.541167


In [69]:
t_matrix_for == t_matrix_np

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])