In [6]:
# Import the packages used by the notebook (the data files are read later, inside the class)
import pandas as pd
from collections import defaultdict
import math
import numpy as np
import string
import string

In [9]:
class bruteforce:


  def __init__(self,training,voc, test_corpus, test):

    # Punctuation characters
    self.punct = set(string.punctuation)

    # Morphology rules used to assign unknown word tokens
    self.noun_suffix = ["action", "age", "ance", "cy", "dom", "ee", "ence", "er", "hood", "ion", "ism", "ist", "ity", "ling", "ment", "ness", "or", "ry", "scape", "ship", "ty"]
    self.verb_suffix = ["ate", "ify", "ise", "ize"]
    self.adj_suffix = ["able", "ese", "ful", "i", "ian", "ible", "ic", "ish", "ive", "less", "ly", "ous"]
    self.adv_suffix = ["ward", "wards", "wise"]


    with open(training, 'r') as f:
      training_corpus = f.readlines()

    with open(voc, 'r') as f:
      voc_l = f.read().split('\n')

    # vocab: dictionary that has the index of the corresponding words
    self.vocab = {} 

    # Get the index of the corresponding words. 
    for i, word in enumerate(sorted(voc_l)): 
        self.vocab[word] = i   

    #Reading the test corpus
    with open(test_corpus, 'r') as f:
      self.y = f.readlines()



    #corpus without tags, preprocessed
    self._, self.prep = self.preprocess(self.vocab, test)




    
    self.cnt = 0
    for k,v in self.vocab.items():
        self.cnt += 1
        if self.cnt > 20:
            break

    #create dictionaries
    self.emission_counts, self.transition_counts, self.tag_counts = self.create_dictionaries(training_corpus, self.vocab)

    # get all the POS states
    self.states = sorted(self.tag_counts.keys())


    accuracy_predict_pos = self.predict_pos(self.prep, self.y, self.emission_counts, self.vocab, self.states)
    print(f"Accuracy of prediction using the brute force method is {accuracy_predict_pos*100:4f} %")



  def get_word_tag(self,line, vocab): 
    #helper function
      if not line.split():
          word = "--n--"
          tag = "--s--"
          return word, tag
      else:
          word, tag = line.split()
          if word not in vocab: 
              word = self.assign_unk(word)
          return word, tag
      return None 


  def preprocess(self, vocab, data_fp):
      '''
      Input: 
          data_fp: file pointer to test data
          vocab: a dictionary where keys are words in vocabulary and value is an index
          
      Output: 
          orig: original data with words and the assigned POS tags
          prep: Data without the POS tags for testing
      '''
      orig = []
      prep = []

      with open(data_fp, "r") as data_file:

          for cnt, word in enumerate(data_file):
              if not word.split():
                  orig.append(word.strip())
                  word = "--n--"
                  prep.append(word)
                  continue
              elif word.strip() not in vocab:
                  orig.append(word.strip())
                  word = self.assign_unk(word)
                  prep.append(word)
                  continue
              else:
                  orig.append(word.strip())
                  prep.append(word.strip())

      assert(len(orig) == len(open(data_fp, "r").readlines()))
      assert(len(prep) == len(open(data_fp, "r").readlines()))

      return orig, prep


  def assign_unk(self, tok):
    #Assign unk tags
      if any(char.isdigit() for char in tok):
          return "--unk_digit--"
      elif any(char in self.punct for char in tok):
          return "--unk_punct--"
      elif any(char.isupper() for char in tok):
          return "--unk_upper--"
      elif any(tok.endswith(suffix) for suffix in self.noun_suffix):
          return "--unk_noun--"
      elif any(tok.endswith(suffix) for suffix in self.verb_suffix):
          return "--unk_verb--"
      elif any(tok.endswith(suffix) for suffix in self.adj_suffix):
          return "--unk_adj--"
      elif any(tok.endswith(suffix) for suffix in self.adv_suffix):
          return "--unk_adv--"
      return "--unk--"




  def create_dictionaries(self, training_corpus, vocab):
      '''
      Input: 
          prep: a preprocessed version of 'y'. A list with the 'word' component of the tuples.
          training_corpus: a corpus composed of a list of tuples where each tuple consists of (word, POS)
          vocab: a dictionary where keys are words in vocabulary and value is an index
          
      Output: 
          emission_counts: a dictionary where the keys are (tag,word) tuples and the value is the count
          transition_counts: a dictionary where the keys are (prev_tag,curr_tag) tuples and the value is the count
          tag_counts: a dictionary where the keys are tags and the value is the count
      '''
      #function to create dictionaries for counts of emissions, transitions, and tags
      emission_counts = defaultdict(int)
      transition_counts = defaultdict(int)
      tag_counts = defaultdict(int)
      
      prev_tag = '--s--' 

      i = 0 

      for word_tag in training_corpus:

          i += 1

          if i % 50000 == 0:
              print(f"word count = {i}")
          word,tag = self.get_word_tag(word_tag,vocab)
          transition_counts[(prev_tag,tag)] += 1
          emission_counts[(tag,word)] += 1
          tag_counts[tag] += 1
          prev_tag = tag
          
      return emission_counts, transition_counts, tag_counts


  def predict_pos(self, prep, y, emission_counts, vocab, states):
      '''
      Input: 
          prep: a preprocessed version of 'y'. A list with the 'word' component of the tuples.
          y: a corpus composed of a list of tuples where each tuple consists of (word, POS)
          emission_counts: a dictionary where the keys are (tag,word) tuples and the value is the count
          vocab: a dictionary where keys are words in vocabulary and value is an index
          states: a sorted list of all possible tags for this assignment
      Output: 
          accuracy: Number of times you classified a word correctly
      '''
      
      # Initialize the number of correct predictions to zero
      correct_pred=0
      
      # Get the (tag, word) tuples, stored as a set
      y_tup=set(emission_counts.keys())
      # Get the number of (word, POS) tuples in the corpus 'y'
      number=len(y)


          # Split the (word, POS) string into a list of two items
      for w, y_tup in zip(prep,y):
            l=y_tup.split()
          
          # Verify that y_tup contain both word and POS
            if(len(l)==2):
              # Set the true POS label for this word
              true=l[1]

              # If the y_tup didn't contain word and POS, go to next word
            else:
              pass
          # If the word is in the vocabulary...
            final_ct=0
            final_pos=''
            if w in vocab:
              for pos in states:
                k=(pos,w)
  
                  # define the key as the tuple containing the POS and word
                  # check if the (pos, word) key exists in the emission_counts dictionary
                if k in emission_counts.keys():

                  count=emission_counts[k]

                  # get the emission count of the (pos,word) tuple 

                      # keep track of the POS with the largest count
                  if count>final_ct:
                    final_ct=count
                          # update the final count (largest count)
                    final_pos=pos
                          # update the final POS

              # If the final POS (with the largest count) matches the true POS:
            if final_pos== true:
              correct_pred+=1   
                  # Update the number of correct predictions

              
      ### END CODE HERE ###
      accuracy = correct_pred/ number
      
      return accuracy


In [10]:
# Build the tagger from the four input files; the constructor prints the accuracy.
obj = bruteforce(
    training="WSJ-2_21.pos",
    voc="hmm_vocab.txt",
    test_corpus="WSJ-24.pos",
    test="test.words.txt",
)

word count = 50000
word count = 100000
word count = 150000
word count = 200000
word count = 250000
word count = 300000
word count = 350000
word count = 400000
word count = 450000
word count = 500000
word count = 550000
word count = 600000
word count = 650000
word count = 700000
word count = 750000
word count = 800000
word count = 850000
word count = 900000
word count = 950000
Accuracy of prediction using the brute force method is 88.885640 %
