In [None]:
# Download the NLTK data packages this notebook relies on. They are presumably
# needed by PerceptronTagger() and map_tag('en-ptb', 'universal', ...) in the
# later cells -- confirm if either download is removed.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [None]:
# Colab-only: prompts for local files and stores them in the working directory.
# NOTE: when a file with the same name already exists, the upload is saved under
# a suffixed name (e.g. "BrownCorpus (1).txt"), while the loading cells open the
# un-suffixed names -- so a re-upload does not replace the file that gets read.
from google.colab import files
uploaded = files.upload()

Saving BrownCorpus.txt to BrownCorpus (1).txt
Saving BrownToUniversalTagMap.txt to BrownToUniversalTagMap (1).txt


In [None]:
def load_pos_corpus(filename):
  """Load a POS-tagged corpus stored as one sentence per line.

  Every non-blank line is split on whitespace into ``spelling_tag`` tokens.
  The tag is the text after the LAST underscore, so spellings may themselves
  contain underscores. Spellings are lower-cased; tags are kept as written.

  A token is dropped when:
    * it contains no underscore (malformed -- skipped instead of raising
      ValueError and aborting the whole load),
    * its tag is empty, or
    * the second character of its tag is '|'
      (NOTE(review): presumably marks ambiguous/alternative tags -- confirm
      against the corpus format).

  Args:
    filename: Path to a UTF-8 encoded text file.

  Returns:
    A list with one entry per non-blank line, each entry being a list of
    (spelling, tag) tuples. An entry can be empty if all of its tokens were
    dropped.

  Raises:
    OSError: If the file cannot be opened.
  """
  sentences = []
  token_count = 0
  skipped_count = 0

  with open(filename, 'r', encoding='utf-8') as f:
    # Stream the file line by line rather than holding every line in memory.
    for line in f:
      tokens = line.split()
      if not tokens:
        continue  # blank or whitespace-only line

      sentence = []
      for token in tokens:
        # rpartition never raises; an empty separator means "no underscore".
        spelling, separator, tag = token.rpartition('_')
        if not separator:
          skipped_count += 1
          continue

        # Keep one-character tags and longer tags whose 2nd char is not '|'.
        if tag and tag[1:2] != '|':
          sentence.append((spelling.lower(), tag))
          token_count += 1

      sentences.append(sentence)

  if skipped_count:
    print(f"Skipped {skipped_count} malformed tokens without a '_' separator")
  print(f"Loaded Brown Corpus with {token_count} tokens")
  return sentences

# Parse the corpus file from the working directory into lists of (word, tag) pairs.
brown_corpus = load_pos_corpus("BrownCorpus.txt")

Loaded Brown Corpus with 1137452 tokens


In [None]:
def load_tag_mapping(filename):
  """Load a tab-separated "source tag -> target tag" table into a dict.

  Each useful line holds exactly two non-empty, tab-separated fields. Blank
  lines are ignored. Lines with any other number of fields are reported as
  malformed and skipped. If a source tag appears more than once, the first
  mapping wins and the later line is reported as a duplicate.

  Args:
    filename: Path to a UTF-8 encoded, tab-separated text file.

  Returns:
    A dict mapping each source (Brown) tag to its target (universal) tag.

  Raises:
    OSError: If the file cannot be opened.
  """
  tag_mapping = {}

  with open(filename, 'r', encoding='utf-8') as f:
    for line in f:
      # Fields are already stripped here, so no further stripping is needed.
      tags = [t.strip() for t in line.strip().split("\t") if t.strip()]

      if not tags:
        continue  # blank line: nothing to report

      if len(tags) != 2:
        print("Skipping malformed line: " + line.rstrip("\n"))
        continue

      brown_tag, universal_tag = tags
      if brown_tag in tag_mapping:
        # Keep the first mapping; say clearly that this is a duplicate.
        print("Skipping duplicate Brown tag: " + line.rstrip("\n"))
        continue

      tag_mapping[brown_tag] = universal_tag

  print(f"Loaded Brown to Universal tag mapping with {len(tag_mapping)} tag pairs")
  return tag_mapping

# Read the Brown -> universal tag table from the working directory.
brown_to_universal_tag_mapping = load_tag_mapping("BrownToUniversalTagMap.txt")

Loaded Brown to Universal tag mapping with 478 tag pairs


In [None]:
def convert_pos_tags(tag_mapping, sentences):
  """Rewrite every tag in ``sentences`` through ``tag_mapping``.

  Args:
    tag_mapping: Dict from source tag to target tag.
    sentences: Iterable of sentences, each an iterable of (word, tag) pairs.

  Returns:
    A new list of sentences with the same words and translated tags. Tags
    missing from ``tag_mapping`` become the literal string 'UNKNOWN'.
  """
  return [
    [(token, tag_mapping.get(source_tag, 'UNKNOWN')) for token, source_tag in tagged_sentence]
    for tagged_sentence in sentences
  ]

# Translate the corpus tags with the loaded mapping; unmapped tags become 'UNKNOWN'.
universal_corpus = convert_pos_tags(brown_to_universal_tag_mapping, brown_corpus)
print("Converted tags from Brown to universal")

Converted tags from Brown to universal


In [None]:
def split_data_set(data_set, split_fraction = 0.8):
  """Cut ``data_set`` into a leading training part and a trailing test part.

  The split is positional (no shuffling): the first
  ``int(len(data_set) * split_fraction)`` items go to training, the rest to
  test.

  Args:
    data_set: A sliceable sequence.
    split_fraction: Share of items placed in the training part.

  Returns:
    A (training, test) tuple of slices of ``data_set``.
  """
  boundary = int(len(data_set)*split_fraction)
  return data_set[:boundary], data_set[boundary:]

# Positional 80/20 split (default split_fraction); sentences keep their corpus order.
training_data_set, test_data_set = split_data_set(universal_corpus)
print(f"Data set split with {len(training_data_set)} training sentences and {len(test_data_set)} test sentences")

Data set split with 45282 training sentences and 11321 test sentences


In [None]:
def tokenize_data_set(data_set):
  """Strip the tags from a tagged data set.

  Args:
    data_set: Iterable of sentences, each an iterable of (word, tag) pairs.

  Returns:
    A list of sentences where each sentence is the list of its words only.
  """
  untagged = []
  for tagged_sentence in data_set:
    words = []
    for token, _ in tagged_sentence:
      words.append(token)
    untagged.append(words)
  return untagged

# Keep only the words of the test sentences; this is the input handed to the tagger.
tokenized_test_data_set = tokenize_data_set(test_data_set)
print("Tokenized test data set.")

Tokenized test data set.


In [None]:
from nltk.tag.perceptron import PerceptronTagger
# NOTE(review): a default-constructed PerceptronTagger presumably loads the
# pretrained model fetched by nltk.download('averaged_perceptron_tagger') -- confirm.
tagger = PerceptronTagger()

# Tag every tokenized test sentence in one call.
# NOTE(review): load_pos_corpus lower-cases all spellings, so the tagger sees
# no capitalization; this may affect proper-noun tagging -- confirm it is intended.
tagged_sentences = tagger.tag_sents(tokenized_test_data_set)

In [None]:
from nltk.tag.mapping import map_tag

def ptb_to_universal(sentences):
  """Translate the tags of tagged sentences with NLTK's ``map_tag``.

  Each tag is passed through ``map_tag('en-ptb', 'universal', tag)``; the
  words are left unchanged.

  Args:
    sentences: Iterable of sentences, each an iterable of (word, tag) pairs.

  Returns:
    A new list of sentences holding (word, mapped_tag) tuples.
  """
  converted = []
  for tagged_sentence in sentences:
    remapped = []
    for token, penn_tag in tagged_sentence:
      remapped.append((token, map_tag('en-ptb', 'universal', penn_tag)))
    converted.append(remapped)
  return converted

# Map the tagger's output tags onto the universal tagset so they can be compared with the test set.
universal_tagged_sentences = ptb_to_universal(tagged_sentences)

In [None]:
def compute_accuracy(predicted_sentences, true_sentences):
  """Compute token-level tag accuracy and print a short summary.

  Sentences are paired positionally, and so are the tokens inside each pair
  of sentences. Only the tags are compared; the words are not checked.
  Because pairing uses ``zip``, surplus sentences or tokens on the longer
  side are ignored.

  Args:
    predicted_sentences: Iterable of sentences of (word, predicted_tag) pairs.
    true_sentences: Iterable of sentences of (word, true_tag) pairs.

  Returns:
    The fraction of compared tokens whose tags are equal, as a float.
    Returns 0.0 when no tokens were compared.
  """
  correct_tags = 0
  total_tags = 0

  for predicted_sentence, true_sentence in zip(predicted_sentences, true_sentences):
    for (_, predicted_tag), (_, true_tag) in zip(predicted_sentence, true_sentence):
      total_tags += 1
      if predicted_tag == true_tag:
        correct_tags += 1

  # Always return a float, including the "nothing to compare" case.
  accuracy = correct_tags/total_tags if total_tags > 0 else 0.0
  print("Total number of tags: ", total_tags)
  print("Number of correct tags: ", correct_tags)
  print("Accuracy: ", accuracy)
  return accuracy

# Score the tagger's universal-tag predictions against the tags of the held-out test sentences.
accuracy = compute_accuracy(universal_tagged_sentences, test_data_set)


Total number of tags:  174495
Number of correct tags:  153051
Accuracy:  0.8771082265967506
