In [None]:
# --- Before you go ---
# 1. Rename this notebook to Assignment-03-###.ipynb, where ### is your student ID.
# 2. The deadline of Assignment-03 is 23:59, 06-05-2024.


# --- Explore HMM POS Taggers using Brown corpus ---
# In this assignment, you will explore three taggers for the Brown corpus.
# import your packages here

In [None]:

# Task 1 --- Load and explore your data ---
# 1). load train/test samples from Brown corpus files, brown-train.txt, brown-test.txt.
# 2). load all 12 tags from brown-tag.txt and print it out
# 3). count how many sentences and words there are in both the train and test datasets.
# 4). for each tag, count how many words carry it in train and test, e.g., tag1: [count_tr, count_te]

# Your code

### 1.1 load train/test samples

In [21]:
def open_brown_files(type: str) -> list[str]:
    """
    Read one of the Brown corpus text files and return its lines.

    The file is looked up as ``brown-<type>.txt`` relative to the current
    working directory and decoded as UTF-8.

    Args:
    - type: Which file to read; must be one of 'train', 'test' or 'tag'.

    Return:
    - content: One string per line of the file, in file order, with leading
      and trailing whitespace (including the newline) stripped. Blank lines
      are kept as empty strings.

    Raises:
    - AssertionError: If `type` is not one of the accepted values.
    - OSError: If the corresponding file cannot be opened.
    """
    # NOTE: `type` shadows the builtin; the name is kept so that existing
    # keyword callers (`open_brown_files(type=...)`) keep working.
    assert type in ['train', 'test', 'tag'], \
        "type must be one of 'train', 'test', 'tag', got {!r}".format(type)

    # Iterate over the file object directly rather than materialising an
    # intermediate list with readlines().
    with open("brown-{}.txt".format(type), 'r', encoding='utf-8') as f:
        content = [line.strip() for line in f]

    return content
        
# Read both corpus splits with the same loader; each result is a list of
# stripped lines.
train_content, test_content = (
    open_brown_files(split) for split in ('train', 'test')
)

### 1.2 load all tags

In [12]:
# Load the tag inventory (one tag per line in brown-tag.txt) and show it.
tag_content = open_brown_files(type='tag')
print(tag_content)

['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


### 1.3 count sentences and words

In [32]:
# Four-space column separator used when printing the summary table below.
INTERVAL = " " * 4

def count_sentence_word(content: list[str]) -> tuple[int, int]:
    """
    Count the sentences and the non-punctuation tokens in a corpus listing.

    Each entry of `content` is treated as one of three kinds of line:
    a sentence header (starts with "b100-"), an empty separator line, or a
    token line whose second tab-separated field is the tag.

    Args:
    - content: A list of strings as returned by `open_brown_files`.

    Return:
    - num_sentence: The number of sentence header lines.
    - num_word: The number of token lines whose tag is not '.', i.e.
      punctuation tokens are not counted as words.

    Raises:
    - IndexError: If a token line contains no tab-separated tag field.
    """
    num_sentence, num_word = 0, 0

    for line in content:
        # an empty line only marks the end of a sentence
        if not line:
            continue
        # a line starting with "b100-" opens a new sentence
        if line.startswith('b100-'):
            num_sentence += 1
            continue
        # token line: "<text>\t<tag>"; anything not tagged '.' is a word
        tag = line.split('\t')[1]
        if tag != '.':
            num_word += 1

    return num_sentence, num_word
    
train_sentence, train_word = count_sentence_word(train_content)
test_sentence, test_word = count_sentence_word(test_content)

# Render a three-column table (split, sentence count, word count); the
# columns are glued together with INTERVAL.
print(INTERVAL.join(("type ", "sentences", " words")))
print("=" * 28)
print(INTERVAL.join(("train", f"{train_sentence:^9}", f"{train_word:^5}")))
print(INTERVAL.join(("test ", f"{test_sentence:^9}", f"{test_word:^5}")))

type     sentences     words
train      45800      810604
test       11540      203023


### 1.4 count frequency of each tag

In [34]:
def count_frequency_tag(content: list[str], tags: "list[str] | None" = None) -> dict[str, int]:
    """
    Count how often each tag occurs in a corpus listing.

    Sentence header lines (starting with "b100-") and empty lines are
    skipped; for every other line the second tab-separated field is taken
    as the tag.

    Args:
    - content: A list of strings as returned by `open_brown_files`.
    - tags: The tags to count. Every tag listed here appears in the result,
      with a count of 0 if it never occurs. When omitted, the notebook-level
      `tag_content` list is used, which must already be defined.

    Return:
    - tag_count: A dict mapping each tag to its number of occurrences.

    Raises:
    - KeyError: If `content` contains a tag that is not in `tags`.
    - IndexError: If a token line contains no tab-separated tag field.
    """
    if tags is None:
        # Backward-compatible default: fall back to the global tag list so
        # existing single-argument calls behave exactly as before.
        tags = tag_content

    tag_count = {tag: 0 for tag in tags}

    for line in content:
        # skip sentence separators and sentence headers
        if not line or line.startswith("b100-"):
            continue
        curr_tag = line.split('\t')[1]
        tag_count[curr_tag] += 1

    return tag_count

# Tally tag occurrences for each split with the same counter.
train_tag_freq, test_tag_freq = (
    count_frequency_tag(split_content)
    for split_content in (train_content, test_content)
)

# One row per tag, in tag-file order: "tag: [count_tr, count_te]".
for tag in tag_content:
    print("{:>4}: [{:>6}, {:>6}]".format(tag, train_tag_freq[tag], test_tag_freq[tag]))


   .: [117723,  29842]
 ADJ: [ 66985,  16736]
 ADP: [115752,  29014]
 ADV: [ 44765,  11474]
CONJ: [ 30455,   7696]
 DET: [109418,  27601]
NOUN: [220451,  55107]
 NUM: [ 11921,   2953]
PRON: [ 39657,   9677]
 PRT: [ 23889,   5940]
VERB: [146199,  36551]
   X: [  1112,    274]


In [None]:
# Task 2 --- Method 1: Build a baseline method, namely, the most frequent tagger ---
#     If you can recall, we introduced a strong baseline method (See Dan's book in 
# https://web.stanford.edu/~jurafsky/slp3/ed3book_jan72023.pdf Page 164.),
#     where we label each word by using the most frequent-used tag associated with it.
# 1). find the most frequent class label for each word in the training data.
#     For example, {tr_word_1:tag_1,tr_word_2:tag_2,...}
# 2). use your built method to predict tags for both train and test datasets.
#     You should print out two values: the accuracies of train and test samples.
#     You would expect that the accuracy on train will be > 0.9 (but never = 1.0) and higher than on test.

# Notice: there are unknown words in the test samples.
#  The following approaches can handle this (choose one or create your own):
#  1). mark all words that appear only once in the data with a "UNK-x" tag
#  2). tag every out-of-vocabulary word with the majority tag among all training samples.
#  3). find more methods in https://github.com/Adamouization/POS-Tagging-and-Unknown-Words

# Your code

In [None]:
# Task 3 --- Method 2: Build an HMM tagger ---
# 1) You should use nltk.tag.HiddenMarkovModelTagger to build an HMM tagger.
#    It has parameters: symbols, states, transitions, outputs, priors, transform (ignore it).
#    Specify these parameters properly. For example, you can use MLE to estimate transitions, outputs and priors.
#    That is, use MLE to estimate matrix A (transition matrix) and matrix B (output probabilities) (see Section 8.4.3).
# 2) After building your model, report the accuracy of the HMM tagger on both the train and test samples.
# 
# 3) Compared with your baseline method, discuss why your HMM tagger is better/worse than the baseline method.

# Notice: You may also need to handle unknown words just like Task 2.

# Your code

In [None]:
# Task 4 --- Method 3: Fine-tuning on BERT-base model for POS-tagging ---
# 
# 1) You may download a BERT model (say, you choose BERT-base cased) 
#    and use tools in https://github.com/huggingface/transformers
# 
# 2) After building your model, report the accuracy of the BERT tagger on both the train and test samples.
# 
# 3) Compared with Methods 1 and 2, discuss why your BERT tagger is better/worse than these two.