## In this assignment, we will implement a simple rule-based POS tagger

First, let's download the Universal Dependencies treebanks (v2.9) from the following URL: https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-4611

Create a folder named `data`. Copy the downloaded `ud-treebanks-v2.9.tgz` file into the `data` folder and untar it there, so that the treebanks end up under `data/ud-treebanks-v2.9/` (the path used by the code below).

Let us import some libraries.

In [None]:
import codecs
import random

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt

Now let's write some utility functions

### Utility Code

#### Code to read data from CoNLL format

In [None]:
def conllReader(filename, word_field=1, label_field=3):
    """Read a tab-separated CoNLL-style file into sentences of [word, label] pairs.

    Blank lines separate sentences; lines starting with '# ' are treated as
    comments and skipped. Bytes that cannot be decoded as UTF-8 are ignored.

    Args:
        filename: Path of the file to read.
        word_field: Zero-based column index of the word form.
        label_field: Zero-based column index of the label.

    Returns:
        A tuple ``(sentences, label_list)`` where ``sentences`` is a list of
        sentences, each a list of ``[word, label]`` pairs, and ``label_list``
        holds every distinct label in order of first appearance.

    Raises:
        IndexError: If a non-comment line has fewer tab-separated columns
            than ``word_field`` or ``label_field`` requires.
    """
    sentences = []
    sentence = []
    label_list = []
    # Set mirror of label_list: O(1) membership test while the list keeps
    # the first-seen order that callers receive.
    seen_labels = set()

    with codecs.open(filename, 'r', errors='ignore', encoding='utf8') as f_in:
        for line in f_in:
            line = line.strip()

            if not line:
                # A blank line closes the current sentence (if any).
                if sentence:
                    sentences.append(sentence)
                sentence = []
                continue

            if line.startswith('# '):
                continue

            fields = line.split('\t')
            word = fields[word_field]
            label = fields[label_field]
            sentence.append([word, label])

            if label not in seen_labels:
                seen_labels.add(label)
                label_list.append(label)

    # The file may end without a trailing blank line; without this flush the
    # final sentence would be silently dropped.
    if sentence:
        sentences.append(sentence)

    return sentences, label_list

In [None]:
# Smoke-test the reader on the Tamil-TTB training split: column 1 is taken as
# the word and column 3 as the label.
train_split, label_list = conllReader(
    'data/ud-treebanks-v2.9/UD_Tamil-TTB/ta_ttb-ud-train.conllu',
    word_field=1,
    label_field=3,
)

# Report how many sentences were read and show the first one.
print('Read {0} number of train sentences'.format(len(train_split)))
print('\nFirst sentence looks like')
print(train_split[0])

# Show the distinct labels in order of first appearance.
print('\n Labels used are', label_list, sep='\n')

In [None]:
def getMax(dictionary):
    """Return the key with the largest value in `dictionary`.

    Args:
        dictionary: Non-empty mapping from keys to comparable values
            (here: POS tag -> frequency).

    Returns:
        The key whose value is the maximum. When several keys share the
        maximum value, the first one in the dictionary's iteration order
        is returned.

    Raises:
        ValueError: If `dictionary` is empty.
    """
    # max() keeps the first maximal element it encounters, which gives a
    # deterministic tie-break based on insertion order.
    return max(dictionary, key=dictionary.get)

Now that we have the data loading part written, let's write a simple Most-Frequent POS tagger.

### Most Frequent POS Tagger

Let's load the train and test sets

In [None]:
# Read the train and test splits; each call returns the sentences and the
# distinct labels seen in that split.
train_split, label_list = conllReader('data/ud-treebanks-v2.9/UD_Tamil-TTB/ta_ttb-ud-train.conllu', word_field=1, label_field=3)
test_split, test_label_list = conllReader('data/ud-treebanks-v2.9/UD_Tamil-TTB/ta_ttb-ud-test.conllu', word_field=1, label_field=3)

print('Read {0} number of train sentences'.format( len(train_split) ))
print('Label list in train split is')
print(label_list)

print('\n' * 2)
print('Read {0} number of test sentences'.format( len(test_split) ))
print('Label list in test split is')
print(test_label_list)

# Union of the labels from both splits. It is sorted because the iteration
# order of a set of strings is arbitrary and can change between runs, while
# this list is later used to name the rows of the evaluation reports; a
# sorted list gives a stable, reproducible label order.
combined_label_list = sorted( set( label_list + test_label_list ) )

For every word in the train split, let's collect its POS tag statistics.

In [None]:
# word_pos_frequency maps each training word to a dict of
# {POS label: number of times the word carried that label}.
word_pos_frequency = {}

for sentence in train_split:
    for word, label in sentence:
        # Fetch (or create) the per-word counter, then bump this label.
        tag_counts = word_pos_frequency.setdefault(word, {})
        tag_counts[label] = tag_counts.get(label, 0) + 1

print('Total number of words in train split is {0}'.format( len(word_pos_frequency) ) )

# Show the statistics of the first word that was inserted.
first_word = next(iter(word_pos_frequency))
print( 'Word is {0}'.format(first_word) )
print( word_pos_frequency[first_word] )

In [None]:
# Print the second word (in insertion order) that was seen with more than
# one POS label; nothing is printed if fewer than two such words exist.
ambiguous_seen = 0
for candidate, tag_counts in word_pos_frequency.items():
    if len(tag_counts) <= 1:
        continue

    ambiguous_seen += 1
    if ambiguous_seen == 2:
        print( 'Word is {0}'.format(candidate) )
        print( tag_counts )
        break

### Evaluate on test split

In [None]:
# Tag the test split with the most-frequent-tag baseline and keep the gold
# and predicted labels sentence by sentence.
ground_truth = []
prediction = []

total_num_tokens = 0
tokens_present_in_train = 0

for sentence in test_split:
    gold_tags = []
    predicted_tags = []

    for word, label in sentence:
        gold_tags.append(label)
        total_num_tokens += 1

        tag_counts = word_pos_frequency.get(word)
        if tag_counts is None:
            # Word never seen in training: fall back to a random train label.
            predicted_tags.append(random.choice(label_list))
            continue

        # Known word: predict the label chosen by getMax from its counts.
        tokens_present_in_train += 1
        predicted_tags.append(getMax(tag_counts))

    ground_truth.append(gold_tags)
    prediction.append(predicted_tags)

print("Out of {0} number of words in test split, {1} appeared in train split".format( total_num_tokens, tokens_present_in_train ))

In [None]:
# Flatten the per-sentence label lists into one token-level list each.
flatten_gold_truth = [j for sub in ground_truth for j in sub]
flatten_predictions = [j for sub in prediction for j in sub]

# Pass `labels` explicitly: without it the report rows follow the sorted set
# of labels present in the data, and `target_names` (given in
# combined_label_list order) could be attached to the wrong rows or not
# match in length.
print(classification_report(flatten_gold_truth, flatten_predictions, labels=combined_label_list, target_names=combined_label_list, digits=4))

In [None]:
# Fix the row/column order of the matrix to combined_label_list so that it
# matches the tick labels handed to the display below; without `labels` the
# matrix follows the sorted set of labels present in the data instead.
cm = confusion_matrix(flatten_gold_truth, flatten_predictions, labels=combined_label_list)

cmd = ConfusionMatrixDisplay(cm, display_labels=combined_label_list)

# 15x15 inch figure so that all label ticks stay readable.
fig, ax = plt.subplots(figsize=(15, 15))

cmd.plot(xticks_rotation='vertical', ax=ax)

In [None]:
    
# For tasks requiring phrase-level annotation
# from seqeval.metrics import accuracy_score
# from seqeval.metrics import classification_report
# from seqeval.metrics import f1_score
# from seqeval.scheme import IOB1
# print('F1 Score is')
# print( f1_score(ground_truth, prediction) )

# print('Classification report')
# print( classification_report(ground_truth, prediction, scheme=IOB1) )

## Implement Rule-based System here

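Let us write a simple rule based on word endings (useful, for example, to tag adverbs): collect POS statistics for the last two and the last three characters of every training word, and use them to tag test words that were not seen in training.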

In [None]:
# affix_pos_frequency maps a word ending (last two or last three characters)
# to a dict of {POS label: summed frequency over all training words that
# end with it}.
affix_pos_frequency = {}

for every_word, tag_counts in word_pos_frequency.items():
    bi_gram_character = every_word[ -2: ]
    tri_gram_character = every_word[ -3: ]

    # For words shorter than three characters both slices are the same
    # string; count such a word only once for that ending.
    if bi_gram_character == tri_gram_character:
        suffixes = (bi_gram_character,)
    else:
        suffixes = (bi_gram_character, tri_gram_character)

    for suffix in suffixes:
        # Always accumulate into a dict owned by affix_pos_frequency.
        # Storing the per-word dict itself would alias it, and later
        # additions would then corrupt word_pos_frequency.
        suffix_counts = affix_pos_frequency.setdefault(suffix, {})
        for key, frequency in tag_counts.items():
            suffix_counts[key] = suffix_counts.get(key, 0) + frequency


In [None]:
# Tag the test split with a back-off strategy: exact word statistics first,
# then the three-character ending, then the two-character ending, and a
# random label as the last resort.
ground_truth = []
prediction = []

for every_sentence in test_split:
    ground_truth_sentence = []
    prediction_sentence = []

    for word, label in every_sentence:
        ground_truth_sentence.append( label )

        word_bi_gram = word[-2:]
        word_tri_gram = word[-3:]

        if word in word_pos_frequency:
            prediction_sentence.append( getMax( word_pos_frequency[word] ) )
        elif word_tri_gram in affix_pos_frequency:
            # The longer ending is tried before the shorter one.
            prediction_sentence.append( getMax( affix_pos_frequency[word_tri_gram] ) )
        elif word_bi_gram in affix_pos_frequency:
            prediction_sentence.append( getMax( affix_pos_frequency[word_bi_gram] ) )
        else:
            # Draw from the training labels only, as the most-frequent-tag
            # baseline does, so the tagger does not use labels that occur
            # solely in the test split.
            prediction_sentence.append( random.choice( label_list ) )

    ground_truth.append( ground_truth_sentence )
    prediction.append( prediction_sentence )

In [None]:
# Flatten the per-sentence label lists into one token-level list each.
flatten_gold_truth = [j for sub in ground_truth for j in sub]
flatten_predictions = [j for sub in prediction for j in sub]

# Pass `labels` explicitly: without it the report rows follow the sorted set
# of labels present in the data, and `target_names` (given in
# combined_label_list order) could be attached to the wrong rows or not
# match in length.
print(classification_report(flatten_gold_truth, flatten_predictions, labels=combined_label_list, target_names=combined_label_list, digits=4))