# Imports

In [192]:
from collections import defaultdict
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Custom implementation

In [193]:
class tagger():
    def __init__(self):
        self.emissions = defaultdict(lambda: defaultdict(int))
        self.transitions = defaultdict(lambda: defaultdict(int))
        self.tag_counts = defaultdict(int)
        self.tags = []
        self.vocab = []
        self.V = 0

    def calculate_emission_probs(self, words):
        # Use add one smoothing for the emission probabilities
        emission_probs = defaultdict(lambda: defaultdict(float))
        smoothed_words = []

        for tag in self.tags:
            for word in words:
                emission_probs[tag][word] = (self.emissions[tag][word] + 1) / (self.tag_counts[tag] + self.V)
                if not word in self.vocab:
                    smoothed_words.append(word)

        return emission_probs, smoothed_words

    def calculate_transitions(self):
        # For each tag in tags, calculate the transition probabilities
        for prev_tag, tags in self.transitions.items():
            total = sum(tags.values())
            for tag, count in tags.items():
                self.transitions[prev_tag][tag] = count / total
        

    def viterbi(self, words):
        T = len(self.tags)
        W = len(words)

        score = np.zeros((T, W))
        backPtr = np.zeros((T, W), dtype=int)

        emission_probs, smoothed_words = self.calculate_emission_probs(words)

        # Initialize the first column
        for i, tag in enumerate(self.tags):
            score[i, 0] = self.transitions['LINESTART'][tag] * emission_probs[tag][words[0]]
            backPtr[i, 0] = 0

        # Iterate over the rest of the columns
        for i in range(1, W):
            for j, tag in enumerate(self.tags):
                max_score = 0
                max_index = 0
                for k, prev_tag in enumerate(self.tags):
                    s = score[k, i-1] * self.transitions[prev_tag][tag] * emission_probs[tag][words[i]]
                    if s > max_score:
                        max_score = s
                        max_index = k
                score[j, i] = max_score
                backPtr[j, i] = max_index

        # Find the best path
        best_path = [np.argmax(score[:, -1])]

        for i in range(W-1, 0, -1):
            best_path.insert(0, backPtr[best_path[0], i])

        return [self.tags[i] for i in best_path], smoothed_words

    def train(self, filename):
        prev_tag = 'LINESTART'

        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                elif line.strip().startswith('# sent_id'):
                    prev_tag = 'LINESTART'
                    continue
                elif line.strip().startswith('#'):
                    continue

                parts = line.split()
                word, tag = parts[1], parts[3]

                self.emissions[tag][word] += 1
                self.transitions[prev_tag][tag] += 1

                if tag not in self.tags:
                    self.tags.append(tag)

                if word not in self.vocab:
                    self.vocab.append(word)

                self.tag_counts[tag] += 1

                prev_tag = tag

        self.V = len(self.vocab)

        # calculate transition probabilities
        self.calculate_transitions()

    def predict(self, filename, output_filename):
        print('', file=open(output_filename, 'w'), end='')

        data_tags = []
        predicted_tags = []
        smoothed = []

        with open(filename, 'r', encoding='utf-8') as f:
            sent_words = []
            sent_tags = []
            sent_id = None

            for line in f:
                if not line.strip():
                    continue
                elif line.strip().startswith('# sent_id'):
                    # if sent_words is empty, skip
                    if not sent_words:
                        sent_id = line.split()[3]
                        continue

                    sent_preds, sent_smoothed = self.viterbi(sent_words)

                    for i, pred in enumerate(sent_preds):
                        print(f'{sent_id}\t{i+1}\t{sent_words[i]}\t{pred}\n', file=open(output_filename, 'a', encoding='utf-8'))

                    # add the sent tags to the data tags list
                    data_tags.extend(sent_tags)

                    # add the sentence predictions to the predicted tags list
                    predicted_tags.extend(sent_preds)

                    # clear the sentence words and tags
                    sent_words = []
                    sent_tags = []

                    # add the smoothed words to the total smoothed words list
                    smoothed.extend(sent_smoothed)

                    sent_id = line.split()[3]
                    continue
                elif line.strip().startswith('#'):
                    continue

                parts = line.split()

                word, tag = parts[1], parts[3]

                sent_words.append(word)
                sent_tags.append(tag)

            # if sent_words is not empty, predict the tags
            if sent_words:
                sent_preds, sent_smoothed = self.viterbi(sent_words)

                for i, pred in enumerate(sent_preds):
                    print(f'{sent_id}\t{i+1}\t{sent_words[i]}\t{pred}', file=open(output_filename, 'a', encoding='utf-8'))

                # add the sent tags to the data tags list
                data_tags.extend(sent_tags)

                # add the sentence predictions to the predicted tags list
                predicted_tags.extend(sent_preds)

                # add the smoothed words to the total smoothed words list
                smoothed.extend(sent_smoothed)
        
        smoothed = set(smoothed)
        smoothed = len(smoothed)

        return data_tags, predicted_tags, smoothed
    
    def get_scores(self, data_tags, predicted_tags):
        accuracy = accuracy_score(data_tags, predicted_tags)
        recall = recall_score(data_tags, predicted_tags, average='weighted', zero_division=0)
        precision = precision_score(data_tags, predicted_tags, average='weighted', zero_division=0)
        f1 = f1_score(data_tags, predicted_tags, average='weighted', zero_division=0)

        return accuracy, recall, precision, f1

# Main

### File Paths

In [194]:
train_file = "./train.txt"
test_file = "./test.txt"
train_output_file = "./viterbi_train_predictions.tsv"
test_output_file = "./viterbi_test_predictions.tsv"

### Class Initialization

In [195]:
customTagger = tagger()

### Training

In [196]:
customTagger.train(train_file)

In [197]:
data_tags, predicted_tags, smoothed = customTagger.predict(train_file, train_output_file)

In [198]:
accuracy, recall, precision, f1 = customTagger.get_scores(data_tags, predicted_tags)

print("For train file: ", train_file)
print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1: {f1}")
print(f"Smoothed words: {smoothed}")

For train file:  ./train.txt
Accuracy: 0.8310089344083679
Recall: 0.8310089344083679
Precision: 0.8359764794990692
F1: 0.821836977061188
Smoothed words: 0


### Testing

In [199]:
data_tags, predicted_tags, smoothed = customTagger.predict(test_file, test_output_file)

In [200]:
accuracy, recall, precision, f1 = customTagger.get_scores(data_tags, predicted_tags)

print("For test file: ", test_file)
print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1: {f1}")
print(f"Smoothed words: {smoothed}")

For test file:  ./test.txt
Accuracy: 0.701007326007326
Recall: 0.701007326007326
Precision: 0.731895558358753
F1: 0.676887398811972
Smoothed words: 284
