Test Data

In [22]:
def parse_training_data(file_path):
    sentences = []  # List to store parsed sentences
    vocab = set()  # Set to store unique tokens
    pos_tags = set()  # Set to store unique POS tags
    with open(file_path, 'r', encoding='utf-8') as file:
        current_sentence = []  # List to store tokens of the current sentence
        for line in file:
            line = line.strip()  # Remove leading/trailing whitespace
            if line.startswith("# sent_id"):  # New sentence begins
                if current_sentence:  # If there are tokens in the current sentence, add it to sentences list
                    sentences.append(current_sentence)
                    current_sentence = []  # Reset current sentence list
            elif line:  # If line is not empty
                if not line.startswith("#"):  # Ignore comment lines
                    # Split line by tab and extract word, tag, and other information
                    parts = line.split(" ")
                    word = parts[1]
                    tag = parts[3]
                    vocab.add(word)  # Add word to vocab set
                    pos_tags.add(tag)  # Add tag to pos_tags set
                    current_sentence.append((word, tag))  # Append (word, tag) tuple to current sentence
        if current_sentence:  # Append the last sentence if there are tokens
            sentences.append(current_sentence)
    return sentences, vocab, pos_tags
# Example usage:
file_path = "./train.txt"
parsed_data, vocab, pos_tags = parse_training_data(file_path)
print("Vocabulary:", vocab)
print("POS Tags:", pos_tags)


POS Tags: {'RBS', 'VBZ', 'TO', "''", 'DT', 'WRB', 'SYM', '``', '.', '$', 'JJS', '_', 'CD', 'POS', 'NNS', 'VB', 'VBD', ':', 'RBR', 'WP$', 'WP', 'HYPH', 'FW', 'MD', 'IN', 'EX', 'GW', 'JJ', 'CC', 'PRP$', 'VBN', 'WDT', 'PDT', 'LS', '-RRB-', 'VBG', 'UH', ',', 'NN', 'NNPS', 'NNP', 'PRP', '-LRB-', 'VBP', 'RB', 'JJR', 'RP'}


In [23]:
pos_tags=list(pos_tags)
vocab=list(vocab)
print(len(vocab))
print(len(pos_tags))

6782
47


In [24]:
# make tags * tags matrix
# make tags * vocab matrix
tag_tag_counts={}
for tag in pos_tags:
    tag_tag_counts[tag]={}
    for taggy in pos_tags:
        tag_tag_counts[tag][taggy]=0

tag_vocab_counts={}
for tag in pos_tags:
    tag_vocab_counts[tag]={}
    for voccy in vocab:
        tag_vocab_counts[tag][voccy]=0

tag_freq = {}
start_tags = {}
num_sent = 0
for tag in pos_tags:
    tag_freq[tag]=0
    start_tags[tag]=0

In [25]:
# tag_tag_counts[a][b] means count of a,b
# tag_vocab_counts[a][b] means count of word b given tag a
for sentence in parsed_data:
    num_sent += 1
    word1 = sentence[0][0]
    tag1 = sentence[0][1]
    tag_freq[tag1] += 1
    start_tags[tag1] += 1
    tag_vocab_counts[tag1][word1] += 1
    for i in range(1, len(sentence)):
        word2 = sentence[i][0]
        tag2 = sentence[i][1]
        tag_freq[tag2] += 1
        tag_tag_counts[tag1][tag2] += 1
        tag_vocab_counts[tag2][word2] += 1
        tag1 = tag2

In [26]:
def transition_prob(tag1, tag2, tag_tag_counts = tag_tag_counts):
        c_t1_t2 = tag_tag_counts[tag1][tag2]
        c_t1 = tag_freq[tag1]
        return c_t1_t2/c_t1

def emission_prob(tag, word, tag_vocab_counts = tag_vocab_counts, vocab = vocab, do_smoothing = False):
        c_t = tag_freq[tag]
        if not do_smoothing:
                c_w_t = tag_vocab_counts[tag][word]
                return c_w_t/c_t
        if word in vocab:
                c_w_t = tag_vocab_counts[tag][word]+1
        else:
                c_w_t = 1
        return c_w_t / (c_t + len(vocab))

In [27]:
trans_probs={}
for tag in pos_tags:
    trans_probs[tag]={}
    for taggy in pos_tags:
        trans_probs[tag][taggy]=transition_prob(tag,taggy)

emmi_probs={}
for tag in pos_tags:
    emmi_probs[tag]={}
    for voccy in vocab:
        emmi_probs[tag][voccy]=emission_prob(tag,voccy)

In [28]:
def viterbi(sentence, trans_prob = trans_probs, emmi_prob = emmi_probs, pos_tags = pos_tags, start_tags = start_tags, num_sent = num_sent):
    T = len(pos_tags)
    W = len(sentence)
    Score = [[0 for _ in range(W)] for _ in range(T)]
    BackPtr = [[0 for _ in range(W)] for _ in range(T)]
    Seq = [0 for _ in range(W)]
    for t in range(T):
        Score[t][0] = emmi_prob[pos_tags[t]][sentence[0]] * (start_tags[pos_tags[t]] / num_sent)
        BackPtr[t][0]=-1
    for w in range(1,W):
        for t in range(T):
            maxtrans=0
            k=-1
            for j in range(T):
                tmaxtrans=Score[j][w-1]*trans_prob[pos_tags[j]][pos_tags[t]]
                if tmaxtrans>maxtrans:
                    maxtrans=tmaxtrans
                    k=j
            Score[t][w]=emmi_prob[pos_tags[t]][sentence[w]]*maxtrans
            BackPtr[t][w]=k
    max_t = -1
    max_score = 0
    for t in range(T):
        if Score[t][W-1] >= max_score:
            max_t = t
            max_score = Score[t][W-1]
    Seq[W-1] = max_t
    for w in range(W-2, -1, -1):
        Seq[w] = BackPtr[Seq[w+1]][w+1]
    pos_tagging = []
    for t in Seq:
        pos_tagging.append(pos_tags[t])
    return pos_tagging


In [29]:
predicted_tags = []
gold_std_tags = []
for i in range(len(parsed_data)):
    s = []
    for w in parsed_data[i]:
        gold_std_tags.append(w[1])
        s.append(w[0])
    prediction = viterbi(s)
    for pred_tag in prediction:
        predicted_tags.append(pred_tag)

In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(gold_std_tags, predicted_tags)

# Calculate precision
precision = precision_score(gold_std_tags, predicted_tags, average='weighted',zero_division=0)

# Calculate recall
recall = recall_score(gold_std_tags, predicted_tags, average='weighted',zero_division=0)

# Calculate F1 score
f1 = f1_score(gold_std_tags, predicted_tags, average='weighted',zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.9807964698191327
Precision: 0.9810973657433228
Recall: 0.9807964698191327
F1 Score: 0.9808747679691358


In [31]:
test_file_path = "./test.txt"

def parse_test_data(file_path):
    sentences = []  # List to store parsed sentences
    with open(file_path, 'r', encoding='utf-8') as file:
        current_sentence = []

        for line in file:
            line = line.strip()

            if line.startswith("# sent_id"):  # New sentence begins
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []

            elif line:
                if not line.startswith("#"):
                    parts = line.split(" ")
                    word = parts[1]
                    tag = parts[3]
                    current_sentence.append((word, tag))

        if current_sentence:
            sentences.append(current_sentence)

    return sentences

test_data = parse_test_data(test_file_path)

In [32]:
# use test data to predict tags
predicted_tags = []
gold_std_tags = []

for i in range(len(test_data)):
    s = []

    for w in test_data[i]:
        s.append(w[0])
        gold_std_tags.append(w[1])
    prediction = viterbi(s)
    for pred_tag in prediction:
        predicted_tags.append(pred_tag)

print(len(gold_std_tags))
print(len(predicted_tags))

KeyError: 'prevalence'

In [None]:
accuracy = accuracy_score(gold_std_tags, predicted_tags)
precision = precision_score(gold_std_tags, predicted_tags, average='weighted',zero_division=0)
recall = recall_score(gold_std_tags, predicted_tags, average='weighted',zero_division=0)
f1 = f1_score(gold_std_tags, predicted_tags, average='weighted',zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
