In [1]:
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
from conllu import parse_incr

In [2]:
# Open the three CoNLL-U splits as UTF-8 text. The handles stay open so that
# later cells can stream sentences from them.
train = open("data/train.conllu", mode="r", encoding="utf-8")
test = open("data/test.conllu", mode="r", encoding="utf-8")
val = open("data/val.conllu", mode="r", encoding="utf-8")

# Tag inventory. The order fixes the row/column order of the transition matrix.
# 'START' and 'END' are artificial sentence-boundary tags.
tags = [
    'START',
    'O',
    'I-LOC', 'I-MISC', 'I-ORG', 'I-PER',
    'B-LOC', 'B-MISC', 'B-PER', 'B-ORG',
    'END',
]

In [3]:
def get_transition_matrix(tags, train):
    """Estimate tag-to-tag transition probabilities from a CoNLL-U file.

    Every sentence is treated as the tag sequence
    ``START, tag_1, ..., tag_n, END``. The probability of moving from tag
    ``a`` to tag ``b`` is ``count(a -> b) / count(a)``.

    The tag of a token is read from its ``'lemma'`` field.

    Parameters
    ----------
    tags : sequence of str
        Tag inventory. Its order defines the row ("from" tag) and column
        ("to" tag) order of the result. Tags found in the file but missing
        from ``tags`` are counted in the denominators but get no row/column.
    train : seekable text file object
        Opened CoNLL-U file. It is rewound before and after reading, so the
        result does not depend on the handle's current position and the handle
        can be reused by the caller afterwards.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(tags), len(tags))``. Rows of tags
        that never start a transition (e.g. ``'END'``) are all zeros.
    """
    # Read from the beginning regardless of what an earlier cell did with the handle.
    train.seek(0)

    tag_counter = Counter()
    transition_counter = Counter()

    for sentence in parse_incr(train):
        if not sentence:
            # A sentence without tokens has no first/last tag to link to START/END.
            continue

        padded = ['START'] + [token['lemma'] for token in sentence] + ['END']
        # Every element except the trailing 'END' is the source of exactly one transition.
        tag_counter.update(padded[:-1])
        transition_counter.update(zip(padded, padded[1:]))

    transition_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
    for i, source in enumerate(tags):
        total = tag_counter[source]
        if not total:
            continue  # tag never observed as a source: leave its row at zero
        for j, target in enumerate(tags):
            transition_matrix[i, j] = transition_counter[(source, target)] / total

    # Leave the handle rewound for whoever reads the file next.
    train.seek(0)
    return transition_matrix

In [4]:
# Label the raw probabilities with the tag names: rows are the "from" tag,
# columns are the "to" tag.
transition_matrix = pd.DataFrame(
    get_transition_matrix(tags, train),
    index=tags,
    columns=tags,
)
transition_matrix

Unnamed: 0,START,O,I-LOC,I-MISC,I-ORG,I-PER,B-LOC,B-MISC,B-PER,B-ORG,END
START,0.0,0.915552,0.0,0.0,0.0,0.0,0.023156,0.012263,0.038115,0.010915,0.0
O,0.0,0.874402,0.0,0.001979,0.0,0.0,0.027177,0.017658,0.018936,0.011958,0.04789
I-LOC,0.0,0.754676,0.244685,0.0,0.0,0.0,0.000213,5.3e-05,0.00032,5.3e-05,0.0
I-MISC,0.0,0.462849,3e-05,0.536522,3e-05,6e-05,0.0,0.000509,0.0,0.0,0.0
I-ORG,0.0,0.586095,0.0,0.0,0.413267,0.0,0.0,0.0,0.000245,0.000392,0.0
I-PER,0.0,0.876705,0.0,3.4e-05,0.0,0.123159,0.0,0.0,0.000102,0.0,0.0
B-LOC,0.0,0.739153,0.258169,0.0,0.0,5.5e-05,0.002441,0.0,0.000182,0.0,0.0
B-MISC,0.0,0.654785,0.0,0.321987,0.0,2.8e-05,0.0,0.022703,0.000498,0.0,0.0
B-PER,0.0,0.358111,0.0,0.0,0.0,0.641541,0.0,0.0,0.000348,0.0,0.0
B-ORG,0.0,0.503447,0.0,0.0,0.493953,0.0,0.0,0.0,0.000454,0.002147,0.0


In [5]:
def get_emission_probabilities(train):
    """Estimate word emission probabilities per tag from a CoNLL-U file.

    For every ``(word, tag)`` pair seen in the file the probability is
    ``count(word, tag) / count(tag)``. The word is the token's ``'form'``
    field and the tag is read from its ``'lemma'`` field.

    Parameters
    ----------
    train : text file object
        Opened CoNLL-U file. If it is seekable it is rewound before reading,
        so the result does not depend on where a previous reader left the
        handle. The handle is left at end-of-file afterwards.

    Returns
    -------
    dict
        Maps ``(word, tag)`` to its emission probability. Pairs that were
        never observed are absent.
    """
    # Start from the top of the file; without this a handle that has already
    # been consumed would silently yield an empty result.
    if train.seekable():
        train.seek(0)

    word_tag_count = Counter()
    tag_counter = Counter()

    for sentence in parse_incr(train):
        for token in sentence:
            word, tag = token['form'], token['lemma']
            word_tag_count[(word, tag)] += 1
            tag_counter[tag] += 1

    # Normalise each pair count by the total number of tokens carrying that tag.
    return {
        (word, tag): count / tag_counter[tag]
        for (word, tag), count in word_tag_count.items()
    }

def get_emission_matrix(tags, words, emission_probabilities):
    """Lay out emission probabilities as a dense tag-by-word matrix.

    Parameters
    ----------
    tags : sequence of str
        Row labels, in row order.
    words : sequence of str
        Column labels, in column order.
    emission_probabilities : dict
        Maps ``(word, tag)`` to a probability. Missing pairs count as 0.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(len(tags), len(words))`` where entry
        ``[i, j]`` is the probability stored for ``(words[j], tags[i])``.
    """
    n_tags, n_words = len(tags), len(words)
    matrix = np.zeros((n_tags, n_words), dtype='float32')

    # Fill one tag row at a time; unseen (word, tag) pairs fall back to 0.
    for row, tag in enumerate(tags):
        matrix[row, :] = [emission_probabilities.get((word, tag), 0) for word in words]

    return matrix

In [6]:
# Build the (word, tag) -> probability lookup from the training split.
emission_probabilities = get_emission_probabilities(train)
# The training handle is closed here, so any later read from `train` requires
# re-opening the file first.
train.close()