In [1]:
import numpy as np
np.seterr(under='raise')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
from collections import Counter, deque
import numpy as np
from collections import Counter


def make_sequences(path, prj_idcs):
    sequences = []
    with open(path, 'r') as f:
        sequence = []
        for line in f:
            line = line.strip()
            if line:
                line = line.split()
                sequence.append(tuple([line[i] for i in prj_idcs]) if len(
                    prj_idcs) > 1 else line[prj_idcs[0]])
            else:
                sequences.append(tuple(sequence))
                sequence = []
    return sequences


def make_ngrams(sequences, n, n_edge_marks=0):
    """
    produce ngrams from a sequence.

    Parameters
    ----------
    sequence : iterable
        Input sequence.
    n : int
        size of the ngrams.
    n_edge_marks : int, optional
        number of BOS and EOS marks to surround the sequence with

    Yields
    ------
    ngram : tuple
    """
    if n <= 0:
        raise ValueError('ngram size <= 0 is invalid.')
    if n_edge_marks < 0:
        raise ValueError('n_edge_marks < 0 is invalid.')

    for sequence in sequences:
        sequence = tuple(['<BOS>']*(n_edge_marks)) + \
            sequence + \
            tuple(['<EOS>']*n_edge_marks)
        ngram = deque()
        for token in sequence[:n]:
            ngram.append(token)
        for token in sequence[n:]:
            yield tuple(ngram) if n > 1 else ngram[0]
            ngram.append(token)
            ngram.popleft()
        yield tuple(ngram) if n > 1 else ngram[0]


def count_ngrams(sequence, n, n_edge_marks):
    return Counter(make_ngrams(sequence, n, n_edge_marks))

from frozendict import frozendict
from itertools import chain

class X2ID:
    """
    maps item of type X to IDs and vice versa.
    only works in Python 3.7+.
    """

    def __init__(self, *containers, unknown=None):
        self.unknown = unknown
        self.d = frozendict({s: i for i, s in enumerate(chain(*containers))})

    def get_value(self, n):
        """
        This sucks. Needs C++ style iterators.
        """
        if n < 0:
            n += len(self.d)
        for i, key in enumerate(self.d.keys()):
            if i == n:
                return key
        raise IndexError("dictionary index out of range")

    def __getitem__(self, item):
        try:
            return self.get_value(item) if type(item) is int else self.d[item]
        except:
            return self.d[self.unknown]
        
        
from typing import Dict
import pandas as pd
from scipy.sparse import csr_matrix


def make_count_matrix(counts: Dict, smoothing_constant=0):

    row_indxs = []
    col_indxs = []
    dat_values = []

    for i, ((a, b), cnt) in enumerate(counts.items(), 1):

        col_indxs.append(a)
        row_indxs.append(b)
        dat_values.append(cnt)

    return csr_matrix((dat_values, (row_indxs, col_indxs))).toarray()+smoothing_constant

In [27]:
def logdot(a, b):
    max_a, max_b = np.max(a), np.max(b)
    exp_a, exp_b = a - max_a, b - max_b
    np.exp(exp_a, out=exp_a)
    np.exp(exp_b, out=exp_b)
    c = np.dot(exp_a, exp_b)
    np.log(c, out=c)
    c += max_a + max_b
    return c

class Semiring():
    pass

class LogSemiring(Semiring):
    def mul(self, *args):
        return np.add(*args)
    def zeros(self, *args):
        return np.ones(*args)
    def dot(self, a, b):
        return logdot(a, b)
    def scale(self, a):
        return np.log(a)

In [32]:
import numpy as np
from numpy.random import dirichlet
import matplotlib.pyplot as plt


class HMM:
    def __init__(self, semiring=None, n_hidden=None, n_observed=None):
        if not semiring:
            semiring = LogSemiring()
        self.__semiring = semiring
        if n_hidden:
            self.n_hidden = n_hidden
            self.__initial = self.semiring.scale(
                dirichlet(np.ones(n_hidden), size=1))
            # treat column index as FROM and row index as TO
            self.__transitions = self.semiring.scale(
                dirichlet(np.ones(n_hidden), size=n_hidden))
            if n_observed:
                self.__emissions = self.semiring.scale(dirichlet(
                    np.ones(n_hidden), size=n_observed))

    @property
    def semiring(self):
        return self.__semiring

    @property
    def initial(self):
        return self.__initial

    @initial.setter
    def initial(self, x):
        self.__initial = x

    @property
    def transitions(self):
        return self.__transitions

    @transitions.setter
    def transitions(self, x):
        assert(x.shape[0] == x.shape[1])
        self.n_hidden = x.shape[0]
        self.__transitions = x

    @property
    def emissions(self):
        return self.__emissions

    @emissions.setter
    def emissions(self, x):
        self.__emissions = x

    def forward(self, idcs):

        # stores probabilities for states at current timestep
        trellis = self.semiring.zeros((len(idcs), self.n_hidden))
        trellis[0] = self.semiring.mul(self.initials, self.emissions[idcs[0]])

        for i, idx in enumerate(idcs[1:]):
            trellis[i+1] = self.semiring.mul(self.semiring.dot(
                self.transitions, trellis[i]), self.emissions[idx])

        return trellis

    def viterbi(self, idcs):

        # stores probabilities for states at current timestep
        trellis = self.semiring.zeros((len(idcs), self.n_hidden))
        trellis[0] = self.semiring.mul(self.initials, self.emissions[idcs[0]])

        for i, idx in enumerate(idcs[1:]):
            trellis[i+1] = np.max(
                self.semiring.mul(
                    self.semiring.mul(
                        self.transitions, trellis[i]),
                    self.emissions[idx][:, np.newaxis]),
                axis=1)

        return np.argmax(trellis, axis=1)

In [4]:
import numpy as np

def logdot(a, b):
    max_a, max_b = np.max(a), np.max(b)
    exp_a, exp_b = a - max_a, b - max_b
    np.exp(exp_a, out=exp_a)
    np.exp(exp_b, out=exp_b)
    c = np.dot(exp_a, exp_b)
    np.log(c, out=c)
    c += max_a + max_b
    return c

a = np.random.random(size=(3,3))
b = np.random.random(size=(3,3))

print(a)
print(b)

np.allclose(logdot(np.log(a), np.log(b)), np.log(np.dot(a, b)))

[[0.91359743 0.63755533 0.63544169]
 [0.71148476 0.84102533 0.37839201]
 [0.85545964 0.60684272 0.45200178]]
[[0.16522363 0.37474642 0.32547586]
 [0.46284152 0.17015725 0.4010522 ]
 [0.26523095 0.72762166 0.36390184]]


True

In [5]:
path = 'data_tiger_annotated.txt'
tag_sqs = make_sequences(path, [1])
token_tag_sqs = make_sequences(path, [0, 1])
token_tag_sqs.append((('<ukn>', 'XY'),))


TgUgCnts = count_ngrams(tag_sqs, 1, 2)  # tag unigram counts
TgBgCnts = count_ngrams(tag_sqs, 2, 2)  # tag bigram counts
TkTgCnts = count_ngrams(token_tag_sqs, 1, 0)  # token-tag pair counts

tags = list(TgUgCnts.keys())
tokens = list(zip(*TkTgCnts.keys()))[0]

Tg2id = X2ID(tags)  # maps tags to IDs
Tk2id = X2ID(tokens, unknown='<ukn>')  # maps tags and tokens to IDs

# make new count dicts with IDs rather than strings
TgUgCnts = {Tg2id[tag]: cnt for tag, cnt in TgUgCnts.items()}
TgBgCnts = {tuple(Tg2id[tag] for tag in TgTuple)
                  : cnt for TgTuple, cnt in TgBgCnts.items()}
TkTgCnts = {(Tk2id[tkn], Tg2id[tag]): cnt for (
    tkn, tag), cnt in TkTgCnts.items()}

TgUgVec = np.array(list(TgUgCnts.values()))
TgBgMat = make_count_matrix(TgBgCnts, 1)
TkTgMat = make_count_matrix(TkTgCnts, 1).T  # tranpose, so tags are columns

initial_probs = TgUgVec/(np.sum(TgUgVec))
transition_probs = TgBgMat/(np.sum(TgBgMat, axis=0))
emission_probs = TkTgMat/(np.sum(TkTgMat, axis=0))

In [33]:
hmm = HMM()

hmm.initials = np.log(initial_probs)
hmm.transitions = np.log(transition_probs)
hmm.emissions = np.log(emission_probs)

sentence = 'Der Viterbi-Algorithmus ist ein Algorithmus der dynamischen Programmierung zur Bestimmung der wahrscheinlichsten Sequenz von verborgenen Zuständen bei einem gegebenen Hidden Markov Model (HMM) und einer beobachteten Sequenz von Symbolen .'
sentence = sentence.split()
[(Tg2id[int(s)], w) for s, w in zip(hmm.viterbi([Tk2id[w] for w in sentence]), sentence)]

[('ART', 'Der'),
 ('NN', 'Viterbi-Algorithmus'),
 ('VAFIN', 'ist'),
 ('ART', 'ein'),
 ('NN', 'Algorithmus'),
 ('ART', 'der'),
 ('ADJA', 'dynamischen'),
 ('NN', 'Programmierung'),
 ('APPRART', 'zur'),
 ('NN', 'Bestimmung'),
 ('ART', 'der'),
 ('NN', 'wahrscheinlichsten'),
 ('NN', 'Sequenz'),
 ('APPR', 'von'),
 ('ART', 'verborgenen'),
 ('NN', 'Zuständen'),
 ('APPR', 'bei'),
 ('ART', 'einem'),
 ('ADJA', 'gegebenen'),
 ('NN', 'Hidden'),
 ('APPR', 'Markov'),
 ('<EOS>', 'Model'),
 ('<EOS>', '(HMM)'),
 ('KON', 'und'),
 ('ART', 'einer'),
 ('NN', 'beobachteten'),
 ('NN', 'Sequenz'),
 ('APPR', 'von'),
 ('ART', 'Symbolen'),
 ('$.', '.')]

In [34]:
sentences = [[Tk2id[tkn] for tkn in sentence] for sentence in make_sequences(path, [0])]

def f():
    for i, sentence in enumerate(sentences):
        hmm.viterbi(sentence)
        print(i, end='\r')
        
%time f()

CPU times: user 1min 10s, sys: 5.18 s, total: 1min 15s
Wall time: 1min 13s


In [8]:
np.sum([len(s) for s in sentences])/72

11179.236111111111

In [9]:
len(sentences)/72

638.5416666666666