In [1]:
import numpy as np
# Turn floating-point underflow into an exception for the rest of the notebook.
# The displayed cell output is the previous error-handling configuration.
np.seterr(under='raise')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
from collections import Counter, deque
import numpy as np
from collections import Counter


def make_sequences(path, prj_idcs):
    """
    read blank-line separated sequences from a whitespace-delimited file.

    Every non-empty line is split on whitespace and projected onto the
    requested columns; an empty line closes the current sequence.

    Parameters
    ----------
    path : str
        file to read.
    prj_idcs : list of int
        column indices to keep. With more than one index every item is a
        tuple of the selected columns, with exactly one index the item is
        the bare column value.

    Returns
    -------
    sequences : list of tuple
        one tuple of items per sequence, in file order.
    """
    keep_tuple = len(prj_idcs) > 1
    sequences = []
    sequence = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            if fields:
                if keep_tuple:
                    sequence.append(tuple([fields[i] for i in prj_idcs]))
                else:
                    sequence.append(fields[prj_idcs[0]])
            else:
                sequences.append(tuple(sequence))
                sequence = []
    # a file that does not end with a blank line still holds one open
    # sequence at this point; without this flush it would be lost.
    if sequence:
        sequences.append(tuple(sequence))
    return sequences


def make_ngrams(sequences, n, n_edge_marks=0):
    """
    produce ngrams from a collection of sequences.

    Parameters
    ----------
    sequences : iterable of iterables
        Input sequences; ngrams never span two sequences.
    n : int
        size of the ngrams.
    n_edge_marks : int, optional
        number of '<BOS>' and '<EOS>' marks to surround each sequence with

    Yields
    ------
    ngram : tuple
        tuple of `n` tokens, or the bare token when `n` is 1.
        Sequences that are shorter than `n` after padding yield nothing.

    Raises
    ------
    ValueError
        if `n` <= 0 or `n_edge_marks` < 0 (raised on first iteration,
        because this is a generator).
    """
    if n <= 0:
        raise ValueError('ngram size <= 0 is invalid.')
    if n_edge_marks < 0:
        raise ValueError('n_edge_marks < 0 is invalid.')

    bos = ('<BOS>',) * n_edge_marks
    eos = ('<EOS>',) * n_edge_marks

    for sequence in sequences:
        # tuple() lets lists and other iterables through, not only tuples
        padded = bos + tuple(sequence) + eos
        if len(padded) < n:
            # too short for even one full ngram; emitting a truncated
            # window would mix ngram sizes (and crash on an empty window)
            continue
        # maxlen makes append() drop the oldest token automatically
        window = deque(padded[:n], maxlen=n)
        yield tuple(window) if n > 1 else window[0]
        for token in padded[n:]:
            window.append(token)
            yield tuple(window) if n > 1 else window[0]


def count_ngrams(sequence, n, n_edge_marks):
    """
    tally how often each ngram occurs.

    All three arguments are forwarded unchanged to `make_ngrams`; the
    result maps every produced ngram to its number of occurrences.
    """
    tally = Counter()
    tally.update(make_ngrams(sequence, n, n_edge_marks))
    return tally

from frozendict import frozendict
from itertools import chain

class X2ID:
    """
    maps items of type X to IDs and vice versa.

    IDs are assigned in order of first appearance, starting at 0, so every
    distinct item gets exactly one ID and the IDs are contiguous even when
    the input containers hold duplicates.

    Indexing with a plain `int` is interpreted as an ID lookup (ID -> item);
    indexing with anything else is an item lookup (item -> ID). Items that
    are themselves plain ints can therefore not be looked up via `[]`.
    """

    def __init__(self, *containers, unknown=None):
        """
        Parameters
        ----------
        *containers : iterables
            items to register, processed left to right.
        unknown : optional
            item whose ID is returned when a lookup fails.
        """
        self.unknown = unknown
        items = []
        ids = {}
        for item in chain(*containers):
            # a repeated item keeps the ID of its first occurrence; letting
            # a later position overwrite it would leave gaps in the ID range
            # and break the ID -> item direction.
            if item not in ids:
                ids[item] = len(items)
                items.append(item)
        self.d = frozendict(ids)
        # position i holds the item with ID i
        self._items = tuple(items)

    def get_value(self, n):
        """
        return the item with ID `n`; negative `n` counts from the end.

        Raises
        ------
        IndexError
            if `n` is outside the range of assigned IDs.
        """
        if n < 0:
            n += len(self._items)
        if not 0 <= n < len(self._items):
            raise IndexError("dictionary index out of range")
        return self._items[n]

    def __getitem__(self, item):
        """
        ID -> item for plain ints, item -> ID otherwise.

        A failed lookup falls back to the ID of `unknown`; if `unknown`
        is not registered either, a KeyError is raised.
        """
        try:
            return self.get_value(item) if type(item) is int else self.d[item]
        except (KeyError, IndexError):
            # only lookup misses trigger the fallback; anything else
            # (e.g. KeyboardInterrupt) propagates untouched.
            return self.d[self.unknown]
        
        
from typing import Dict
import pandas as pd
from scipy.sparse import csr_matrix


def make_count_matrix(counts: Dict, smoothing_constant=0, shape=None):
    """
    turn a dict of pair counts into a dense, optionally smoothed matrix.

    Parameters
    ----------
    counts : dict
        maps `(a, b)` index pairs to counts. `a` is used as the COLUMN
        index and `b` as the ROW index.
    smoothing_constant : number, optional
        added to every cell of the dense matrix (additive smoothing).
    shape : tuple of int, optional
        `(n_rows, n_cols)` of the result. When omitted the shape is
        inferred from the largest indices in `counts`, which can make the
        matrix smaller than the ID range if the highest IDs never occur.

    Returns
    -------
    numpy.ndarray
        dense matrix with `result[b, a] == counts[(a, b)] + smoothing_constant`.
    """
    row_indxs = []
    col_indxs = []
    dat_values = []

    for (a, b), cnt in counts.items():
        col_indxs.append(a)
        row_indxs.append(b)
        dat_values.append(cnt)

    sparse = csr_matrix((dat_values, (row_indxs, col_indxs)), shape=shape)
    return sparse.toarray() + smoothing_constant

In [3]:
from scipy.special import logsumexp


def logdot(a, b):
    """
    dot product of two arrays that hold logarithms.

    Computes `log(dot(exp(a), exp(b)))`. The maximum of each operand is
    subtracted before exponentiating and added back afterwards, which
    keeps the largest exponentiated value at 1.

    Parameters
    ----------
    a, b : array_like
        log-domain operands with shapes that `np.dot` accepts. They are
        not modified.

    Returns
    -------
    numpy.ndarray or numpy scalar
        log-domain result with the shape `np.dot(a, b)` would have
        (a scalar for two 1-D operands).
    """
    a = np.asarray(a)
    b = np.asarray(b)
    max_a, max_b = np.max(a), np.max(b)
    # an operand that is -inf everywhere (probability 0) has max -inf;
    # subtracting it would produce -inf - (-inf) = nan, so do not shift then.
    shift_a = max_a if np.isfinite(max_a) else 0.0
    shift_b = max_b if np.isfinite(max_b) else 0.0
    # no `out=` arguments: np.dot returns a scalar for vector operands and
    # scalars cannot be used as output buffers.
    c = np.dot(np.exp(a - shift_a), np.exp(b - shift_b))
    return np.log(c) + (shift_a + shift_b)


# https://www.informatik.uni-leipzig.de/~droste/papers/Droste-Stueber-Vogler-final.pdf
class Bimonoid():
    """Abstract interface for the arithmetic used elsewhere in this notebook.

    Every method raises NotImplementedError; a concrete implementation is
    expected to provide all five operations.
    """

    def add(self, *args):
        """abstract addition of the given operands"""
        raise NotImplementedError

    def zeros(self, *args):
        """identity for addition"""
        raise NotImplementedError

    def ones(self, *args):
        """identity for multiplication"""
        raise NotImplementedError

    def mul(self, a, b):
        """abstract multiplication of `a` and `b`"""
        raise NotImplementedError

    def scale(self, a):
        """maps `a` to the elements of the set
        the bimonoid is defined over."""
        raise NotImplementedError


class LogBimonoid:
    """Log-domain arithmetic built on numpy.

    `add` is elementwise addition and `mul` is `logdot` (a matrix
    logsumexp). Per the original author's note, `mul` is not distributive
    over `add` under these definitions, which is why the structure is
    described as a bimonoid.

    NOTE(review): this class defines the same five methods as `Bimonoid`
    but does not subclass it.
    """

    def add(self, *args, **kwargs):
        """elementwise addition; all arguments are forwarded to `np.add`."""
        return np.add(*args, **kwargs)

    def zeros(self, *args):
        """array of zeros; all arguments are forwarded to `np.zeros`."""
        return np.zeros(*args)

    def ones(self, *args):
        """array of ones; all arguments are forwarded to `np.ones`.

        NOTE(review): an all-ones array does not look like an identity
        for `logdot`; confirm the intended use before relying on it.
        """
        return np.ones(*args)

    def mul(self, a, b):
        """log-domain dot product of `a` and `b`, delegated to `logdot`."""
        return logdot(a, b)

    def scale(self, a):
        """natural logarithm of `a` (elementwise, via `np.log`)."""
        return np.log(a)

In [4]:
# nsr = LogBimonoid()

# a = nsr.scale(np.random.random(size=(4,4)))
# b = nsr.scale(np.random.random(size=(4,4)))
# c = nsr.scale(np.random.random(size=(4,4)))

# # a * (b + c) /= ab + ac
# print(np.allclose(nsr.mul(a, nsr.add(b, c)), nsr.add(nsr.mul(a, b), nsr.mul(a, c))))
# # (b + c) * a /= ab + ac
# print(np.allclose(nsr.mul(nsr.add(b, c), a), nsr.add(nsr.mul(a, b), nsr.mul(a, c))))

In [31]:
import numpy as np
from numpy.random import dirichlet
import matplotlib.pyplot as plt
from functools import reduce


class HMM:
    """
    Hidden Markov model whose arithmetic is delegated to a bimonoid.

    Conventions used by every method:

    * `initial` (alias `initials`) has one entry per hidden state.
    * `transitions[to, from]`: column index is FROM, row index is TO.
    * `emissions[observation]` has one entry per hidden state.
    * `bimonoid.add` combines the factors along one path and
      `bimonoid.mul` is the matrix product that sums over predecessor
      states. With the default `LogBimonoid` all parameters are
      log-probabilities.
    """

    def __init__(self, bimonoid=None, n_hidden=None, n_observed=None):
        """
        Parameters
        ----------
        bimonoid : optional
            object providing `add`, `mul`, `zeros` and `scale`;
            defaults to `LogBimonoid()`.
        n_hidden : int, optional
            number of hidden states. When given, initial and transition
            parameters are drawn at random.
        n_observed : int, optional
            number of observation symbols. When given together with
            `n_hidden`, emission parameters are drawn at random too.
        """
        if bimonoid is None:
            bimonoid = LogBimonoid()
        self.__bimonoid = bimonoid
        # parameters stay None until drawn here or assigned via the setters
        self.n_hidden = None
        self.__initial = None
        self.__transitions = None
        self.__emissions = None
        if n_hidden:
            self.n_hidden = n_hidden
            self.__initial = self.bimonoid.scale(dirichlet(np.ones(n_hidden)))
            # dirichlet() normalises each ROW; transpose so that every
            # column (a FROM state) is a distribution over TO states.
            self.__transitions = self.bimonoid.scale(
                dirichlet(np.ones(n_hidden), size=n_hidden).T)
            if n_observed:
                # one distribution over observations per hidden state,
                # transposed into the (observation, state) layout.
                self.__emissions = self.bimonoid.scale(
                    dirichlet(np.ones(n_observed), size=n_hidden).T)

    @property
    def bimonoid(self):
        """the arithmetic object used by this model (read-only)."""
        return self.__bimonoid

    @property
    def initial(self):
        """initial state scores, one entry per hidden state."""
        return self.__initial

    @initial.setter
    def initial(self, x):
        self.__initial = x

    @property
    def initials(self):
        """alias of `initial`; both names refer to the same parameter."""
        return self.__initial

    @initials.setter
    def initials(self, x):
        self.__initial = x

    @property
    def transitions(self):
        """square transition matrix, indexed `[to, from]`."""
        return self.__transitions

    @transitions.setter
    def transitions(self, x):
        assert x.shape[0] == x.shape[1], 'transition matrix must be square.'
        self.n_hidden = x.shape[0]
        self.__transitions = x

    @property
    def emissions(self):
        """emission matrix, indexed `[observation, state]`."""
        return self.__emissions

    @emissions.setter
    def emissions(self, x):
        self.__emissions = x

    def alpha(self, idcs, t, j):
        """forward score of hidden state `j` after the first `t` observations."""
        return self.forward(idcs, t)[-1][j]

    def forward_prob(self, *args, **kwargs):
        """
        total score of an observation sequence.

        Arguments are forwarded to `forward`. The scores of the last
        time step are summed over all states with the bimonoid's own
        matrix product, so the result lives in the same domain as the
        parameters (a log-probability for `LogBimonoid`).

        Raises
        ------
        ValueError
            if the observation sequence is empty.
        """
        trellis = self.forward(*args, **kwargs)
        if len(trellis) == 0:
            raise ValueError('cannot score an empty observation sequence.')
        # multiplying with a row of scaled ones sums over the states; the
        # row is kept 2-D so `mul` always returns an array.
        unit_row = self.bimonoid.scale(np.ones((1, trellis.shape[1])))
        return self.bimonoid.mul(unit_row, trellis[-1])[0]

    def forward(self, idcs, t=None):
        """
        forward algorithm.

        Parameters
        ----------
        idcs : sequence of int
            observation IDs.
        t : int, optional
            number of leading observations to process; `None` or 0 means
            all of them, larger values are clipped to `len(idcs)`.

        Returns
        -------
        numpy.ndarray
            trellis of shape `(t, n_hidden)`; row `k` holds the forward
            scores after observation `k`. Empty input gives zero rows.
        """
        if not t:
            t = len(idcs)
        # never allocate rows for observations that do not exist
        t = min(t, len(idcs))

        trellis = self.bimonoid.zeros((t, self.n_hidden))
        if t == 0:
            return trellis
        trellis[0] = self.bimonoid.add(self.initial, self.emissions[idcs[0]])

        for i, idx in enumerate(idcs[1:t]):
            trellis[i+1] = self.bimonoid.add(self.bimonoid.mul(
                self.transitions, trellis[i]), self.emissions[idx])

        return trellis

    def backward(self, idcs):
        """backward algorithm; not implemented yet, returns None."""
        pass

    def viterbi(self, idcs):
        """
        most likely hidden state sequence for the observations `idcs`.

        Candidate paths are compared with `np.max`, so larger parameter
        values must mean more likely (true for probabilities and for
        log-probabilities).

        Returns
        -------
        list of int
            one state ID per observation; empty for empty input.
        """
        n_steps = len(idcs)
        if n_steps == 0:
            return []

        # best score of any path ending in each state, per time step
        trellis = self.bimonoid.zeros((n_steps, self.n_hidden))
        trellis[0] = self.bimonoid.add(self.initial, self.emissions[idcs[0]])
        # predecessor state of that best path; row 0 is never read
        backpointers = np.zeros((n_steps, self.n_hidden), dtype=int)

        for i, idx in enumerate(idcs[1:]):
            # scores[to, from]: extend the best path ending in `from`
            # with the step from -> to and the emission in `to`
            scores = self.bimonoid.add(self.bimonoid.add(
                self.transitions, trellis[i]), self.emissions[idx][:, np.newaxis])

            trellis[i+1] = np.max(scores, axis=1)
            backpointers[i+1] = np.argmax(scores, axis=1)

        # walk back from the best final state
        state = int(np.argmax(trellis[-1]))
        path = [state]
        for row in backpointers[:0:-1]:
            state = int(row[state])
            path.append(state)

        return path[::-1]

In [48]:
def f(acc, x):
    """Print the current element and accumulator, then add the code point of `x` to `acc`."""
    report = (f'x : {x}', f'acc : {acc}')
    for entry in report:
        print(entry)
    return ord(x) + acc


# The leading 0 is the starting accumulator; reduce() feeds it to `f` first.
reduce(f, [0, 'a', 'b', 'c'])

x : a
acc : 0
x : b
acc : 97
x : c
acc : 195


294

97

In [6]:
path = 'data_tiger_annotated.txt'
tag_sqs = make_sequences(path, [1])
token_tag_sqs = make_sequences(path, [0, 1])
# extra one-item sequence so the placeholder for unseen tokens gets an ID
token_tag_sqs.append((('<ukn>', 'XY'),))

TgUgCnts = count_ngrams(tag_sqs, 1, 2)  # tag unigram counts
TgBgCnts = count_ngrams(tag_sqs, 2, 2)  # tag bigram counts
TkTgCnts = count_ngrams(token_tag_sqs, 1, 0)  # token-tag pair counts

tags = list(TgUgCnts.keys())
# TkTgCnts has one key per (token, tag) pair, so a token seen with several
# tags would be listed several times. Keep each token once, in order of
# first appearance; otherwise the token IDs get gaps and the smoothed
# emission matrix contains rows that belong to no token.
tokens = tuple(dict.fromkeys(tkn for tkn, _ in TkTgCnts.keys()))

Tg2id = X2ID(tags)  # maps tags to IDs
Tk2id = X2ID(tokens, unknown='<ukn>')  # maps tokens to IDs, unseen ones to '<ukn>'

# make new count dicts with IDs rather than strings
TgUgCnts = {Tg2id[tag]: cnt for tag, cnt in TgUgCnts.items()}
TgBgCnts = {tuple(Tg2id[tag] for tag in TgTuple): cnt
            for TgTuple, cnt in TgBgCnts.items()}
TkTgCnts = {(Tk2id[tkn], Tg2id[tag]): cnt
            for (tkn, tag), cnt in TkTgCnts.items()}

TgUgVec = np.array(list(TgUgCnts.values()))
TgBgMat = make_count_matrix(TgBgCnts, 1)
TkTgMat = make_count_matrix(TkTgCnts, 1).T  # transpose, so tags are columns

# every vector / matrix column is normalised to sum to 1
initial_probs = TgUgVec/(np.sum(TgUgVec))
transition_probs = TgBgMat/(np.sum(TgBgMat, axis=0))
emission_probs = TkTgMat/(np.sum(TkTgMat, axis=0))

In [29]:
# Tag one example sentence with the parameters estimated from the corpus counts.
# HMM() falls back to LogBimonoid, so the parameters are passed as logarithms.
hmm = HMM()

hmm.initials = np.log(initial_probs)
hmm.transitions = np.log(transition_probs)
hmm.emissions = np.log(emission_probs)

sentence = 'Der Viterbi-Algorithmus ist ein Algorithmus der dynamischen Programmierung zur Bestimmung der wahrscheinlichsten Sequenz von verborgenen Zuständen bei einem gegebenen Hidden Markov Model (HMM) und einer beobachteten Sequenz von Symbolen .'
sentence = sentence.split()
# tokens -> IDs, decode the state sequence, then map each state ID back to its tag
[(Tg2id[int(s)], w) for s, w in zip(hmm.viterbi([Tk2id[w] for w in sentence]), sentence)]

14
[ 0.  2.  2.  2.  2.  7.  5.  5.  2.  2.  7.  7.  7.  7.  2. 15.  5.  2.
  2.  4.  7.  2.  2.  7.  7.  1.  7.  5.  7.  5. 36. 31. 36. 24. 27. 27.
 22.  7. 24.  1.  7.  1. 31. 24.  5. 16. 24. 16.  5. 24. 31.  5.  1. 24.
 24.]
2
[10. 10. 10.  2. 10. 10. 10. 10.  8.  2. 10.  2. 10. 10. 10. 10. 10. 10.
 10. 10. 10.  2.  2. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10.
 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10. 10.
 10.]
10
[ 0.  7.  7.  7.  7.  7.  5.  5. 14.  7.  7.  7. 10.  7.  7. 15.  7.  7.
  7. 22.  7.  7.  7.  7. 10. 22.  7.  7.  7. 10.  9. 10.  9.  1. 10.  7.
 22.  7. 10. 22.  7. 22.  1. 22. 10. 13. 20. 13.  7.  1.  7.  5.  1. 37.
  7.]
7
[ 0.  7.  7.  7.  7.  7.  6.  6.  7.  7.  7.  7.  7.  7.  7. 15.  7.  7.
  7.  1.  7.  7.  7.  7.  7.  1.  7.  7.  7.  7.  7.  7.  1.  6.  1.  7.
 10.  7. 10.  1.  7.  1.  1. 10.  6.  7. 26.  7.  7.  1.  7.  6.  1.  6.
  7.]
6
[ 5.  5.  5. 29.  5. 29.  5.  5.  8. 29.  5. 29.  5. 29. 29.  5.  5.  5.
  5.  5. 29. 29

[('ART', 'Der'),
 ('NN', 'Viterbi-Algorithmus'),
 ('VAFIN', 'ist'),
 ('ART', 'ein'),
 ('NN', 'Algorithmus'),
 ('ART', 'der'),
 ('ADJA', 'dynamischen'),
 ('NN', 'Programmierung'),
 ('APPRART', 'zur'),
 ('NN', 'Bestimmung'),
 ('ART', 'der'),
 ('ADJA', 'wahrscheinlichsten'),
 ('NN', 'Sequenz'),
 ('APPR', 'von'),
 ('ART', 'verborgenen'),
 ('NN', 'Zuständen'),
 ('APPR', 'bei'),
 ('ART', 'einem'),
 ('ADJA', 'gegebenen'),
 ('NN', 'Hidden'),
 ('APPR', 'Markov'),
 ('ART', 'Model'),
 ('NN', '(HMM)'),
 ('KON', 'und'),
 ('ART', 'einer'),
 ('ADJA', 'beobachteten'),
 ('NN', 'Sequenz'),
 ('APPR', 'von'),
 ('NE', 'Symbolen'),
 ('$.', '.')]

In [25]:
# Rebuild the model with the same parameters as in the tagging cell.
hmm = HMM()

hmm.initials = np.log(initial_probs)
hmm.transitions = np.log(transition_probs)
hmm.emissions = np.log(emission_probs)

# Re-read the token column, convert every sequence to token IDs and keep
# the first 10000 sequences for the timing run.
sentences = [[Tk2id[tkn] for tkn in sentence] for sentence in make_sequences(path, [0])][:10000]

def f():
    """Run the forward pass on every entry of `sentences`, printing a running index."""
    position = 0
    for token_ids in sentences:
        hmm.forward(token_ids)
        # '\r' rewinds the cursor so the counter overwrites itself
        print(position, end='\r')
        position += 1
        
%time f()

CPU times: user 26.8 s, sys: 1.14 s, total: 28 s
Wall time: 27.2 s


In [9]:
# Total number of tokens in `sentences`, divided by 21.
# NOTE(review): the divisor 21 is not explained anywhere in this notebook.
np.sum([len(s) for s in sentences])/21

8551.47619047619

In [10]:
# Number of sequences in `sentences`, divided by 20.
# NOTE(review): the divisor 20 is not explained (and differs from the 21 used above).
len(sentences)/20

500.0