In [1]:
import regex as re
import unicodedata
from glob import glob

def tokenize(text):
    """Split ``text`` into word tokens and return them as a list of strings.

    A token starts at a word boundary with a Unicode letter and continues through
    any run of letters, combining marks, digits, straight or curly apostrophes and
    hyphens, up to a closing word boundary. The ``\\p{...}`` classes rely on the
    ``regex`` package that this notebook imports under the name ``re``.
    No case folding or other normalization is applied.
    """
    # Disabled normalization step: text = clean(text.lower())
    word_pattern = r"\b\p{L}[\p{L}\p{M}\p{N}'’-]*\b"
    return re.compile(word_pattern, re.UNICODE).findall(text)

In [3]:
from collections import Counter
from operator import itemgetter

# Frequency of every word type across the raw corpus.
counter = Counter()

# Sort the paths: glob returns them in filesystem-dependent order, and the file order
# decides both the insertion order of `counter` (hence `wordtype_to_ix`) and how
# `most_common` breaks frequency ties, i.e. which word types become features.
for path in sorted(glob('../data/raw/*.txt')):
    # The context manager closes each file once it is consumed; iterating the handle
    # streams lines instead of loading the whole file with readlines().
    # NOTE(review): no encoding is passed, so the platform default applies - confirm
    # that it matches the corpus files.
    with open(path, 'r') as corpus_file:
        for line in corpus_file:
            counter.update(tokenize(line))

# Context features: the most frequent word types plus the two line-boundary markers.
n_most_common_wordtypes = 100
n_features = n_most_common_wordtypes + 2
most_common_wordtypes = list(map(itemgetter(0), counter.most_common(n_most_common_wordtypes))) + ['<BOS>', '<EOS>']

# Feature -> column index, and word type -> row index (every word type seen in the corpus).
feature_to_ix = {feature: ix for ix, feature in enumerate(most_common_wordtypes)}
wordtype_to_ix = {wordtype: ix for ix, wordtype in enumerate(counter)}
n_wordtypes = len(wordtype_to_ix)

In [4]:
import numpy as np

# Context-count matrices: one row per word type, one column per context feature.
# A spare trailing column absorbs neighbours that are not context features: the
# `feature_to_ix.get(..., -1)` lookups below resolve to -1, which indexes the last column.
# That column is sliced off again once counting is done.
# Allocating with dtype=int directly avoids building a float matrix and copying it.
x_wordtype_counts_left = np.zeros((n_wordtypes, n_features + 1), dtype=int)
x_wordtype_counts_right = np.zeros((n_wordtypes, n_features + 1), dtype=int)

for path in glob('../data/raw/*.txt'):
    # The context manager closes each file once it is consumed (a bare open() leaks the handle).
    with open(path, 'r') as corpus_file:
        for line in corpus_file:
            # Pad with boundary markers so the first and last word of a line also have two neighbours.
            tokens = ['<BOS>'] + tokenize(line) + ['<EOS>']
            for ix in range(1, len(tokens) - 1):
                row = wordtype_to_ix[tokens[ix]]
                x_wordtype_counts_left[row, feature_to_ix.get(tokens[ix-1], -1)] += 1
                x_wordtype_counts_right[row, feature_to_ix.get(tokens[ix+1], -1)] += 1

# remove ignored words (drop the spare trailing column)
x_wordtype_counts_left = x_wordtype_counts_left[:, :n_features]
x_wordtype_counts_right = x_wordtype_counts_right[:, :n_features]

# Per-word-type totals over the retained context features only.
x_wordtype_counts_sum_left = x_wordtype_counts_left.sum(axis=1)
x_wordtype_counts_sum_right = x_wordtype_counts_right.sum(axis=1)

In [5]:
z = 10  # number of latent classes

# Uniform prior over the classes.
x_class_priors = np.full(z, 1 / z) # shape: (Z,)
# Random initial class for every word type, drawn from the prior.
x_wordtype_class_assignments = np.random.choice(z, p=x_class_priors, size=n_wordtypes) # shape: (M,)
# minlength guarantees one entry per class even when the highest class index was never
# drawn; without it bincount returns a shorter array and later per-class indexing breaks.
x_class_counts = np.bincount(x_wordtype_class_assignments, minlength=z) # shape: (Z,)

x_class_priors.shape, x_wordtype_class_assignments.shape, x_class_counts.shape, x_wordtype_counts_left.shape, x_wordtype_counts_right.shape

((10,), (108501,), (10,), (108501, 102), (108501, 102))

In [6]:
# Roll the per-word-type context counts up into per-class counts, following the
# current class assignment of every word type. np.add.at accumulates unbuffered,
# so rows belonging to the same class are all added rather than overwritten.

# Left contexts: class x feature counts, then the per-class totals.
x_class_wordtype_counts_left = np.zeros((z, n_features), dtype=int)
np.add.at(x_class_wordtype_counts_left, x_wordtype_class_assignments, x_wordtype_counts_left)
x_class_wordtype_counts_sum_left = x_class_wordtype_counts_left.sum(axis=1)

# Right contexts: same aggregation.
x_class_wordtype_counts_right = np.zeros((z, n_features), dtype=int)
np.add.at(x_class_wordtype_counts_right, x_wordtype_class_assignments, x_wordtype_counts_right)
x_class_wordtype_counts_sum_right = x_class_wordtype_counts_right.sum(axis=1)

In [13]:
from scipy.special import gammaln


alpha = .5  # smoothing added to the class counts in the class term
# Smoothing for the context features.
# NOTE(review): treated as (left, right). The original code used betas[0] for both
# sides; the result is identical while both entries are equal - confirm the intent.
betas = np.array([.5, .5])
n_iterations = 1


def _log_context_score(class_counts, class_totals, wordtype_counts, wordtype_total, beta):
    """Per-class log-score for adding one word type's context counts to each class.

    For every class c this evaluates

        sum_f [lgamma(n_cf + n_wf + beta) - lgamma(n_cf + beta)]
        - [lgamma(N_c + N_w + F * beta) - lgamma(N_c + F * beta)]

    where n_cf / N_c are the class's per-feature counts and their total, n_wf / N_w
    the word type's, and F is the number of feature columns.

    Args:
        class_counts: (n_classes, n_features) counts, excluding the word type being resampled.
        class_totals: (n_classes,) row totals of ``class_counts``.
        wordtype_counts: (n_features,) context counts of the word type.
        wordtype_total: scalar total of ``wordtype_counts``.
        beta: smoothing added to every feature count.

    Returns:
        Array of shape (n_classes,) with the log-score of each class.
    """
    n_dims = class_counts.shape[1]
    per_feature = gammaln(class_counts + wordtype_counts + beta) - gammaln(class_counts + beta)
    totals = gammaln(class_totals + wordtype_total + n_dims * beta) - gammaln(class_totals + n_dims * beta)
    return per_feature.sum(axis=1) - totals


for ix_iteration in range(n_iterations):
    for ix_wordtype in range(n_wordtypes):
        counts_left = x_wordtype_counts_left[ix_wordtype]
        counts_right = x_wordtype_counts_right[ix_wordtype]
        total_left = x_wordtype_counts_sum_left[ix_wordtype]
        total_right = x_wordtype_counts_sum_right[ix_wordtype]

        # --- remove the word type from its current class
        z_old = x_wordtype_class_assignments[ix_wordtype]
        x_class_counts[z_old] -= 1
        x_class_wordtype_counts_left[z_old] -= counts_left
        x_class_wordtype_counts_right[z_old] -= counts_right
        x_class_wordtype_counts_sum_left[z_old] -= total_left
        x_class_wordtype_counts_sum_right[z_old] -= total_right

        # --- recompute the (unnormalized) log posterior over classes
        # Class term; its denominator is common to all classes and therefore dropped.
        log_scores = np.log(x_class_counts + alpha)
        # left context features
        log_scores += _log_context_score(
            x_class_wordtype_counts_left, x_class_wordtype_counts_sum_left,
            counts_left, total_left, betas[0],
        )
        # right context features
        log_scores += _log_context_score(
            x_class_wordtype_counts_right, x_class_wordtype_counts_sum_right,
            counts_right, total_right, betas[1],
        )

        # --- sample the new assignment
        # Done inline (max-shifted softmax) so this cell does not depend on the
        # `_sample_from_log_probs` helper, which is defined further down the notebook
        # and is therefore missing on a fresh top-to-bottom run.
        probs = np.exp(log_scores - np.max(log_scores))
        probs /= probs.sum()
        z_new = np.random.choice(len(log_scores), p=probs)

        # Record the new class. Without this the assignments never change and the next
        # sweep would remove the word type from a class it no longer belongs to.
        x_wordtype_class_assignments[ix_wordtype] = z_new

        # --- add the word type to its new class
        x_class_counts[z_new] += 1
        x_class_wordtype_counts_left[z_new] += counts_left
        x_class_wordtype_counts_right[z_new] += counts_right
        x_class_wordtype_counts_sum_left[z_new] += total_left
        x_class_wordtype_counts_sum_right[z_new] += total_right
        

In [12]:
def _sample_from_log_probs(logp):
    """Sample a categorical outcome from unnormalized log-probabilities."""
    m = np.max(logp)
    p = np.exp(logp - m)
    p /= p.sum()
    return np.random.choice(len(logp), p=p)

# Smoke check: draw a single class index from the current `log_scores`.
# NOTE(review): `log_scores` is only assigned inside the Gibbs sampling loop cell, so this
# line needs that cell to have run first.
_sample_from_log_probs(log_scores)

1