# Imports

In [None]:
import os
import unicodedata
from collections import Counter
from glob import glob
from operator import itemgetter
from typing import List

import joblib
import numpy as np
import regex as re

# Functions

In [None]:
def tokenize(text: str) -> List[str]:
    """Split *text* into word tokens.

    A token begins at a word boundary with a Unicode letter (``\\p{L}``) and
    continues over any run of letters, combining marks, numbers, apostrophes
    (straight ``'`` or curly ``’``) and hyphens, ending at a word boundary.
    The input is used as given: no lower-casing or other cleaning is applied.

    Args:
        text: The string to tokenize.

    Returns:
        The non-overlapping matches, in order of appearance.
    """
    token_pattern = r"\b\p{L}[\p{L}\p{M}\p{N}'’-]*\b"
    return re.findall(token_pattern, text, flags=re.UNICODE)

# Preprocess

In [3]:
# Frequency of every word type across the whole raw corpus.
counter = Counter()

# glob() result order depends on the file system. Sorting the paths makes the
# Counter insertion order -- and therefore the word-type indices and the
# tie-breaking inside most_common() -- reproducible from run to run.
# NOTE(review): files are decoded with the platform default encoding; pass
# encoding=... explicitly once the corpus encoding is confirmed.
for corpus_path in sorted(glob('../data/raw/*.txt')):
    # `with` guarantees the handle is closed, even if tokenizing fails.
    with open(corpus_path, 'r') as corpus_file:
        for line in corpus_file:
            counter.update(tokenize(line))

# Context features: the most frequent word types plus the two boundary markers.
n_most_common_wordtypes = 100
n_features = n_most_common_wordtypes + 2
most_common_wordtypes = [
    wordtype for wordtype, _count in counter.most_common(n_most_common_wordtypes)
] + ['<BOS>', '<EOS>']

# Feature (context word) -> column index.
feature_to_ix = {wordtype: ix for ix, wordtype in enumerate(most_common_wordtypes)}

# Every observed word type <-> row index, numbered in first-seen order.
wordtype_to_ix = {wordtype: ix for ix, wordtype in enumerate(counter)}
ix_to_wordtype = {ix: wordtype for wordtype, ix in wordtype_to_ix.items()}
n_wordtypes = len(wordtype_to_ix)

## Featurise

In [4]:
# For every word type (row), count how often each feature word (column) occurs
# immediately to its left / right. One extra trailing column collects all
# neighbours that are not features; it is dropped again below.
ignored_ix = n_features
x_wordtype_counts_left = np.zeros((n_wordtypes, n_features + 1), dtype=int)
x_wordtype_counts_right = np.zeros((n_wordtypes, n_features + 1), dtype=int)

for corpus_path in glob('../data/raw/*.txt'):
    # `with` guarantees the handle is closed once the file has been processed.
    with open(corpus_path, 'r') as corpus_file:
        for line in corpus_file:
            tokens = ['<BOS>'] + tokenize(line) + ['<EOS>']
            # Slide over (previous, current, next) triples; the boundary
            # markers only ever act as neighbours, never as the current token.
            for prev_token, token, next_token in zip(tokens, tokens[1:], tokens[2:]):
                row = wordtype_to_ix[token]
                x_wordtype_counts_left[row, feature_to_ix.get(prev_token, ignored_ix)] += 1
                x_wordtype_counts_right[row, feature_to_ix.get(next_token, ignored_ix)] += 1

# remove ignored words
x_wordtype_counts_left = x_wordtype_counts_left[:, :n_features]  # M (n_wordtypes) x F (n_features)
x_wordtype_counts_right = x_wordtype_counts_right[:, :n_features]  # M (n_wordtypes) x F (n_features)

# x_wordtype_counts_sum_left = x_wordtype_counts_left.sum(axis=1)  # M (n_wordtypes)
# x_wordtype_counts_sum_right = x_wordtype_counts_right.sum(axis=1)  # M (n_wordtypes)

# Sanity-check each matrix on its own.
assert x_wordtype_counts_left.shape == (n_wordtypes, n_features)
assert x_wordtype_counts_right.shape == (n_wordtypes, n_features)
# assert x_wordtype_counts_sum_left.shape == (n_wordtypes,)
# assert x_wordtype_counts_sum_right.shape == (n_wordtypes,)

# Export artifacts

In [6]:
path_output = 'outputs/preprocess'
# Create the output directory (and any missing parents) without shelling out,
# so the cell also runs as plain Python and on platforms without `mkdir -p`.
os.makedirs(path_output, exist_ok=True)

# Each artifact is written to '<path_output>/<name>.joblib'.
artifacts = {
    'wordtype_to_ix': wordtype_to_ix,
    'ix_to_wordtype': ix_to_wordtype,
    'counter': counter,
    'x_wordtype_counts_left': x_wordtype_counts_left,
    'x_wordtype_counts_right': x_wordtype_counts_right,
}
for artifact_name, artifact in artifacts.items():
    _ = joblib.dump(artifact, f'{path_output}/{artifact_name}.joblib')