In [None]:
!pip install numpy pandas matplotlib scipy gensim spacy scikit-learn ipywidgets

In [None]:
import json

from pathlib import Path
from functools import partial
from collections import Counter
from bisect import bisect_left

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import coo_array
from tqdm.notebook import tqdm
from spacy.lang.de import German
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import pairwise_distances

import ipywidgets as widgets

In [None]:
def iter_docs(in_path):
    """Stream the records of a JSON-lines file, one parsed object per line.

    Parameters
    ----------
    in_path : str or path-like
        File containing one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line.

    Notes
    -----
    The file is always decoded as UTF-8 so the result does not depend on the
    platform's default locale. When the module-level ``DEBUG`` flag is true,
    reading stops once a record has been yielded from a line with index
    greater than 10.
    """
    with open(in_path, "r", encoding="utf-8") as in_file:
        for iline, line in tqdm(enumerate(in_file), desc="load documents"):
            if not line.strip():
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            yield json.loads(line)
            if iline > 10 and DEBUG:
                break
            
            
def iter_sents(docs):
    """Split every document into sentences and yield one record per sentence.

    Each yielded dict carries the sentence as a list of token strings under
    ``'text'``, its position within the document under ``'sentence_id'``, and
    every other key of the source document unchanged. When the module-level
    ``DEBUG`` flag is true, a document is abandoned after a sentence with
    index greater than 100 has been yielded.
    """
    for doc in docs:
        sentences = nlp(doc['text']).sents
        for isent, sent in tqdm(enumerate(sentences), desc="load sentences"):
            record = {
                'text': [token.text for token in sent],
                'sentence_id': isent,
            }
            # Document metadata goes last, exactly like the ** unpacking it replaces.
            record.update({key: value for key, value in doc.items() if key != 'text'})
            yield record
            if DEBUG and isent > 100:
                break
                
def iter_chunks(docs, n_tokens=100):
    """Cut every document into consecutive windows of ``n_tokens`` tokens.

    Each yielded dict holds the window's token strings under ``'text'`` plus
    every other key of the source document. The last window of a document may
    be shorter. When the module-level ``DEBUG`` flag is true, a document is
    abandoned after a chunk with index greater than 100 has been yielded.
    """
    for doc in docs:
        tokenized = [token.text for token in nlp(doc['text'])]
        window_starts = range(0, len(tokenized), n_tokens)
        for ichunk, start in enumerate(window_starts):
            chunk = {'text': tokenized[start:start + n_tokens]}
            chunk.update({key: value for key, value in doc.items() if key != 'text'})
            yield chunk
            if DEBUG and ichunk > 100:
                break

In [None]:
# Location of the preprocessed corpus (one JSON document per line) and the
# switch that makes the iter_* helpers stop early.
preprocessed_data_fn = Path("processed_data", "eltec.jsonl")
DEBUG = False

# Blank German pipeline with a sentence splitter as its only component.
nlp = German()
nlp.add_pipe("sentencizer")

# Raise the pipeline's length limit so that it exceeds the longest document.
for doc in iter_docs(preprocessed_data_fn):
    nlp.max_length = max(nlp.max_length, len(doc['text']) + 1)

## One-Hot

In [None]:
# Count every token of the corpus and keep the 10000 most frequent ones.
vocab, counts = zip(
    *Counter(
        token.text
        for doc in iter_docs(preprocessed_data_fn)
        for token in nlp(doc['text'])
    ).most_common(10000)
)
# get_index() bisects this list, so it has to be sorted. Note that `counts`
# keeps its most-common order and is no longer aligned with `vocab` afterwards.
vocab = sorted(vocab)

In [None]:
def get_index(query, vocab):
    """Return the leftmost position in the sorted list ``vocab`` equal to ``query``.

    Raises
    ------
    ValueError
        If ``query`` does not occur in ``vocab``.
    """
    position = bisect_left(vocab, query)
    is_hit = position < len(vocab) and vocab[position] == query
    if not is_hit:
        raise ValueError
    return position

def iter_embed_1hot(vocab, docs):
    """Yield one sparse one-hot matrix per document.

    Parameters
    ----------
    vocab : sorted list of str
        Vocabulary; a token's column is its position as found by ``get_index``.
    docs : iterable of dict
        Records whose ``'text'`` entry is an iterable of token strings.

    Yields
    ------
    scipy.sparse.coo_array
        Shape ``(number of tokens, len(vocab))``. Row ``i`` has a single 1 in
        the column of token ``i``; rows of out-of-vocabulary tokens stay empty.

    Notes
    -----
    The row count is tracked per document, so an empty document produces a
    ``(0, len(vocab))`` array instead of reusing the previous document's
    length (or failing when it is the first one).
    """
    for doc in docs:
        rows = []
        cols = []
        n_tokens = 0
        for itoken, token in enumerate(doc['text']):
            n_tokens = itoken + 1
            try:
                token_index = get_index(token, vocab)
            except ValueError:
                # Out-of-vocabulary token: deliberately left as an all-zero row.
                continue
            rows.append(itoken)
            cols.append(token_index)
        yield coo_array(
            (
                # Explicit integer dtype keeps empty documents consistent with the rest.
                np.ones(len(rows), dtype=int),
                (
                    np.array(rows, dtype=int),
                    np.array(cols, dtype=int),
                ),
            ),
            shape=(n_tokens, len(vocab)),
        )

# Preview the one-hot matrix of the first sentence. next() raises StopIteration
# on an empty corpus instead of silently displaying a stale `embedding`.
embedding = next(iter_embed_1hot(vocab, iter_sents(iter_docs(preprocessed_data_fn))))
embedding

## Bag of Words

In [None]:
def iter_embed_bow(vocab, docs):
    """Yield one bag-of-words count vector per document.

    Every one-hot matrix from ``iter_embed_1hot`` is summed over its token
    axis (axis 0), leaving one count per vocabulary entry.
    """
    yield from (
        token_matrix.sum(axis=0)
        for token_matrix in iter_embed_1hot(vocab, docs)
    )

# Preview the bag-of-words vector of the first sentence. next() raises
# StopIteration on an empty corpus instead of showing a stale `embedding`.
embedding = next(iter_embed_bow(vocab, iter_sents(iter_docs(preprocessed_data_fn))))
embedding

## Word2Vec

In [None]:
class W2VDataset:
    """Re-iterable stream of token lists built from a generator factory.

    Parameters
    ----------
    generator_factory : callable
        Called with no arguments at the start of every pass. It must return a
        *fresh* iterable of dicts with a ``'text'`` entry each time; a factory
        that hands back an already consumed generator yields nothing on later
        passes.

    Attributes
    ----------
    length : int or None
        Number of items seen during the most recent complete pass, or ``None``
        before any pass has finished.
    """

    def __init__(self, generator_factory):
        self.generator_factory = generator_factory
        self.length = None

    def __iter__(self):
        """Yield the ``'text'`` entry of every record and remember the count."""
        count = 0
        for record in self.generator_factory():
            yield record['text']
            count += 1
        # Only reached when the pass ran to completion; an empty source now
        # correctly records 0 instead of 1.
        self.length = count

    def __len__(self):
        """Return the number of items, running one full pass if still unknown."""
        if self.length is None:
            # Exhausting __iter__ stores the count without holding all items in memory.
            for _ in self:
                pass
        return self.length
        

def train_w2v(vector_size=50, window=5, min_count=2, max_vocab_size=20000, epochs=2):
    """Train Word2Vec on the corpus sentences and return the vectors as a table.

    Parameters
    ----------
    vector_size, window, min_count, max_vocab_size
        Forwarded unchanged to ``gensim.models.Word2Vec``.
        NOTE(review): ``max_vocab_size`` is presumably gensim's limit during
        vocabulary building rather than the final vocabulary size - confirm
        against the gensim documentation.
    epochs : int
        Number of training passes over the corpus.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary word (index, sorted) holding its vector.

    Notes
    -----
    The dataset factory re-opens the corpus on every call. Binding a single
    ``iter_docs(...)`` generator (as ``partial(iter_sents, iter_docs(...))``
    does) lets only the first pass see data, so every training pass after the
    vocabulary scan would be empty. Training also happens exactly once, inside
    the constructor, for ``epochs`` passes, instead of running the
    constructor's default epochs followed by a second ``train`` call.
    """
    w2vdataset = W2VDataset(
        lambda: iter_sents(iter_docs(preprocessed_data_fn))
    )

    # Passing `sentences` makes the constructor build the vocabulary and train.
    model = Word2Vec(
        sentences=w2vdataset,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        epochs=epochs,
        workers=4
    )

    wv = model.wv

    # Sorted index keeps the row order deterministic and alphabetical.
    vocab = sorted(wv.index_to_key)
    vecs = [wv[word] for word in vocab]

    return pd.DataFrame(data=vecs, index=vocab)

def iter_embed_w2v(vecs, docs):
    """Yield the mean word vector of every document.

    Parameters
    ----------
    vecs : pandas.DataFrame
        Word vectors, one row per word, indexed by the word.
    docs : iterable of dict
        Records whose ``'text'`` entry is an iterable of token strings.

    Yields
    ------
    numpy.ndarray
        1-D vector with ``vecs.shape[1]`` entries: the average over all tokens
        that have a row in ``vecs``. Documents without any embedded token
        (including empty ones) yield an all-zero vector, so the output stays
        aligned with ``docs`` instead of failing in ``np.vstack``.
    """
    known_words = vecs.index
    n_dims = vecs.shape[1]
    for doc in docs:
        # Tokens without a vector are skipped; repeated tokens count repeatedly.
        embedded = [token for token in doc['text'] if token in known_words]
        if embedded:
            yield vecs.loc[embedded].to_numpy().mean(axis=0)
        else:
            yield np.zeros(n_dims)
            
# Train embeddings with train_w2v's default hyper-parameters; the result is a
# DataFrame of word vectors indexed by word.
w2vs = train_w2v()

In [None]:
# Preview the averaged word vector of the first sentence. next() raises
# StopIteration on an empty corpus instead of showing a stale `embedding`.
embedding = next(iter_embed_w2v(w2vs, iter_sents(iter_docs(preprocessed_data_fn))))
embedding

## Authorship attribution

In [None]:
def train_simple_classifier(iterator, label_string, random_state=13198):
    """Fit a bag-of-words logistic regression and return its macro F1 score.

    Parameters
    ----------
    iterator : iterable of dict
        Records with a tokenised ``'text'`` entry and a ``label_string`` entry.
        It is consumed completely and held in memory.
    label_string : str
        Key of the field to predict.
    random_state : int, optional
        Seed for both the train/test split and the classifier, so repeated
        runs give the same score. The split was previously unseeded.

    Returns
    -------
    float
        Macro-averaged F1 score on the held-out part of the data.

    Notes
    -----
    Features are built with the module-level ``vocab``.
    """
    data = list(iterator)
    labels = [record[label_string] for record in data]
    features = np.vstack(list(iter_embed_bow(vocab, data)))

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, random_state=random_state
    )

    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("classifier", LogisticRegression(random_state=random_state)),
    ])
    clf.fit(train_features, train_labels)

    pred_labels = clf.predict(test_features)

    return f1_score(test_labels, pred_labels, average='macro')

In [None]:
# Baseline: macro F1 for predicting 'author-name' from iter_chunks' default
# 100-token chunks.
train_simple_classifier(iter_chunks(iter_docs(preprocessed_data_fn)), 'author-name')

In [None]:
chunk_sizes = [10, 100, 200, 500, 1000]
scores = []

# One full load / train / evaluate cycle per chunk size; scores[i] belongs to
# chunk_sizes[i].
for chunk_size in chunk_sizes:
    iterator = iter_chunks(iter_docs(preprocessed_data_fn), n_tokens=chunk_size)
    scores += [train_simple_classifier(iterator, 'author-name')]

In [None]:
# Macro F1 as a function of the chunk size used for the classifier.
fig, ax = plt.subplots()
ax.plot(chunk_sizes, scores, label="f1_score")
ax.set(xlabel="n_tokens", ylabel="f1_score")
plt.show()

## Visualize 3 dimensions with barycentric triangles

In [None]:
def triangle(coordinates, labels, ax=None, axis_labels=("", "", ""), colors=None):
    """Plot points given in barycentric coordinates inside an equilateral triangle.

    Parameters
    ----------
    coordinates : iterable of numpy.ndarray
        One length-3 weight vector per point. Each vector is divided by its
        sum before plotting; an all-zero vector is placed at the centroid.
    labels : iterable of str
        Text drawn next to each point, in the same order as ``coordinates``.
    ax : matplotlib Axes, optional
        Axes to draw on. A new 5x5 inch figure is created when omitted.
    axis_labels : sequence of str, optional
        Names written at the three corners, in the order of the weights.
    colors : sequence, optional
        One colour per point; every point is blue (``'b'``) when omitted.

    Returns
    -------
    matplotlib Axes
        The axes that were drawn on.

    Notes
    -----
    All drawing goes through ``ax``. Corner labels and axis settings were
    previously sent to pyplot's current axes, which is only the same object
    when ``ax`` happens to be the most recently created axes.
    """
    if colors is None:
        colors = ['b'] * len(coordinates)

    # Rows are the Cartesian positions of the three corners.
    vertices = np.array([[0, 0], [1, 0], [0.5, 0.75**0.5]])

    cartesian_coordinates = []
    for barycentric_coordinates in coordinates:
        if np.abs(barycentric_coordinates).sum() == 0:
            # No information at all: fall back to equal weights (the centroid).
            barycentric_coordinates = np.ones(3)
        barycentric_coordinates = barycentric_coordinates / barycentric_coordinates.sum()
        # Weighted average of the corners.
        cartesian_coordinates.append(vertices.T.dot(barycentric_coordinates))

    if ax is None:
        _, ax = plt.subplots(figsize=(5, 5))

    # Outline: one black segment per edge.
    for edge in ([0, 1], [1, 2], [2, 0]):
        ax.plot(*vertices[edge].T, color="black")

    for label, vec, color in zip(labels, cartesian_coordinates, colors):
        ax.annotate(label, vec, c=color)
        ax.plot(*vec, "x", c=color)

    # Small offsets keep the corner captions off the outline.
    offsets = [(-.04, 0), (.01, 0), (-.012, .01)]
    for axis_label, vertex, offset in zip(axis_labels, vertices, offsets):
        ax.annotate(axis_label, vertex + np.array(offset))

    ax.axis('off')
    ax.axis('equal')

    return ax

In [None]:
#@title showcase baricentric triangle
# The three values below are the barycentric weights of one demo point;
# triangle() divides them by their sum before plotting.
a_axis = 1/3 #@param {type:"number", min:0, max:1, step:0.1}
b_axis = 1/5 #@param {type:"number", min:0, max:1, step:0.1}
c_axis = 1/99 #@param {type:"number", min:0, max:1, step:0.1}

# A single row, because triangle() expects one weight vector per point.
triangle(
    np.array([[a_axis,b_axis,c_axis]]),
    ['test'],
    axis_labels=["a", "b", "c"]
)
plt.show()

In [None]:
def get_label(rec):
    """Build a short ``author_title`` tag for a record.

    The author name is cut to 15 characters and the book title to 10; in both
    parts commas are removed and spaces become hyphens.
    """
    cleanup = str.maketrans({",": None, " ": "-"})
    author_part = rec['author-name'][:15].translate(cleanup)
    title_part = rec['book-title'][:10].translate(cleanup)
    return author_part + "_" + title_part

def get_color(rec):
    """Return the plot colour for a record: ``'y'`` when ``'gender-cat'`` is ``'m'``, else ``'g'``."""
    if rec['gender-cat'] == 'm':
        return "y"
    return 'g'

# One row per document: three numeric fields followed by a display label and
# a colour code.
# NOTE(review): the rows mix numbers and strings, so NumPy presumably stores
# every field as text here, hence the astype(int) below - confirm.
vals = np.array([
    (
        it['Author-birth'],
        it['Author-death'],
        it['length'],
        get_label(it),
        get_color(it)
    )
    for it in iter_docs(preprocessed_data_fn)
])

# Split the table: all but the last two columns become integers, the last two
# are kept as labels and colours. `vals` is rebound to the numeric part.
vals, labels, colors = vals[:,:-2].astype(int), vals[:,-2], vals[:,-1]

fig, ax = plt.subplots(figsize=(10,10))

# Min-max scale every column to [0, 1] so the three fields are comparable.
# NOTE(review): a column with a single distinct value divides by zero here.
vals = vals-vals.min(axis=0)
vals = vals/vals.max(axis=0)

# Empty strings are passed as point labels, so only the markers are drawn.
_ = triangle(
    vals,
    ['' for it in labels],
    axis_labels=['Author-birth', 'Author-death', "length"],
    ax=ax,
    colors=colors
)

## Analogy tests

(using word lists from https://github.com/devmount/GermanWordEmbeddings#evaluation)

In [None]:
!git clone https://github.com/devmount/GermanWordEmbeddings.git

In [None]:
def queries_from_lists(list_):
    """Turn rows of related words into parallel query / answer lists.

    Every word of every row becomes a query; its answers are all words of the
    same row that differ from it, in row order.

    Returns
    -------
    tuple of (list, list of list)
        ``queries`` and ``answers``, where ``answers[i]`` belongs to ``queries[i]``.
    """
    queries = [word for row in list_ for word in row]
    answers = [
        [other for other in row if other != word]
        for row in list_
        for word in row
    ]
    return queries, answers


def _read_word_rows(path, skip_marker=None):
    """Read a word-list file into rows of '-'-separated words, one row per non-blank line."""
    # read_text closes the file; UTF-8 is explicit because the lists are German.
    lines = Path(path).read_text(encoding="utf-8").split("\n")
    rows = []
    for line in lines:
        if not line.strip():
            # Blank lines (e.g. after the final newline) would otherwise
            # become a bogus [''] row and later an empty-string query.
            continue
        if skip_marker is not None and skip_marker in line:
            continue
        rows.append(line.split("-"))
    return rows


def load_syntactic_analogies(base_path="GermanWordEmbeddings/src"):
    """Load the adjective, noun and verb word lists as queries and answers.

    Parameters
    ----------
    base_path : str or path-like, optional
        Directory containing ``adjectives.txt``, ``nouns.txt`` and ``verbs.txt``.

    Returns
    -------
    tuple
        ``(queries, answers)`` as produced by ``queries_from_lists``.
    """
    base_path = Path(base_path)

    rows = []
    for filename in ("adjectives.txt", "nouns.txt", "verbs.txt"):
        rows.extend(_read_word_rows(base_path / filename))

    return queries_from_lists(rows)


def load_semantic_analogies(base_path="GermanWordEmbeddings/src"):
    """Load the best-match and opposite word lists as queries and answers.

    Lines of ``bestmatch.txt`` that contain ``':'`` are skipped
    (NOTE(review): presumably section headers - confirm against the file).

    Parameters
    ----------
    base_path : str or path-like, optional
        Directory containing ``bestmatch.txt`` and ``opposite.txt``.

    Returns
    -------
    tuple
        ``(queries, answers)`` as produced by ``queries_from_lists``.
    """
    base_path = Path(base_path)

    bestmatch = _read_word_rows(base_path / "bestmatch.txt", skip_marker=":")
    opposite = _read_word_rows(base_path / "opposite.txt")

    return queries_from_lists(bestmatch + opposite)


def evaluate(embeddings, queries, answers, n_candidates=20):
    """Measure how often a query's related words are among its nearest neighbours.

    Parameters
    ----------
    embeddings : pandas.DataFrame
        Word vectors, one row per word, indexed by the word.
    queries : sequence of str
        Words to look up.
    answers : sequence of list of str
        ``answers[i]`` holds the words expected near ``queries[i]``.
    n_candidates : int, optional
        Number of nearest neighbours (cosine distance) inspected per query.
        The query word itself is excluded, so it no longer takes up one of
        the candidate slots.

    Returns
    -------
    dict
        ``"found_queries"``: share of queries that have an embedding.
        ``"found_answers"``: share of the expected answers of those queries
        that appear among the neighbours. Either value is ``0.0`` when its
        denominator is zero.

    Notes
    -----
    Distances are computed only from the queries present in the vocabulary to
    all words, not between every pair of vocabulary words.
    """
    vocabulary = embeddings.index
    num_total_queries = len(queries)

    # Keep only the queries that actually have a vector.
    known = [
        (query, set(expected))
        for query, expected in zip(queries, answers)
        if query in vocabulary
    ]
    num_found_queries = len(known)
    num_found_answers = 0
    num_findable_answers = 0

    if known:
        # A query may occur in several rows; compute its distances only once.
        distinct_queries = list(dict.fromkeys(query for query, _ in known))
        pw_distances = pd.DataFrame(
            data=pairwise_distances(
                embeddings.loc[distinct_queries].values,
                embeddings.values,
                metric='cosine',
            ),
            index=distinct_queries,
            columns=vocabulary,
        )

        for query, expected in known:
            nearest = pw_distances.loc[query].drop(query).sort_values()[:n_candidates]
            num_findable_answers += len(expected)
            num_found_answers += len(expected.intersection(nearest.index))

    return {
        "found_queries": num_found_queries / num_total_queries if num_total_queries else 0.0,
        "found_answers": num_found_answers / num_findable_answers if num_findable_answers else 0.0,
    }
# Load both analogy word lists once; the evaluation cells below reuse them.
syntactic_queries, syntactic_answers = load_syntactic_analogies()
semantic_queries, semantic_answers = load_semantic_analogies()

In [None]:
# Small configuration: 50-dimensional vectors, 2 epochs.
embeddings = train_w2v(
    vector_size=50,
    window=5,
    min_count=2,
    max_vocab_size=10000,
    epochs=2
)

# Print the benchmark name followed by its score dict, syntactic first.
for analogy_name, analogy_queries, analogy_answers in (
    ("syntactic", syntactic_queries, syntactic_answers),
    ("semantic", semantic_queries, semantic_answers),
):
    print(analogy_name)
    print(evaluate(embeddings, analogy_queries, analogy_answers))

In [None]:
# Larger configuration: 300-dimensional vectors, wider window, 5 epochs.
embeddings = train_w2v(
    vector_size=300,
    window=7,
    min_count=10,
    max_vocab_size=20000,
    epochs=5
)

# Print the benchmark name followed by its score dict, syntactic first.
for analogy_name, analogy_queries, analogy_answers in (
    ("syntactic", syntactic_queries, syntactic_answers),
    ("semantic", semantic_queries, semantic_answers),
):
    print(analogy_name)
    print(evaluate(embeddings, analogy_queries, analogy_answers))