In [None]:
%%capture
#@title Install dependencies { display-mode: "form" }
!pip install numpy pandas matplotlib scipy scikit-learn
!pip install gensim spacy --upgrade

In [None]:
%%capture
#@title Clone our repository that contains the data { display-mode: "form" }
# Fetch the course repository and make its code/ directory the working directory.
# Later cells use paths relative to that directory (e.g. processed_data/eltec.jsonl).
# NOTE(review): this does not look idempotent -- re-running the cell in the same
# session clones again relative to the already-changed working directory.
!git clone https://github.com/millawell/textexplorationen-in-der-digitalen-literaturwissenschaft
%cd textexplorationen-in-der-digitalen-literaturwissenschaft/code/

In [None]:
#@title import dependencies and load data { display-mode: "form" }
#@markdown should a subset of the corpus be used?
SUBSET = True #@param {type:"boolean"}
import json

from pathlib import Path
from functools import partial
from collections import Counter, defaultdict
from bisect import bisect_left

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix
from tqdm.notebook import tqdm
from spacy.lang.de import German
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.metrics import pairwise_distances

def iter_docs(in_path):
    """Yield one parsed JSON document per line of a JSON-lines file.

    Args:
        in_path: path of the file; every line must contain one JSON value.

    Yields:
        The decoded object of each line, in file order. While the module-level
        ``SUBSET`` flag is truthy, iteration stops after the document at line
        index 21 has been yielded (at most 22 documents).
    """
    # Decode explicitly as UTF-8 so that non-ASCII characters are read the same
    # way on every platform instead of depending on the locale's default encoding.
    with open(in_path, "r", encoding="utf-8") as in_file:
        for iline, line in tqdm(enumerate(in_file), desc="load documents"):
            yield json.loads(line)
            if iline > 20 and SUBSET:
                break
            
            
def iter_sents(docs):
    """Split every document into sentences and yield them one by one.

    Args:
        docs: iterable of dicts with a ``'text'`` entry that is passed to the
            module-level ``nlp`` pipeline.

    Yields:
        One dict per sentence holding the sentence's token strings under
        ``'text'``, its position within the document under ``'sentence_id'``
        and all other entries of the source document. While ``SUBSET`` is
        truthy, a document is abandoned after the sentence with index 201.
    """
    for doc in docs:
        metadata = {key: value for key, value in doc.items() if key != 'text'}
        sentences = nlp(doc['text']).sents
        for isent, sent in tqdm(enumerate(sentences), desc="load sentences"):
            record = {
                'text': [token.text for token in sent],
                'sentence_id': isent,
            }
            # Document metadata comes last, so it wins on a key clash.
            record.update(metadata)
            yield record
            if SUBSET and isent > 200:
                break
                
def iter_chunks(docs, n_tokens=100):
    """Cut every document into consecutive chunks of ``n_tokens`` tokens.

    Args:
        docs: iterable of dicts with a ``'text'`` entry that is tokenized with
            the module-level ``nlp`` pipeline.
        n_tokens: number of tokens per chunk; the last chunk of a document may
            be shorter.

    Yields:
        One dict per chunk with the chunk's token strings under ``'text'`` plus
        all other entries of the source document. While ``SUBSET`` is truthy,
        a document is abandoned after its chunk with index 101.
    """
    for doc in docs:
        tokens = [token.text for token in nlp(doc['text'])]
        metadata = {key: value for key, value in doc.items() if key != 'text'}
        chunk_starts = range(0, len(tokens), n_tokens)
        for chunk_number, start in enumerate(chunk_starts):
            chunk = {'text': tokens[start:start + n_tokens]}
            chunk.update(metadata)
            yield chunk
            if SUBSET and chunk_number > 100:
                break

preprocessed_data_fn = Path("processed_data") / "eltec.jsonl"

nlp = German()
nlp.add_pipe("sentencizer")

# Parse the corpus file once and reuse the documents both for sizing the
# pipeline and for the overview table, instead of reading the file twice.
loaded_docs = list(iter_docs(preprocessed_data_fn))

# spaCy limits the length of the texts it will process to `nlp.max_length`
# characters, so raise the limit just above the longest document if needed.
longest_text = max((len(doc['text']) for doc in loaded_docs), default=0)
if longest_text >= nlp.max_length:
    nlp.max_length = longest_text + 1

all_data = pd.DataFrame(loaded_docs)
all_data

# 1. Representations
## One-Hot

In [None]:
#@title This cell outputs a one-hot encoding of the first sentence of the first document in the corpus. First, it iterates over the whole corpus to create a vocabulary. As you can see, the output is mostly zeros (*sparse*).{ display-mode: "form" }

# Count every token of the corpus, keep the 10000 most frequent ones and store
# them alphabetically sorted so that they can be looked up by binary search.
token_counts = Counter(
    token.text
    for doc in iter_docs(preprocessed_data_fn)
    for token in nlp(doc['text'])
)
vocab, counts = zip(*token_counts.most_common(10000))
vocab = sorted(vocab)

def get_index(query, vocab):
    """Return the position of ``query`` in the sorted sequence ``vocab``.

    The lookup is a binary search, so ``vocab`` has to be sorted ascending.
    If the value occurs several times, the leftmost position is returned.

    Raises:
        ValueError: if ``query`` is not contained in ``vocab``.
    """
    position = bisect_left(vocab, query)
    if position == len(vocab) or vocab[position] != query:
        raise ValueError
    return position

def iter_embed_1hot(vocab, docs):
    """Yield a sparse one-hot matrix for every document.

    Args:
        vocab: sorted sequence of known tokens (searched with ``get_index``).
        docs: iterable of dicts whose ``'text'`` entry is a sequence of tokens.

    Yields:
        A ``coo_matrix`` of shape ``(len(doc['text']), len(vocab))`` with a 1 at
        ``(token position, vocabulary index)`` for every token found in
        ``vocab``. Rows of tokens that are not in the vocabulary stay empty.
    """
    for doc in docs:
        tokens = doc['text']
        rows = []
        cols = []
        for itoken, token in enumerate(tokens):
            try:
                token_index = get_index(token, vocab)
            except ValueError:
                continue  # out-of-vocabulary token: its row stays empty
            rows.append(itoken)
            cols.append(token_index)
        yield coo_matrix(
            (
                np.ones(len(rows), dtype=int),
                (np.array(rows, dtype=int), np.array(cols, dtype=int)),
            ),
            # Take the row count from the document itself: the loop variable is
            # unbound for an empty first document and stale (left over from the
            # previous document) for any later empty one.
            shape=(len(tokens), len(vocab)),
        )

# Only the first sentence is needed, so pull a single item from the lazy pipeline.
embedding = next(iter_embed_1hot(vocab, iter_sents(iter_docs(preprocessed_data_fn))))

embedding

## Bag of Words

In [None]:
#@title This cell outputs a bag-of-words encoding of the first sentence of the first document in the corpus. Actually, it can be computed by summing over the first axis of the one-hot matrix from above.{ display-mode: "form" }

def iter_embed_bow(vocab, docs):
    """Yield a bag-of-words count vector for every document.

    Each vector is obtained by summing the document's one-hot matrix (see
    ``iter_embed_1hot``) over its first axis, which leaves one count per
    vocabulary entry.

    Args:
        vocab: sorted sequence of known tokens.
        docs: iterable of dicts whose ``'text'`` entry is a sequence of tokens.

    Yields:
        A 1-D array of length ``len(vocab)``.
    """
    for token_matrix in iter_embed_1hot(vocab, docs):
        column_totals = token_matrix.sum(axis=0)
        yield np.array(column_totals).ravel()

# Only the first sentence is needed, so pull a single item from the lazy pipeline.
embedding = next(iter_embed_bow(vocab, iter_sents(iter_docs(preprocessed_data_fn))))
embedding

## Word 2 Vec

In [None]:
#@title This cell trains word2vec embeddings on the whole corpus. It outputs the average embedding of the first sentence of the first document of the corpus.{ display-mode: "form" }

vector_size=100 #@param {type:"number"}
window=5 #@param {type:"number"}
min_count=1 #@param {type:"number"}
max_vocab_size=20000 #@param {type:"number"}
epochs=20 #@param {type:"number"}

class W2VDataset:
    """Re-iterable view on a stream of token lists.

    Every call to ``iter()`` asks ``generator_factory`` for a new iterable, so
    the data can be traversed repeatedly as long as the factory really builds a
    fresh stream each time.

    Args:
        generator_factory: zero-argument callable returning an iterable of
            dicts; the ``'text'`` entry of each dict is yielded.
    """

    def __init__(self, generator_factory):
        self.generator_factory = generator_factory
        # Number of items seen in the most recent complete pass (None = unknown).
        self.length = None

    def __iter__(self):
        count = 0
        for item in self.generator_factory():
            count += 1
            yield item['text']

        # Only reached when the pass ran to the end. Counting explicitly keeps
        # the length at 0 for an empty stream instead of reporting one item.
        self.length = count

    def __len__(self):
        if self.length is None:
            # A full pass stores the count as a side effect of __iter__.
            self.length = sum(1 for _ in self)
        return self.length
        

def train_w2v(vector_size=50, window=5, min_count=2, max_vocab_size=20000, epochs=2):
    """Train word2vec on the corpus sentences and return the vectors as a table.

    Args:
        vector_size: dimensionality of the word vectors.
        window: context window size passed to ``Word2Vec``.
        min_count: minimum frequency passed to ``Word2Vec``.
        max_vocab_size: vocabulary limit passed to ``Word2Vec``.
        epochs: number of training passes over the corpus.

    Returns:
        A ``pandas.DataFrame`` with one row per vocabulary entry (index = word,
        sorted alphabetically) and one column per vector dimension.
    """

    def sentence_stream():
        # Build the whole pipeline anew on every call. Binding an already
        # created iter_docs() generator would leave every pass after the first
        # one empty, because a generator can only be consumed once.
        return iter_sents(iter_docs(preprocessed_data_fn))

    w2vdataset = W2VDataset(sentence_stream)

    # Handing the sentences to the constructor builds the vocabulary and trains
    # the model. The epoch count is passed here so the model is trained exactly
    # `epochs` times, not once with the library default and then again.
    model = Word2Vec(
        sentences=w2vdataset,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        epochs=epochs,
        workers=4
    )

    wv = model.wv
    vocab = sorted(wv.index_to_key)
    vecs = [wv[word] for word in vocab]

    return pd.DataFrame(data=vecs, index=vocab)

def iter_embed_w2v(vecs, docs):
    """Yield the mean word vector of every document.

    Args:
        vecs: ``pandas.DataFrame`` with words as index and one column per
            vector dimension.
        docs: iterable of dicts whose ``'text'`` entry is a sequence of tokens.

    Yields:
        A 1-D array with ``vecs.shape[1]`` entries: the average of the vectors
        of all tokens that have a row in ``vecs``. A document without any
        embedded token yields a zero vector, so the output always stays aligned
        with the input documents.
    """
    for doc in docs:
        token_vectors = []
        for token in doc['text']:
            try:
                token_vectors.append(vecs.loc[token])
            except KeyError:
                pass  # some tokens are not embedded
        if token_vectors:
            yield np.vstack(token_vectors).mean(axis=0)
        else:
            # np.vstack raises on an empty list; fall back to a neutral vector.
            yield np.zeros(vecs.shape[1])
            
# Train with the values chosen in the form above (same order as the signature).
w2vs = train_w2v(vector_size, window, min_count, max_vocab_size, epochs)

# Only the first sentence is needed, so pull a single item from the lazy pipeline.
embedding = next(iter_embed_w2v(w2vs, iter_sents(iter_docs(preprocessed_data_fn))))
embedding

# 2. Authorship attribution



In [None]:
#@title This cell trains a classifier for authorship attribution. It outputs how well the method performs with the given parameters.{ display-mode: "form" }

n_tokens_per_sample=1000 #@param {type:"number"}
embedding_method="bow" #@param ['w2v', 'bow']

def train_simple_classifier(iterator, embedding_method, label_string, random_state=13198):
    """Train a logistic-regression classifier and score it on held-out samples.

    Args:
        iterator: iterable of sample dicts; it is materialized into a list.
        embedding_method: callable that takes the list of samples and yields
            one feature vector per sample.
        label_string: key of the entry in each sample that holds its label.
        random_state: seed used for the train/test split and for the
            classifier, so that repeated runs report the same score.

    Returns:
        The macro-averaged F1 score on the held-out part of the data.
    """
    data = list(iterator)
    labels = [it[label_string] for it in data]
    features = np.vstack(list(embedding_method(data)))

    # Seed the split as well: without it the held-out set changes on every run
    # and the reported score is not reproducible.
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, random_state=random_state
    )

    clf = Pipeline([
            ("scaler", StandardScaler()),
            ("classifier", LogisticRegression(random_state=random_state)),
    ])
    clf = clf.fit(train_features, train_labels)

    pred_labels = clf.predict(test_features)

    return f1_score(test_labels, pred_labels, average='macro')

# Build the samples with the chunk size chosen in the form above.
iterator = iter_chunks(iter_docs(preprocessed_data_fn), n_tokens=n_tokens_per_sample)
# Keep `embedding_method` as the form's string and bind the callable to its own name.
embed_documents = partial(iter_embed_bow, vocab) if embedding_method == 'bow' else partial(iter_embed_w2v, w2vs)
score = train_simple_classifier(
  iterator,  # pass the prepared chunks; a fresh iter_chunks() call would ignore n_tokens_per_sample
  embed_documents,
  'author-name'
)
print(f"classifier reached f1 score of {score} on held-out data.")

# 3. Visualize 3 dimensions with barycentric triangles

In [None]:
#@title With the barycentric triangle, 3-dimensional data ('weights') can be visualized ([Baryzentrische_Koordinaten auf Wikipedia](https://de.wikipedia.org/wiki/Baryzentrische_Koordinaten)). You can change the values and see where the point is plotted to get a feeling for it.
a_axis =  .5#@param {type:"number", min:0, max:1, step:0.1}
b_axis =  .8#@param {type:"number", min:0, max:1, step:0.1}
c_axis = 1e-5 #@param {type:"number", min:0, max:1, step:0.1}

def triangle(coordinates, labels, ax=None, axis_labels=("", "", ""), colors=None):
    """Plot points given by three weights inside an equilateral triangle.

    Args:
        coordinates: iterable of length-3 arrays (one weight per corner). Each
            row is divided by its sum before plotting; a row consisting only of
            zeros is replaced by equal weights.
        labels: one text label per point.
        ax: matplotlib axes to draw on; a new 5x5 inch figure is created when
            omitted.
        axis_labels: texts written next to the three corners.
        colors: one matplotlib color per point (default: ``'b'`` for all).

    Returns:
        The axes that were drawn on.
    """
    if colors is None:
        colors = ['b' for _ in range(len(coordinates))]

    # Columns are the Cartesian positions of the three corners.
    corners = np.transpose(np.array([[0, 0], [1, 0], [0.5, 0.75**0.5]]))

    cartesian_coordinates = []
    for barycentric_coordinates in coordinates:
        if np.abs(barycentric_coordinates).sum() == 0:
            barycentric_coordinates = np.ones(3)
        barycentric_coordinates = barycentric_coordinates / barycentric_coordinates.sum()
        # Weighted sum of the corner positions.
        cartesian_coordinates.append(corners.dot(barycentric_coordinates))

    if ax is None:
        _, ax = plt.subplots(figsize=(5, 5))

    # Outline: one line segment per edge.
    ax.plot(*corners.T[:2].T, color="black")
    ax.plot(*corners.T[1:].T, color="black")
    ax.plot(*corners.T[[2, 0]].T, color="black")

    for label, vec, color in zip(labels, cartesian_coordinates, colors):
        ax.annotate(label, vec, c=color)
        ax.plot(*vec.T, "x", c=color)

    # Draw corner labels and axis settings on `ax` itself; the pyplot functions
    # act on the *current* axes, which need not be the one passed in.
    offsets = [(-.04, 0), (.01, 0), (-.012, .01)]
    for iaxis_label, axis_label in enumerate(axis_labels):
        ax.annotate(axis_label, corners.T[iaxis_label] + np.array(offsets[iaxis_label]))

    ax.axis('off')
    ax.axis('equal')

    return ax

# A single point built from the three form values, labelled 'test'.
demo_point = np.array([[a_axis, b_axis, c_axis]])
triangle(demo_point, ['test'], axis_labels=["a", "b", "c"])
plt.show()

In [None]:
#@title This cell visualizes the corpus data as a barycentric triangle. Two corners can be used for meta-data of the corpus and for the third corner the combined representation of an author can be chosen. All titles will be plotted relative to the distance to this combined author representation. { display-mode: "form" }
first_corner_category = "year" #@param ['Author-birth', 'Author-death', 'year', 'length']
second_corner_category = "length" #@param ['Author-birth', 'Author-death', 'year', 'length']
color_category = "canon" #@param ['gender', 'canon']
#@markdown ---
#@markdown ### 'author representations' corner:
third_corner_author = "Aston, Louise" #@param str
embedding_method="bow" #@param ['w2v', 'bow']
metric = "euclidean" #@param ['euclidean', 'cosine']

docs = list(iter_docs(preprocessed_data_fn))
embedding_method = partial(iter_embed_bow, vocab) if embedding_method == 'bow' else partial(iter_embed_w2v, w2vs)

# The embedding functions walk over doc['text'] item by item, but iter_docs
# yields the raw text as one string (which would be embedded character by
# character). Tokenize every text first.
tokenized_docs = [
    {**doc, 'text': [token.text for token in nlp(doc['text'])]}
    for doc in docs
]
title_representations = np.vstack(list(embedding_method(tokenized_docs)))

author_mask = np.array([doc['author-name'] == third_corner_author for doc in docs])
if not author_mask.any():
    raise ValueError(f"no document by author {third_corner_author!r} found in the corpus")

# Combine all titles of the author into ONE vector. The mean has to be assigned
# (and kept 2-D for pairwise_distances); otherwise every title of the author
# contributes its own row of distances and the ravel below mixes them up.
author_representation = title_representations[author_mask].mean(axis=0, keepdims=True)


pw_distances = 1 - np.ravel(pairwise_distances(
    author_representation,
    title_representations,
    metric=metric
))


def get_label(rec):
    """Build a short plot label of the form ``<author>_<title>``.

    The author name is cut to 15 and the book title to 10 characters; commas
    are removed and spaces replaced by hyphens in both parts.

    Args:
        rec: dict with the string entries ``'author-name'`` and ``'book-title'``.
    """
    def shorten(value, max_chars):
        return value[:max_chars].replace(",", "").replace(" ", "-")

    return shorten(rec['author-name'], 15) + "_" + shorten(rec['book-title'], 10)

def get_color(rec):
    """Pick the plot color of a document from the selected ``color_category``.

    With ``color_category == 'gender'`` the color is ``'y'`` when the record's
    ``'gender-cat'`` is ``'m'``; for any other setting it is ``'y'`` when
    ``'canon-cat'`` is ``'high'``. All remaining documents get ``'g'``.
    """
    if color_category == 'gender':
        highlighted = rec['gender-cat'] == 'm'
    else:
        highlighted = rec['canon-cat'] == 'high'
    return "y" if highlighted else 'g'

# Keep numbers and texts in separate arrays, so that the numeric values do not
# have to be converted to strings and parsed back again.
labels = np.array([get_label(it) for it in docs])
colors = np.array([get_color(it) for it in docs])
vals = np.array([
    (
        float(it[first_corner_category]),
        float(it[second_corner_category]),
        float(pw_distances[i]),
    )
    for i, it in enumerate(docs)
])

fig, ax = plt.subplots(figsize=(10,10))

# Min-max scale every column to [0, 1].
vals = vals-vals.min(axis=0)
value_range = vals.max(axis=0)
# A constant column has range 0; dividing by it would turn the whole column
# into NaN and make the points disappear, so leave such a column at 0.
value_range[value_range == 0] = 1
vals = vals/value_range

_ = triangle(
    vals,
    labels,
    axis_labels=[
                 first_corner_category,
                 second_corner_category,
                 third_corner_author
    ],
    ax=ax,
    colors=colors
)

# 4. Analogy tests

(using word lists from https://github.com/devmount/GermanWordEmbeddings#evaluation)

In [None]:
%%capture
#@title Load and preprocess German analogy test data.  { display-mode: "form" }

# Download the word lists for the analogy tests. The loaders defined in this cell
# read them from GermanWordEmbeddings/src relative to the current working directory.
!git clone https://github.com/devmount/GermanWordEmbeddings.git

def queries_from_lists(list_):
    """Turn rows of related words into (query, answers) pairs.

    Every word of every row becomes a query; its answers are all entries of the
    same row that differ from it.

    Args:
        list_: iterable of rows, each row being a sequence of words.

    Returns:
        A tuple ``(queries, answers)`` of two equally long lists, where
        ``answers[i]`` is the list of answers belonging to ``queries[i]``.
    """
    pairs = [
        (word, [other for other in row if other != word])
        for row in list_
        for word in row
    ]
    queries = [word for word, _ in pairs]
    answers = [others for _, others in pairs]
    return queries, answers


def load_syntactic_analogies():
    """Load the syntactic test words (adjectives, nouns, verbs).

    Each non-blank line of the three word lists is split at ``-`` into a row of
    words; the rows of all files are combined, in the order adjectives, nouns,
    verbs, and converted with ``queries_from_lists``.

    Returns:
        The ``(queries, answers)`` tuple produced by ``queries_from_lists``.
    """
    base_path = Path("GermanWordEmbeddings/src")

    rows = []
    for file_name in ("adjectives.txt", "nouns.txt", "verbs.txt"):
        # read_text opens and closes the file. The encoding is stated
        # explicitly so the result does not depend on the platform default.
        content = (base_path / file_name).read_text(encoding="utf-8")
        # Skip blank lines (e.g. after a trailing newline); they would
        # otherwise become an empty query without any answers.
        rows.extend(line.split("-") for line in content.splitlines() if line.strip())

    return queries_from_lists(rows)
    

def load_semantic_analogies():
    """Load the semantic analogy questions from ``bestmatch.txt``.

    The file is split into groups at every ``"\\n: "``; the first line of each
    group (presumably the group header -- verify against the data file) is
    skipped and every remaining non-blank line is read as a ``first-second``
    pair. Every ordered combination of two different pairs ``a-b`` and ``c-d``
    of a group gives the query ``(a, b, c)`` with the answer ``d``.

    Returns:
        A tuple ``(queries, answers)`` of two equally long lists.
    """
    base_path = Path("GermanWordEmbeddings/src")

    # read_text opens and closes the file; the encoding is given explicitly so
    # the result does not depend on the platform default.
    content = (base_path / "bestmatch.txt").read_text(encoding="utf-8")

    queries = []
    answers = []
    for group in content.split("\n: "):
        # Parse the lines of a group once instead of inside the nested loops,
        # and drop blank lines, which cannot be unpacked into a pair.
        pairs = [line.split("-") for line in group.split("\n")[1:] if line.strip()]
        for pair_i in pairs:
            for pair_j in pairs:
                if pair_i != pair_j:
                    a, b = pair_i
                    c, d = pair_j

                    queries.append((a, b, c))
                    answers.append(d)
    return queries, answers


def evaluate_most_similar(embeddings, queries, answers, n_candidates=20):
    """Check how many expected words are among the nearest neighbours of a query.

    Args:
        embeddings: ``pandas.DataFrame`` with words as index and one column per
            vector dimension.
        queries: sequence of query words.
        answers: for every query, the collection of words expected close to it.
        n_candidates: number of nearest neighbours (by cosine distance)
            inspected per query.

    Returns:
        A dict with ``"found_queries"``, the share of queries that have an
        embedding, and ``"found_answers"``, the share of expected words of
        those queries that occur among their nearest neighbours. A share whose
        denominator is zero is reported as ``0.0``.
    """
    pw_distances = pd.DataFrame(
        data=pairwise_distances(embeddings.values, metric='cosine'),
        columns=embeddings.index,
        index=embeddings.index
    )

    num_found_queries = 0
    num_total_queries = len(queries)
    num_found_answers = 0
    num_findable_answers = 0

    # `expected` avoids re-binding the `answers` parameter inside the loop.
    for query, expected in zip(queries, answers):
        try:
            found_row = pw_distances.loc[query]
        except KeyError:
            continue  # the query word has no embedding

        # The query is its own nearest neighbour; remove it so that it does not
        # take up one of the candidate slots.
        candidates = set(
            found_row.drop(query).sort_values()[:n_candidates].index.tolist()
        )
        expected = set(expected)

        num_found_queries += 1
        num_findable_answers += len(expected)
        num_found_answers += len(candidates & expected)

    # Guard the ratios: with a small vocabulary it is quite possible that no
    # query word is embedded at all.
    return {
        "found_queries": num_found_queries/num_total_queries if num_total_queries else 0.0,
        "found_answers": num_found_answers/num_findable_answers if num_findable_answers else 0.0
    }

def evaluate_4_analogy(embeddings, queries, answers, n_candidates=20):
    """Evaluate four-term analogies "q1 is to q2 as q3 is to answer".

    For every query the target vector ``q2 - q1 + q3`` is built and the
    ``n_candidates`` words with the largest dot product with it are collected;
    the query counts as solved when the expected answer is among them.

    Args:
        embeddings: ``pandas.DataFrame`` with words as index and one column per
            vector dimension.
        queries: sequence of ``(q1, q2, q3)`` word triples.
        answers: the expected fourth word for every query.
        n_candidates: number of top-scoring words inspected per query.

    Returns:
        A dict with ``"found_queries"``, the share of queries whose three words
        all have an embedding, and ``"found_answers"``, the share of those
        queries that were solved. A share whose denominator is zero is
        reported as ``0.0``.
    """
    word_matrix = embeddings.values

    num_found_queries = 0
    num_total_queries = len(queries)
    num_found_answers = 0

    for (q1, q2, q3), answer in zip(queries, answers):
        try:
            # q1 : q2 :: q3 : answer  =>  answer - q3 = q2 - q1, so the target
            # is q2 - q1 + q3 (the offset must not be applied with flipped sign).
            target = (
                embeddings.loc[q2]
                - embeddings.loc[q1]
                + embeddings.loc[q3]
            ).to_numpy()
        except KeyError:
            continue  # at least one query word has no embedding

        scores = word_matrix.dot(target)
        found_indices = np.argsort(scores)[::-1][:n_candidates]
        candidates = set(embeddings.index[found_indices])

        num_found_queries += 1
        if answer in candidates:
            num_found_answers += 1
            print("found!", q1, q2, q3, answer)

    # Guard the ratios: it is possible that no query can be looked up at all.
    return {
        "found_queries": num_found_queries/num_total_queries if num_total_queries else 0.0,
        "found_answers": num_found_answers/num_found_queries if num_found_queries else 0.0
    }


# Syntactic data: every query is a single word, its answers are the other words of its row.
# Semantic data: every query is a triple of words and has exactly one answer word.
syntactic_queries, syntactic_answers = load_syntactic_analogies()
semantic_queries, semantic_answers = load_semantic_analogies()

In [None]:
#@title Show a few analogy examples.  { display-mode: "form" }
# Print the first five query/answer pairs of both test sets, separated by a rule.
example_sets = [
    ("Syntactic examples", syntactic_queries, syntactic_answers),
    ("Semantic examples", semantic_queries, semantic_answers),
]
for set_number, (heading, example_queries, example_answers) in enumerate(example_sets):
    if set_number:
        print("---")
    print(heading)
    for i in range(5):
        print(f"Query: {example_queries[i]}")
        print(f"Answers: {example_answers[i]}")

In [None]:
#@title Evaluate w2v embeddings on syntactic and semantic analogies. { display-mode: "form" }
vector_size=20 #@param {type:"number"}
window=5 #@param {type:"number"}
min_count=2 #@param {type:"number"}
max_vocab_size=1000 #@param {type:"number"}
epochs=2 #@param {type:"number"}

# Train with the values chosen in the form above (same order as the signature).
embeddings = train_w2v(vector_size, window, min_count, max_vocab_size, epochs)

# Each heading is printed before its evaluation runs, because the evaluation
# may print progress lines of its own.
print("syntactic")
syntactic_scores = evaluate_most_similar(embeddings, syntactic_queries, syntactic_answers)
print(syntactic_scores)

print("semantic")
semantic_scores = evaluate_4_analogy(embeddings, semantic_queries, semantic_answers)
print(semantic_scores)