In [None]:
%%capture
#@title Install dependencies { display-mode: "form" }
# Installs the notebook's third-party packages through the shell; the %%capture
# cell magic above keeps pip's output out of the notebook.
# NOTE(review): no versions are pinned and the second command upgrades gensim and
# spaCy to whatever is newest at run time - consider pinning for reproducibility.
# NOTE(review): tqdm and scikit-learn are imported by the next cell but are not
# installed here - presumably they are preinstalled in the target environment.
!pip install numpy pandas matplotlib 
!pip install gensim spacy --upgrade

In [None]:
%%capture
#@title import dependencies and load data { display-mode: "form" }
# When true, iter_docs and iter_sents stop early (after a few documents / about a
# hundred sentences per document) so the notebook runs quickly on a small sample.
DEBUG = True #@param {type:"boolean"}

import pandas as pd
import numpy as np
import scripts.utils as utils
from tqdm import tqdm
import json
import itertools
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from gensim.models import Word2Vec

from functools import partial
from pathlib import Path

from spacy.lang.de import German
# German spaCy language object with a "sentencizer" component added; iter_sents
# reads sentence spans from nlp(text).sents.
nlp = German()
nlp.add_pipe("sentencizer")

# Corpus file; iter_docs parses it as one JSON object per line.
preprocessed_data_fn = Path("processed_data") / "eltec.jsonl"

# Results of the training cell, each keyed by the `attribute` value used for the run.
w2vs = {}              # attribute -> DataFrame returned by train_w2v (index = words)
vocab = {}             # attribute -> index (words) of that DataFrame
embedding_matrix = {}  # attribute -> values (vectors) of that DataFrame

def iter_docs(in_path):
    """Yield one parsed JSON document per non-blank line of a JSON-lines file.

    The file is decoded explicitly as UTF-8 so that non-ASCII text is read the
    same way on every platform instead of depending on the locale's default
    encoding. Whitespace-only lines are skipped rather than handed to
    ``json.loads`` (which would raise on them).

    When the module-level ``DEBUG`` flag is true, reading stops after the line
    with index 11, i.e. at most twelve lines of the file are consumed.

    Args:
        in_path: path of the JSON-lines file.

    Yields:
        The object decoded from each line by ``json.loads``.
    """
    with open(in_path, "r", encoding="utf-8") as in_file:
        for iline, line in tqdm(enumerate(in_file), desc="load documents"):
            if line.strip():
                yield json.loads(line)
            # The debug cap counts physical lines, blank ones included.
            if iline > 10 and DEBUG:
                break
                
# Raise spaCy's length limit until it is strictly larger than the longest text
# returned by iter_docs, so nlp(text) accepts every document.
for doc in iter_docs(preprocessed_data_fn):
    nlp.max_length = max(nlp.max_length, len(doc['text']) + 1)

def iter_sents(docs):
    """Split each document into sentences and yield one record per sentence.

    Every record contains the sentence's token strings under 'text', the
    sentence's position inside its document under 'sentence_id', and all other
    fields of the source document (everything except its 'text').

    When the module-level ``DEBUG`` flag is true, a document is abandoned after
    the sentence with index 101, i.e. at most 102 sentences are taken from it.
    """
    for doc in docs:
        parsed = nlp(doc['text'])
        numbered_sents = tqdm(enumerate(parsed.sents), desc="load sentences")
        for isent, sent in numbered_sents:
            record = {
                'text': [token.text for token in sent],
                'sentence_id': isent,
            }
            # Document metadata is merged last, exactly like a trailing ** unpacking.
            record.update((key, value) for key, value in doc.items() if key != 'text')
            yield record
            if DEBUG and isent > 100:
                break
                
def iter_gender(iterable, gender='f'):
    """Yield only the records whose 'gender-cat' entry equals ``gender``.

    Args:
        iterable: records (mappings) that carry a 'gender-cat' key.
        gender: 'm' or 'f'.

    Raises:
        ValueError: for any other ``gender``. Because this is a generator
            function, the error surfaces when iteration starts, not at call time.
    """
    if gender not in ('m', 'f'):
        raise ValueError('gender must be either "m" or "f", atm no other genders in the data :/')
    yield from (record for record in iterable if record['gender-cat'] == gender)
                
def iter_author(iterable, author='Willkomm, Ernst Adolf'):
    """Yield only the records whose 'author-name' entry equals ``author``.

    Args:
        iterable: records (mappings) that carry an 'author-name' key.
        author: author name to keep; it is compared for exact equality.
    """
    for record in iterable:
        if record['author-name'] != author:
            continue
        yield record

In [None]:
%%capture
#@title helper functions for w2v { display-mode: "form" }
class W2VDataset:
    """Re-iterable view of a sentence stream, as needed for multi-pass training.

    Args:
        generator_factory: zero-argument callable that returns a *fresh*
            iterable of records each time it is called; every record must have
            a 'text' entry. It is called once per pass over the dataset.

    Attributes are plain instance fields:
        generator_factory: the callable given to the constructor.
        length: number of records seen in the most recent complete pass, or
            None while no pass has finished yet.
    """

    def __init__(self, generator_factory):
        self.generator_factory = generator_factory
        self.length = None

    def __iter__(self):
        """Yield the 'text' entry of every record and remember how many there were."""
        count = 0
        for record in self.generator_factory():
            yield record['text']
            count += 1
        # Only reached when the pass ran to the end; an empty pass records 0
        # (the previous enumerate-based bookkeeping reported 1 in that case).
        self.length = count

    def __len__(self):
        """Return the record count, running one full pass if it is not known yet."""
        if self.length is None:
            self.length = sum(1 for _ in self)
        return self.length
        

def train_w2v(vector_size=50, window=5, min_count=2, max_vocab_size=20000, epochs=2, split='gender', attribute='f'):
    """Train a word2vec model on (a subset of) the corpus and return its vectors.

    Args:
        vector_size, window, min_count, max_vocab_size: passed to gensim's
            ``Word2Vec`` unchanged.
        epochs: number of training passes over the selected sentences.
        split: 'gender' keeps documents whose 'gender-cat' equals ``attribute``;
            'author' keeps documents whose 'author-name' equals ``attribute``;
            any other value trains on every document.
        attribute: value to keep for the chosen ``split``.

    Returns:
        pandas.DataFrame with one row per vocabulary word (index sorted
        alphabetically) holding that word's vector.

    Raises:
        ValueError: if ``split`` is 'author' and ``attribute`` is not among the
            author names of the documents, or (raised by ``iter_gender`` on the
            first pass) if ``split`` is 'gender' and ``attribute`` is not
            'm'/'f'.
    """
    # Each branch defines a factory that rebuilds the document stream from the
    # file. Binding an already-created generator (e.g. via functools.partial)
    # would leave it exhausted after the vocabulary scan, so every training
    # pass would then see an empty corpus.
    if split == 'gender':
        def make_docs():
            return iter_gender(iter_docs(preprocessed_data_fn), gender=attribute)

    elif split == 'author':
        # Author names live on the documents, so there is no need to run the
        # sentence splitter over the corpus just to validate the name.
        authors = {doc.get('author-name') for doc in iter_docs(preprocessed_data_fn)}
        if attribute not in authors:
            raise ValueError('author not in list, check spelling: last name: first name1, first name2')

        def make_docs():
            return iter_author(iter_docs(preprocessed_data_fn), author=attribute)

    else:
        def make_docs():
            return iter_docs(preprocessed_data_fn)

    w2vdataset = W2VDataset(lambda: iter_sents(make_docs()))

    # Build the model without a corpus: passing `sentences=` to the constructor
    # would already train it with gensim's default epoch count, and the explicit
    # train() call below would then add `epochs` further passes on top.
    model = Word2Vec(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        max_vocab_size=max_vocab_size,
        workers=4
    )
    model.build_vocab(w2vdataset)

    model.train(
        w2vdataset,
        total_examples=model.corpus_count,  # counted during build_vocab
        epochs=epochs
    )

    wv = model.wv
    vocab = sorted(wv.index_to_key)
    vecs = [wv[word] for word in vocab]

    return pd.DataFrame(data=vecs, index=vocab)

def iter_embed_w2v(vecs, docs):
    """Yield one vector per document: the mean of its tokens' word vectors.

    Args:
        vecs: DataFrame of word vectors, indexed by word (as returned by
            ``train_w2v``).
        docs: iterable of records whose 'text' entry is a sequence of tokens.

    Yields:
        1-D numpy array of length ``vecs.shape[1]``. Tokens that are not in
        ``vecs.index`` are ignored; repeated tokens count once per occurrence.
        A document without a single known token yields an all-NaN vector, so
        the output stays aligned with ``docs`` instead of raising from
        ``np.vstack`` on an empty list.
    """
    known_words = vecs.index
    for doc in docs:
        known_tokens = [token for token in doc['text'] if token in known_words]
        if not known_tokens:
            yield np.full(vecs.shape[1], np.nan)
            continue
        # One label-based lookup for the whole document instead of one per token.
        yield vecs.loc[known_tokens].to_numpy().mean(axis=0)
        
        
def compute_tsne(words, vocab, embedding_matrix, perplexity=None, random_state=None):
    """Project the embeddings of ``words`` to two dimensions with t-SNE.

    Args:
        words: words to project.
        vocab, embedding_matrix: passed to ``utils.lookup_embeddings`` together
            with ``words`` to obtain the vectors.
            NOTE(review): assumes the helper returns one row per word - confirm.
        perplexity: t-SNE perplexity. ``None`` (default) uses scikit-learn's
            default of 30, lowered when necessary so that it stays below the
            number of samples; scikit-learn rejects a perplexity that is not
            smaller than the sample count, which a short word list (e.g. ten
            nearest neighbours) would otherwise trigger.
        random_state: seed forwarded to ``TSNE`` for a reproducible layout;
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(words, projection)`` where ``projection`` is the array
        returned by ``TSNE.fit_transform`` (two columns).
    """
    embeddings = np.asarray(utils.lookup_embeddings(words, vocab, embedding_matrix))
    if perplexity is None:
        perplexity = min(30.0, max(embeddings.shape[0] - 1, 1))
    projection = TSNE(
        n_components=2,
        learning_rate='auto',
        init='random',
        perplexity=perplexity,
        random_state=random_state,
    ).fit_transform(embeddings)
    return words, projection

def plot_tsne(words, tsne_rep):
    """Scatter the 2-D points in ``tsne_rep`` on the current axes and label each with its word.

    Args:
        words: labels; the i-th word annotates the i-th row of ``tsne_rep``.
        tsne_rep: array indexed as ``[row, column]``; columns 0 and 1 are used
            as x and y.
    """
    xs = tsne_rep[:, 0]
    ys = tsne_rep[:, 1]
    plt.scatter(xs, ys)
    for position, label in enumerate(words):
        point = tsne_rep[position]
        plt.annotate(label, point)
        

def nearest_neighbors(target, vocab, embedding_matrix, n=25, printout=True):
    """Return the ``n`` vocabulary entries most cosine-similar to ``target``.

    Args:
        target: word to look up (via ``utils.lookup_embeddings``).
        vocab: indexable collection of words; position i belongs to row i of
            ``embedding_matrix``.
        embedding_matrix: 2-D array with one embedding per row.
        n: number of neighbours to return.
        printout: when true, the result is also printed.

    Returns:
        List of words ordered from most to least similar.
    """
    query = utils.lookup_embeddings([target], vocab, embedding_matrix)
    unit_query = query / np.linalg.norm(query)

    # Divide every column of the transposed matrix by the norm of its embedding.
    row_norms = np.linalg.norm(embedding_matrix, axis=1)
    unit_matrix_t = embedding_matrix.T / row_norms[:, None].T

    sim_matrix = np.dot(unit_query, unit_matrix_t)

    # argsort is ascending, so reverse it to put the highest similarity first.
    ranking = np.flip(np.argsort(sim_matrix), axis=1)[0, :n]
    nearest = [vocab[idx] for idx in ranking]

    if printout:
        print(nearest)

    return nearest

In [None]:
#@title Train word2vec embeddings on the selected split of the corpus { display-mode: "form" }

# Hyperparameters below are forwarded unchanged to train_w2v.
vector_size=20 #@param {type:"number"}
window=5 #@param {type:"number"}
min_count=2 #@param {type:"number"}
max_vocab_size=20000 #@param {type:"number"}
epochs=2 #@param {type:"number"}
# `split` selects how the corpus is filtered and `attribute` the value to keep.
# NOTE(review): train_w2v only has dedicated branches for 'gender' and 'author';
# 'all' and 'book' both fall through to training on every document.
split='gender' #@param ["all", "gender", "author", "book"]
attribute='f' #@param {type:"string"}

# Results are stored under `attribute` only, so a later run with the same
# attribute (even with a different split) overwrites this entry.
w2vs[attribute] = train_w2v(
    vector_size=vector_size,
    window=window,
    min_count=min_count,
    max_vocab_size=max_vocab_size,
    epochs=epochs,
    split=split,
    attribute=attribute
)

# Keep the words and the raw vectors of the trained model for the lookup helpers.
vocab[attribute] = w2vs[attribute].index
embedding_matrix[attribute] = w2vs[attribute].values

In [None]:
# For every trained model: take the 10 nearest neighbours of 'Franzose', project
# them to 2-D with t-SNE and scatter-plot them, labelled, on the current figure.
# NOTE(review): the loop variable reuses the name `attribute`, so running this
# cell overwrites the value chosen in the training cell.
for attribute in w2vs.keys():

    nearest = nearest_neighbors('Franzose', vocab[attribute], embedding_matrix[attribute], n=10, printout=False)
    words, tsne_rep = compute_tsne(nearest, vocab[attribute], embedding_matrix[attribute])
    plot_tsne(words, tsne_rep)

# One legend label per trained model - presumably matched to the scatter plots
# in the order they were drawn; confirm against the rendered figure.
plt.legend(w2vs.keys())