In [1]:
import spacy
from itertools import chain, groupby
from operator import itemgetter
from sklearn.cluster import DBSCAN
import gensim
import numpy as np

from solrhandler import SolrHandler

## Helpers

In [16]:
def preprocess_text(text, nlp_model):
    """Reduce ``text`` to a lower-cased, space-separated string of lemmas.

    Args:
        text: Raw input string.
        nlp_model: Callable that turns ``text`` into an iterable of tokens
            exposing ``lemma_``, ``is_stop`` and ``is_alpha``.

    Returns:
        str: Lemmas of all alphabetic, non-stop-word tokens, joined by single
        spaces and lower-cased. Empty string if no token qualifies.
    """
    kept_lemmas = []
    for token in nlp_model(text):
        # Skip stop words and anything that is not purely alphabetic.
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_)
    joined = " ".join(kept_lemmas)
    return joined.lower()

In [17]:
def get_unique_tokens(tokens):
    """Return the first token seen for each distinct ``text``, in input order.

    Args:
        tokens: Iterable of objects exposing a hashable ``text`` attribute.

    Returns:
        list: One token per distinct ``text``; for repeated texts the earliest
        token is kept. Empty list for empty input.
    """
    # A set gives O(1) membership tests; checking against a list made the
    # whole de-duplication quadratic in the number of distinct texts.
    seen_texts = set()
    tokens_unique = []
    for tok in tokens:
        text = tok.text
        if text not in seen_texts:
            seen_texts.add(text)
            tokens_unique.append(tok)
    return tokens_unique

In [18]:
def flatten_lst(lst):
    """Flatten one level of nesting.

    Args:
        lst: Iterable of iterables.

    Returns:
        list: All items of the inner iterables, concatenated in order.
    """
    flattened = chain.from_iterable(lst)
    return list(flattened)

In [19]:
def get_word2cluster_topic(clusters):
    """Map every cluster member to its cluster's representative.

    The representative ("topic") of a cluster is its first element.

    Args:
        clusters: Iterable of sequences of hashable items.

    Returns:
        dict: ``{member: first_member_of_its_cluster}``. If an item occurs in
        several clusters, the last cluster containing it wins. Empty clusters
        contribute nothing.
    """
    return {
        member: cluster[0]
        for cluster in clusters
        for member in cluster
    }

In [20]:
def drop_duplicates(lst):
    """Remove duplicates from ``lst`` while keeping first-occurrence order.

    Args:
        lst: Iterable of hashable items.

    Returns:
        list: The distinct items of ``lst`` in the order they first appear.
    """
    # dict preserves insertion order, so the result is deterministic.
    # list(set(...)) returned strings in an order that can differ from one
    # interpreter run to the next, which made notebook output irreproducible.
    return list(dict.fromkeys(lst))

In [38]:
def get_clusters(tokens, clustering):
    """Group token texts by the cluster label assigned to their vectors.

    Args:
        tokens: Iterable of objects exposing ``vector`` and ``text``.
        clustering: Object with a ``fit_predict(vectors)`` method returning
            one label per input vector (e.g. ``sklearn.cluster.DBSCAN``).

    Returns:
        list[list[str]]: One list of token texts per cluster, ordered by
        ascending label, members in input order. Tokens labelled ``-1``
        (the label DBSCAN gives to noise samples) are appended afterwards as
        singleton clusters, in input order. Empty list for empty input.
    """
    tokens = list(tokens)
    if not tokens:
        # Nothing to cluster; avoid calling fit_predict with zero samples.
        return []

    vecs = [tok.vector for tok in tokens]
    labels = clustering.fit_predict(vecs)

    # Group in a single pass keyed by the actual label values. Indexing by
    # range(len(set(labels))) assumed labels were exactly 0..k-1 and rescanned
    # every element once per cluster.
    members_by_label = {}
    noise_clusters = []
    for tok, label in zip(tokens, labels):
        if label == -1:
            # Noise belongs to no cluster: keep each token as its own cluster
            # instead of silently dropping it.
            noise_clusters.append([tok.text])
        else:
            members_by_label.setdefault(label, []).append(tok.text)

    clusters = [members_by_label[label] for label in sorted(members_by_label)]
    return clusters + noise_clusters

## Preparation

In [21]:
# Load the spaCy pipeline "de_core_news_lg". It is reused below both for the
# lemma preprocessing and for the token vectors that drive the clustering.
nlp = spacy.load("de_core_news_lg")

In [22]:
# Fetch the corpus from Solr into `df`.
# "*%3A*" is the URL-encoded form of "*:*" (%3A is ":"), Solr's match-all query.
# NOTE(review): max_elems presumably caps the number of documents returned;
# confirm against solrhandler.SolrHandler.
handler = SolrHandler(max_elems=20000)
df = handler.get_df_from_query("*%3A*")

In [23]:
# Each entry of the ssdsLemma column is joined into one space-separated string
# and then normalised with preprocess_text (lemmatised, stop words and
# non-alphabetic tokens removed, lower-cased).
lemma_lst = [
    preprocess_text(" ".join(doc_lemmas), nlp)
    for doc_lemmas in df.ssdsLemma.tolist()
]

## Keyword Clustering

In [41]:
def get_keywords_clustered(lemma_lst, nlp_model, clustering=None):
    """Replace each document's keywords by their cluster representatives.

    All distinct tokens of the corpus that have a vector are clustered on
    those vectors; every token is then mapped to the first member of its
    cluster. Tokens without a vector map to themselves.

    Args:
        lemma_lst: List of strings, one per document.
        nlp_model: spaCy-style pipeline; ``nlp_model.pipe(lemma_lst)`` must
            yield one iterable of tokens per document, the tokens exposing
            ``text``, ``has_vector`` and ``vector``.
        clustering: Object with a ``fit_predict`` method. Defaults to
            ``DBSCAN(eps=.15, min_samples=1, metric="cosine")``.

    Returns:
        list[list[str]]: Per document, the de-duplicated cluster
        representatives of its tokens.
    """
    if clustering is None:
        # Built per call. A default created in the signature is evaluated once
        # at definition time, so every call would refit the same shared
        # estimator instance.
        clustering = DBSCAN(eps=.15, min_samples=1, metric="cosine")

    lemmas_tokens = [list(doc) for doc in nlp_model.pipe(lemma_lst)]
    lemma_tokens_lst = get_unique_tokens(flatten_lst(lemmas_tokens))

    # Partition in one pass instead of testing each token for membership in
    # the list of vectorizable tokens (a quadratic scan).
    tokens_vectorizable = []
    tokens_not_vectorizable = []
    for tok in lemma_tokens_lst:
        if tok.has_vector:
            tokens_vectorizable.append(tok)
        else:
            tokens_not_vectorizable.append(tok)

    # An estimator cannot be fitted on zero samples, so skip it for an empty
    # corpus or one where no token has a vector.
    clusters = get_clusters(tokens_vectorizable, clustering) if tokens_vectorizable else []
    word2cluster_topic = get_word2cluster_topic(clusters)
    word2cluster_topic.update({tok.text: tok.text for tok in tokens_not_vectorizable})

    # Fall back to the token's own text if the clustering step left it without
    # a cluster, rather than raising KeyError.
    return [
        drop_duplicates([word2cluster_topic.get(tok.text, tok.text) for tok in res])
        for res in lemmas_tokens
    ]

In [51]:
# Clusterer used for this run. eps=.3 is a wider neighbourhood radius than the
# eps=.15 default of get_keywords_clustered, so more keywords get merged.
clustering = DBSCAN(eps=.3, min_samples=1, metric="cosine")

In [52]:
# One list per document: the de-duplicated cluster representatives of its lemmas.
lemmas_clustered = get_keywords_clustered(lemma_lst, nlp, clustering)