In [38]:
import os
import re
from itertools import chain

import numpy as np
import spacy
from gensim.models import KeyedVectors, Word2Vec
from sklearn.cluster import DBSCAN

from solrhandler import SolrHandler

## Helper

In [2]:
def preprocess_text(text, nlp_model):
    """Strip markup from ``text`` and tokenize it into lower-cased sentences.

    Anything that looks like an HTML/XML tag is replaced by a space, every
    whitespace run (newlines included) is collapsed to a single space, and
    the cleaned string is handed to ``nlp_model``.

    Args:
        text: Raw string, possibly containing markup tags and newlines.
        nlp_model: Callable returning an object with a ``sents`` iterable of
            token sequences; each token exposes ``lower_`` and ``is_alpha``
            (e.g. a loaded spaCy pipeline).

    Returns:
        One list per sentence holding the lower-cased form of each
        alphabetic token. A sentence without alphabetic tokens yields an
        empty list.
    """
    without_tags = re.sub("<[^<]+?>", " ", text)
    # str.split() without arguments splits on any whitespace run, newlines
    # included, so one split/join pass normalizes all spacing.
    normalized = " ".join(without_tags.split())

    sentences = []
    for sent in nlp_model(normalized).sents:
        words = []
        for tok in sent:
            if tok.is_alpha:
                words.append(tok.lower_)
        sentences.append(words)
    return sentences

In [3]:
def has_vec_for_word(word, model):
    """Report whether ``model`` can return a vector for ``word``.

    Args:
        word: Key to look up.
        model: Any object supporting ``model[word]`` (e.g. gensim
            ``KeyedVectors``).

    Returns:
        ``True`` if the lookup succeeds, ``False`` if it fails with a lookup
        error (``KeyError``/``IndexError``) or a ``TypeError``.
    """
    try:
        model[word]
    except (LookupError, TypeError):
        # Only failed lookups mean "no vector". A bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit, which makes a long scan over
        # the vocabulary impossible to interrupt and hides real errors.
        return False
    return True

In [39]:
def get_clusters(kewords, vecs, clustering):
    """Group keywords by the cluster label assigned to their vectors.

    Args:
        kewords: Sequence of keywords, aligned element-wise with ``vecs``.
        vecs: Feature vectors handed to ``clustering.fit_predict``.
        clustering: Estimator exposing ``fit_predict(vecs)`` that returns one
            integer label per row (e.g. ``sklearn.cluster.DBSCAN``).

    Returns:
        A list of keyword lists. Groups for non-negative labels come first,
        in ascending label order. Every keyword with a negative label
        (DBSCAN marks noise as ``-1``) follows as its own single-element
        list, so no keyword is dropped and no empty cluster is produced.
    """
    labels = clustering.fit_predict(vecs)

    grouped = {}
    noise = []
    # Single pass over the data instead of rescanning it once per cluster.
    for keyword, label in zip(kewords, labels):
        label = int(label)  # normalize NumPy integer labels to plain ints
        if label < 0:
            noise.append([keyword])
        else:
            grouped.setdefault(label, []).append(keyword)

    return [grouped[label] for label in sorted(grouped)] + noise

## Preparation

In [4]:
# Location of the binary word2vec-format base model. Set the
# GERMAN_W2V_MODEL_PATH environment variable to run the notebook on another
# machine; the fallback is the original hard-coded local path.
path2base_model = os.environ.get(
    "GERMAN_W2V_MODEL_PATH", "/Users/jakob/Downloads/german.model"
)

In [5]:
# Load the spaCy pipeline "de_core_news_sm"; it is passed to preprocess_text
# below, which relies on it for sentence splitting and tokenization.
nlp = spacy.load("de_core_news_sm")

In [6]:
# Fetch the documents into a DataFrame. "*%3A*" is the URL-encoded form of
# "*:*". NOTE(review): max_elems presumably caps the number of fetched
# documents and "*:*" is presumably a match-all query — confirm against
# SolrHandler.
handler = SolrHandler(max_elems=20000)
df = handler.get_df_from_query("*%3A*")

In [7]:
# Raw description strings, one per row of the frame.
descriptions = df.d115Description.tolist()
# preprocess_text yields a list of sentences per description; flatten them
# into one corpus-wide list of token lists.
descriptions_preprocessed = [
    sentence
    for description in descriptions
    for sentence in preprocess_text(description, nlp)
]

In [8]:
# Every row of ssdsLemma is treated as an iterable of lemmas; merge all rows
# into a single set of unique keywords.
keywords = set().union(*df.ssdsLemma.tolist())
len(keywords)

6443

## Code

### Base Model

In [9]:
# Load the base vectors from the binary word2vec-format file.
model_base = KeyedVectors.load_word2vec_format(path2base_model, binary=True)

In [10]:
# Keywords for which the base model can return a vector; the count is shown
# as the cell output.
keywords_vectorizable_base = [tok for tok in keywords if has_vec_for_word(tok, model_base)]
len(keywords_vectorizable_base)

922

### Fine-Tuned Model

In [11]:
# Create the model WITHOUT a corpus. Passing the sentences to the constructor
# would build the vocabulary and run a full training pass immediately, and
# that work is discarded when build_vocab() re-initializes the weights in the
# following cell. vector_size must match the vector size of the word2vec file
# intersected later; min_count=1 keeps every token in the vocabulary.
model_tuned = Word2Vec(vector_size=300, min_count=1)

In [12]:
# Build the model's vocabulary from the preprocessed description sentences.
model_tuned.build_vocab(descriptions_preprocessed)

In [13]:
# Corpus size recorded by the model; passed to train() further down.
total_examples = model_tuned.corpus_count

In [14]:
# One lock factor per vocabulary entry (1.0 = vector remains trainable), so
# that intersect_word2vec_format can set per-word values. The array must be
# float32: gensim's compiled training code reads this buffer as 32-bit
# floats, so NumPy's default float64 would be misinterpreted.
model_tuned.wv.vectors_lockf = np.ones(len(model_tuned.wv), dtype=np.float32)

In [15]:
# Merge in weights from the base word2vec file for words that are already in
# this model's vocabulary. Per the gensim documentation, lockf=1.0 keeps the
# imported vectors trainable and no new words are added to the vocabulary.
model_tuned.wv.intersect_word2vec_format(path2base_model, binary=True, lockf=1.0)

In [16]:
# Train on the descriptions for two epochs. train() returns a tuple that is
# shown as the cell output. NOTE(review): per the gensim docs this is
# presumably (effective word count, raw word count) — confirm.
model_tuned.train(descriptions_preprocessed, total_examples=total_examples, epochs=2)

(146263, 191848)

In [17]:
# Keywords for which the fine-tuned model can return a vector; the count is
# shown as the cell output.
keywords_vectorizable_tuned = [tok for tok in keywords if has_vec_for_word(tok, model_tuned.wv)]
len(keywords_vectorizable_tuned)

2571

### Test Clustering

In [23]:
# Look up the vectors of all vectorizable keywords in a single call.
# NOTE(review): presumably one row per keyword, in list order — confirm.
vecs = model_tuned.wv[keywords_vectorizable_tuned]

In [60]:
# DBSCAN over cosine distance. NOTE(review): eps=.07 and min_samples=1 look
# hand-tuned; with min_samples=1 every point should count as a core point, so
# no keyword would be labelled as noise — confirm against the scikit-learn docs.
clustering = DBSCAN(eps=.07, min_samples=1, metric="cosine")

In [61]:
# Cluster the keyword vectors and show how many clusters were produced.
clusters = get_clusters(keywords_vectorizable_tuned, vecs, clustering)
len(clusters)

923

In [64]:
# Sanity check: the size of the first cluster with more than one member plus
# the number of keywords known to the base model equals the number of
# keywords known to the tuned model. NOTE(review): this presumably means the
# keywords missing from the base model all ended up in one cluster — confirm.
# Raises IndexError if no cluster has more than one member.
len([c for c in clusters if len(c) > 1][0]) + len(keywords_vectorizable_base) == len(keywords_vectorizable_tuned)

True