In [1]:
# import relevant modules
import pandas as pd
pd.set_option('display.max_columns', None)
import glob
from tqdm.notebook import tqdm
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import ast
import json
import numpy as np
from gensim import models
from gensim.models import Word2Vec
from sklearn import cluster
from sklearn.manifold import TSNE
import plotly.express as px

In [2]:
# Load the processed suggestions and add a `tokens` column holding each
# suggestion split on single spaces (the value is cast to str first).
suggestions_df = (
    pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
    .assign(tokens=lambda frame: frame['suggestion'].map(lambda text: str(text).split(' ')))
)
suggestions_df.head()

Unnamed: 0,date,queryterm,ranking,suggestion,tokens
0,2017-05-29 05:00:01,doris ahnen,1.0,ministerin,[ministerin]
1,2017-05-29 05:00:01,doris ahnen,2.0,http://de.wikipedia.org/wiki/Doris_Ahnen,[http://de.wikipedia.org/wiki/Doris_Ahnen]
2,2017-05-29 05:00:01,doris ahnen,3.0,kinder,[kinder]
3,2017-05-29 05:00:01,doris ahnen,4.0,ehemann,[ehemann]
4,2017-05-29 05:00:01,doris ahnen,5.0,ministerin rheinland pfalz,"[ministerin, rheinland, pfalz]"


In [3]:
# Load the pretrained word vectors from a word2vec-format text file.
# `model[token]` is used further down to look up the vector of a single token.
# NOTE(review): the vector-retrieval cell keeps only vectors of shape (100,),
# so this file is presumably the 100-dimensional variant -- confirm if swapped.
model = models.KeyedVectors.load_word2vec_format('../../data/Word2Vec/dewiki_20180420_100d.txt')

In [4]:
# Collect the distinct tokenised suggestions as a list of token lists.
# Lists are unhashable, so each one is converted to a tuple for de-duplication.
# `dict.fromkeys` (instead of `set`) keeps the first-seen order: set iteration
# order for strings depends on per-process hash randomisation, which would
# change the row order of everything derived from this list between sessions.
suggestions = [
    list(tokens)
    for tokens in dict.fromkeys(tuple(tokens) for tokens in suggestions_df['tokens'].tolist())
]

In [5]:
# Build one vector per suggestion: the mean of the word vectors of its tokens.
#
# Side effect (relied upon by the following cell): tokens the model cannot
# look up are removed from `suggestions[i]` IN PLACE. A suggestion with no
# known token ends up as an empty list and contributes no row to `data`, so
# `data` lines up with the non-empty entries of `suggestions`, in order.
data = []
for tokens in tqdm(suggestions):
    token_vectors = []
    # Walk the indices back to front so that popping an unknown token does
    # not shift the positions that still have to be visited.
    for j in reversed(range(len(tokens))):
        try:
            token_vectors.append(model[tokens[j]])
        except KeyError:
            # Only a failed lookup means "unknown token". A bare `except`
            # would also swallow KeyboardInterrupt and genuine errors and
            # silently drop the token.
            tokens.pop(j)
    # Skip suggestions without any known token explicitly instead of averaging
    # an empty list (which yields a NaN scalar) and filtering by a hard-coded
    # vector shape afterwards.
    if token_vectors:
        data.append(np.average(token_vectors, axis=0))
data = np.asarray(data)

  0%|          | 0/26274 [00:00<?, ?it/s]

In [6]:
# Drop suggestions whose token list is empty (falsy), keeping the rest in order.
suggestions = list(filter(None, suggestions))

In [7]:
# kmeans clustering
NUM_CLUSTERS = 100
# Both KMeans and t-SNE start from a random initialisation. Fixing the seed
# makes cluster labels and the 2-D layout repeatable for a given `data` array.
RANDOM_STATE = 42

kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS, random_state=RANDOM_STATE)
kmeans.fit(data)

# labels[i] is the cluster index assigned to data[i]
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

# tsne transformation for plotting in 2d
tsne = TSNE(n_components=2, random_state=RANDOM_STATE)
X_tsne = tsne.fit_transform(data)

In [8]:
# Create the output frame: one row per suggestion with its 2-D t-SNE
# coordinates, token list, cluster label and embedding vector.
# X_tsne, suggestions, labels and data are positionally aligned, so every
# per-row column must be attached BEFORE the frame is sorted. Assigning
# `vector` after `sort_values(..., ignore_index=True)` would pair each row
# with the vector of a different suggestion.
sugg_cluster_df = pd.DataFrame(X_tsne, columns=['x', 'y'])
sugg_cluster_df['suggestion'] = suggestions
sugg_cluster_df['cluster'] = labels
sugg_cluster_df['vector'] = list(data)

# Sort on the numeric label first (so clusters appear in numeric order), then
# turn the label into a string so plotly treats it as a discrete category.
sugg_cluster_df = sugg_cluster_df.sort_values(by='cluster', ignore_index=True)
sugg_cluster_df['cluster'] = sugg_cluster_df['cluster'].apply(str)

fig = px.scatter(sugg_cluster_df, x='x', y='y', color='cluster', hover_name='suggestion',
                 template='simple_white', color_discrete_sequence=px.colors.qualitative.Antique)
fig.show()

In [9]:
# Write the clustered suggestions to CSV.
# NOTE: `suggestion` holds Python lists and `vector` holds numpy arrays; CSV
# stores these as their string representations, so a reader has to parse them
# back. No `index` argument is given, so pandas' default index handling applies.
sugg_cluster_df.to_csv('../../data/BTW17_Suggestions/suggestions/cluster.csv')