# Install libraries

In [2]:
!pip install bokeh
!pip install sentence-transformers
!pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82816 sha256=cf847f5d3e6d271423923231d7d169767272b0dca7694a7a98597ec85bca42a3
  Stored in directory: /root/.cache/pip/wheels/a0/e8/c6/a37ea663620bd5200ea1ba0907ab3c217042c1d035ef606acc
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for py

# Import libraries

In [8]:
import bokeh.models as bmo # for mapping colors to clusters
from bokeh.models import HoverTool, BoxSelectTool # for plotting
import bokeh.plotting as bp # for plotting
from bokeh.plotting import figure, show, output_notebook, save, output_file # for plotting
from bokeh.palettes import d3 # for cluster colors

import pandas as pd # for data handling

from sentence_transformers import SentenceTransformer # for getting sentence embeddings

from sklearn.manifold import TSNE # for dimension reduction
from sklearn.cluster import KMeans # for clustering

import umap # for dimension reduction

# Load data


In [9]:
# Read survey data
survey_data = pd.read_csv("survey_queries_cluster.csv", encoding="utf-8", sep=",")
survey_data_raw = survey_data
survey_data['query'] = survey_data['query_mod']
survey_data = survey_data.drop(['query_mod', 'thema'], axis = 1)

# Step 1: Filter data


In [10]:
filter_words = ["söder", "markus", "scholz", "olaf", "baerbock", "bärbock", "annalena", "laschet", "armin", "cdu", "csu", "gruene", "grüne", "fdp", "spd", "afd", "npd"]
survey_data = survey_data[~survey_data.stack().str.contains('|'.join(filter_words)).any(level=0)]

data = list(survey_data['query'])

  survey_data = survey_data[~survey_data.stack().str.contains('|'.join(filter_words)).any(level=0)]


In [11]:
data[0:10] # the first samples from the data

['wahlomat über goole.de',
 'partei sicherheit',
 'die linke',
 'wahlinhalte',
 'wahl-o-mat + deinwal',
 'wahlprogramme',
 'parteiprogramm',
 'informationen zu kandidaten,',
 'laptop',
 'klimawandel']

# Step 2: Load the RoBERTa model and generate embedding vectors


In [12]:
model = SentenceTransformer("T-Systems-onsite/german-roberta-sentence-transformer-v2") # the model used, this can be switched with other suitable models found on huggingface.com
embeddings = model.encode(data, show_progress_bar=True)



Batches:   0%|          | 0/55 [00:00<?, ?it/s]

# Step 3: Perform k-means clustering on embedding vectors


In [15]:
# Perform kmeans clustering
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)



# Step 4: Plot results via interactive app

In [21]:
# plotting the corresponding word appears when you hover on the data point.
output_notebook()
plot_survey = bp.figure(plot_width=1500, plot_height=1000, title="Clustered search queries from survey",
                        tools="pan,wheel_zoom,box_zoom,reset,save",
                        x_axis_type=None, y_axis_type=None, min_border=1)

# transform embeddings dimensions from 768 to 2 using t-SNE for visualization
tsne_data = TSNE(n_components=2, random_state=42).fit_transform(embeddings)
result = pd.DataFrame(tsne_data, columns=['x', 'y'])

# set labels according to k-means clusters
result['labels_num'] = clustering_model.labels_
result['labels'] = result['labels_num'].astype(str)
# set queries according to actual data
result['query'] = list(survey_data["query"])

palette = d3['Category20'][len(result['labels'].unique())]
color_map = bmo.CategoricalColorMapper(factors=result['labels'].unique(), palette=palette)

# draw single data points into the plot
plot_single = plot_survey.scatter(x='x', y='y',
                                  color={'field': 'labels', 'transform': color_map},
                                  size=4,
                                  alpha=0.8,
                                  source=result,
                                  legend_label="Example query per cluster")

plot_survey.add_tools(HoverTool(renderers=[plot_single], tooltips=[("query", "@query")]))

show(plot_survey)

In [22]:
result

Unnamed: 0,x,y,labels_num,labels,query
0,47.446842,-26.730476,10,10,wahlomat über goole.de
1,-43.129475,16.494520,8,8,partei sicherheit
2,-23.087694,-13.500840,10,10,die linke
3,-0.601526,27.364439,10,10,wahlinhalte
4,57.724289,40.190193,7,7,wahl-o-mat + deinwal
...,...,...,...,...,...
1746,-26.438135,2.975945,18,18,notwendigkeiten der internationalen beziehungen
1747,-44.053589,8.732922,10,10,europäische union
1748,-8.756924,-29.698051,15,15,parteiprogramme
1749,21.096962,41.179817,17,17,wahl o mat
