# Install libraries

In [None]:
!pip install umap-learn
!pip install sentence_transformers
!#pip install hdbscan
!pip install bokeh

Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82816 sha256=ae120a5a2ef64df76157cf75cfad63660908d410d1cad0e15f09110137f9376e
  Stored in directory: /root/.cache/pip/wheels/a0/e8/c6/a37ea663620bd5200ea1ba0907ab3c217042c1d035ef606acc
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for py

# Import libraries

In [None]:
import pandas as pd # for data handling
from sentence_transformers import SentenceTransformer # for getting sentence embeddings
import umap # for dimension reduction
from sklearn.manifold import TSNE # for dimension reduction
# import hdbscan # for clustering
import bokeh.plotting as bp # for plotting
from bokeh.models import HoverTool, BoxSelectTool # for plotting
from bokeh.plotting import figure, show, output_notebook, save, output_file # for plotting
from bokeh.palettes import d3 # for cluster colors
import bokeh.models as bmo # for mapping colors to clusters
from sklearn.cluster import KMeans # for clustering
import numpy as np

# Load data

In [None]:
# Read survey data
survey_data_raw = pd.read_csv("survey_queries_cluster.csv", encoding="utf-8", sep=",")

# Work on a real copy so the raw frame is not modified by the steps below
# (a plain assignment would only create a second name for the same object).
survey_data = survey_data_raw.copy()
survey_data['query'] = survey_data['query_mod']
survey_data = survey_data.drop(['query_mod', 'thema'], axis=1)

# Rows in which any column contains one of these substrings are removed.
filter_words = ["söder", "markus", "scholz", "olaf", "baerbock", "bärbock", "annalena", "laschet", "armin", "cdu", "csu", "gruene", "grüne", "fdp", "spd", "afd", "npd"]
filter_pattern = '|'.join(filter_words)

# Stack all cells into one long Series (indexed by row, column), test each cell,
# then reduce back to one boolean per original row.
# - `na=False`: non-string cells count as "no match" and the mask stays boolean.
# - `groupby(level=0).any()` replaces `.any(level=0)`, which pandas 2.0 removed.
# - `reindex`: rows without any non-null cell are dropped by `stack`; they cannot
#   match, so they are filled with False to keep the mask aligned with the frame.
has_filter_word = (
    survey_data.stack()
    .str.contains(filter_pattern, na=False)
    .groupby(level=0)
    .any()
    .reindex(survey_data.index, fill_value=False)
)
survey_data = survey_data[~has_filter_word]

# Plain list of query strings, used as input for the embedding model
data = list(survey_data['query'])

  survey_data = survey_data[~survey_data.stack().str.contains('|'.join(filter_words)).any(level=0)]


In [None]:
# Peek at the first ten queries
data[:10]

['wahlomat über goole.de',
 'partei sicherheit',
 'die linke',
 'wahlinhalte',
 'wahl-o-mat + deinwal',
 'wahlprogramme',
 'parteiprogramm',
 'informationen zu kandidaten,',
 'laptop',
 'klimawandel']

In [None]:
# Sentence-embedding model used for the queries. Other suitable models from the
# Hugging Face hub can be substituted here, for example
# 'distilbert-base-nli-mean-tokens' or 'all-MiniLM-L6-v2'.
model_name = "T-Systems-onsite/german-roberta-sentence-transformer-v2"
model = SentenceTransformer(model_name)

# Encode the queries into embedding vectors, showing a progress bar while encoding
embeddings = model.encode(data, show_progress_bar=True)

Downloading (…)7f314/.gitattributes:   0%|          | 0.00/445 [00:00<?, ?B/s]

Downloading (…)db0ea9b7f314/LICENSE:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading (…)0ea9b7f314/README.md:   0%|          | 0.00/607 [00:00<?, ?B/s]

Downloading (…)a9b7f314/config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]



Batches:   0%|          | 0/55 [00:00<?, ?it/s]

In [None]:
# helper function to place aggregated circles more in the middle of the main
# data point clouds
def reject_outliers(data, m=1):
    """Return the mean of ``data`` after discarding values far from its mean.

    Parameters
    ----------
    data : numpy.ndarray or pandas.Series
        Numeric values; must support boolean-mask indexing.
    m : float, default 1
        Values whose absolute distance from the mean is ``m`` standard
        deviations or more are discarded before averaging.

    Returns
    -------
    float
        Mean of the retained values. If the filter would discard every value
        (a single value, constant values, or e.g. two values with ``m=1``),
        the mean of all values is returned instead of NaN.
    """
    center = np.mean(data)
    kept = data[abs(data - center) < m * np.std(data)]
    # The comparison is strict, so small or constant groups can end up empty;
    # averaging an empty selection would yield NaN, so fall back to the plain mean.
    if len(kept) == 0:
        return center
    return np.mean(kept)

# Plot using umap

In [None]:
# Project the sentence embeddings to 2-D with UMAP, cluster them with k-means and
# draw an interactive scatter plot; hovering over a point shows its query.
output_notebook()
# `width`/`height` are used instead of `plot_width`/`plot_height`, which Bokeh 3 rejects
plot_survey = bp.figure(width=1500, height=1000, title="Clustered search queries from survey",
    tools="pan,wheel_zoom,box_zoom,reset,save",
    x_axis_type=None, y_axis_type=None, min_border=1)

# reduce the high-dimensional embeddings to 2 dimensions for visualisation;
# a fixed random_state makes the layout reproducible across runs
umap_data = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine',
                      random_state=42).fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])


# Perform k-means clustering on the full embeddings (not on the 2-D projection);
# a fixed random_state keeps the cluster numbering stable across runs
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters, random_state=42)
clustering_model.fit(embeddings)

# set labels according to k-means clusters
result['labels_num'] = clustering_model.labels_
# bokeh's categorical colour mapper works on string factors
result['labels'] = result['labels_num'].astype(str)
# attach the original query of each row (same order as the embeddings)
result['query'] = list(survey_data["query"])

# --- optional: one aggregated marker per cluster (currently disabled) ---
# aggregate data points
#result_grouped = result.groupby("labels_num").agg(
#    x=pd.NamedAgg(column='x', aggfunc=reject_outliers),
#    y=pd.NamedAgg(column='y', aggfunc=reject_outliers),
#    n=pd.NamedAgg(column='labels', aggfunc='count'),)
#result_grouped = result_grouped.reset_index()
# circles are too big, reduce their size
#result_grouped['n'] = result_grouped['n']/3
# sort by label
#result_grouped = result_grouped.sort_values(by=['labels_num'])
# cast to string, bokeh needs strings for labels
#result_grouped['labels'] = result_grouped['labels_num'].astype(str)

#result_grouped['example_query'] = ['parteien aktuell deutschland', 'wahl o mat', 'parteivergleich', 'wahlprogramme', 'wahlomat', 'wahl-o-mat', 'umfragewerte',
#                                        'politik aktuell', 'klimawandel', 'wahlprogramme der parteien', 'freiheit, impfpflicht, steuererhöhung, inflation', 'soziale gerechtigkeit', 'corona', 'wahlprogramm parteien 2021',
#                                        'kanzlerkandidaten', 'wahlprogramme der parteien', 'migrationspolitik', 'themen der parteien', 'parteiprogramme', 'neueste entwicklung']

#result_grouped['legend_labels'] = result_grouped['labels'] + ": " + result_grouped['example_query']


# one colour per cluster label
cluster_labels = list(result['labels'].unique())
palette = d3['Category20'][len(cluster_labels)]
color_map = bmo.CategoricalColorMapper(factors=cluster_labels,
                                   palette=palette)

# draw single data points into the plot
plot_single = plot_survey.scatter(x='x', y='y',
                    color={'field': 'labels', 'transform': color_map},
                    size=4,
                    alpha=0.8,
                    source=result,
                    legend_label="Example query per cluster")

# show the query text when hovering over a single data point
plot_survey.add_tools(HoverTool(renderers=[plot_single], tooltips=[("query","@query")]))

# --- optional: draw the aggregated markers (currently disabled) ---
# The hover tool for `plot_agg` is disabled together with the renderer itself;
# leaving it active raised a NameError because `plot_agg` was never defined.
#plot_agg = plot_survey.scatter(x='x', y='y',
#                    color={'field': 'labels', 'transform': color_map},
#                    size={'field': 'n'},
#                    legend_field='legend_labels',
#                    line_width=2,
#                    alpha=0.3,
#                    source=result_grouped)
#plot_survey.add_tools(HoverTool(renderers=[plot_agg], tooltips=[("Cluster Number", "@labels")]))

#output_file('plot_offen.html', mode='inline')
show(plot_survey)



In [None]:
# Inspect the plotting frame: 2-D coordinates, k-means labels and the query of each row
result

Unnamed: 0,x,y,labels_num,labels,query
0,-3.803198,4.557164,9,9,wahlomat über goole.de
1,2.582238,5.266841,10,10,partei sicherheit
2,-20.737700,6.677759,14,14,die linke
3,-2.003380,7.700168,9,9,wahlinhalte
4,34.184830,14.387916,3,3,wahl-o-mat + deinwal
...,...,...,...,...,...
1746,-2.532474,9.654963,16,16,notwendigkeiten der internationalen beziehungen
1747,-3.018118,10.893704,18,18,europäische union
1748,2.207997,-1.215737,13,13,parteiprogramme
1749,3.707172,-10.130649,7,7,wahl o mat


# Plot using t-sne

In [None]:
# Project the sentence embeddings to 2-D with t-SNE, cluster them with k-means and
# draw an interactive scatter plot; hovering over a point shows its query.
output_notebook()
# `width`/`height` are used instead of `plot_width`/`plot_height`, which Bokeh 3 rejects
plot_survey = bp.figure(width=1500, height=1000, title="Clustered search queries from survey",
                        tools="pan,wheel_zoom,box_zoom,reset,save",
                        x_axis_type=None, y_axis_type=None, min_border=1)

# reduce the high-dimensional embeddings to 2 dimensions using t-SNE for visualization
tsne_data = TSNE(n_components=2, random_state=42).fit_transform(embeddings)
result = pd.DataFrame(tsne_data, columns=['x', 'y'])

# Perform k-means clustering on the full embeddings (not on the 2-D projection);
# seeded like the t-SNE step so the cluster numbering is stable across runs
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters, random_state=42)
clustering_model.fit(embeddings)

# set labels according to k-means clusters
result['labels_num'] = clustering_model.labels_
# bokeh's categorical colour mapper works on string factors
result['labels'] = result['labels_num'].astype(str)
# attach the original query of each row (same order as the embeddings)
result['query'] = list(survey_data["query"])

# --- optional: one aggregated marker per cluster (currently disabled) ---
# aggregate data points
#result_grouped = result.groupby("labels_num").agg(
#    x=pd.NamedAgg(column='x', aggfunc=reject_outliers),
#    y=pd.NamedAgg(column='y', aggfunc=reject_outliers),
#    n=pd.NamedAgg(column='labels', aggfunc='count'))
#result_grouped = result_grouped.reset_index()
# circles are too big, reduce their size
#result_grouped['n'] = result_grouped['n'] / 3
# sort by label
#result_grouped = result_grouped.sort_values(by=['labels_num'])
# cast to string, Bokeh needs it as a string for labels
#result_grouped['labels'] = result_grouped['labels_num'].astype(str)

#result_grouped['example_query'] = ['parteien aktuell deutschland', 'wahl o mat', 'parteivergleich', 'wahlprogramme',
#                                   'wahlomat', 'wahl-o-mat', 'umfragewerte', 'politik aktuell', 'klimawandel',
#                                   'wahlprogramme der parteien', 'freiheit, impfpflicht, steuererhöhung, inflation',
#                                   'soziale gerechtigkeit', 'corona', 'wahlprogramm parteien 2021', 'kanzlerkandidaten',
#                                   'wahlprogramme der parteien', 'migrationspolitik', 'themen der parteien',
#                                   'parteiprogramme', 'neueste entwicklung']

#result_grouped['legend_labels'] = result_grouped['labels'] + ": " + result_grouped['example_query']

# one colour per cluster label
cluster_labels = list(result['labels'].unique())
palette = d3['Category20'][len(cluster_labels)]
color_map = bmo.CategoricalColorMapper(factors=cluster_labels, palette=palette)

# draw single data points into the plot
plot_single = plot_survey.scatter(x='x', y='y',
                                  color={'field': 'labels', 'transform': color_map},
                                  size=4,
                                  alpha=0.8,
                                  source=result,
                                  legend_label="Example query per cluster")

# show the query text when hovering over a single data point
plot_survey.add_tools(HoverTool(renderers=[plot_single], tooltips=[("query", "@query")]))

# --- optional: draw the aggregated markers (currently disabled) ---
# The hover tool for `plot_agg` is disabled together with the renderer itself;
# leaving it active raised a NameError because `plot_agg` was never defined.
#plot_agg = plot_survey.scatter(x='x', y='y',
#                               color={'field': 'labels', 'transform': color_map},
#                               size={'field': 'n'},
#                               legend_field='legend_labels',
#                               line_width=2,
#                               alpha=0.3,
#                               source=result_grouped)
#plot_survey.add_tools(HoverTool(renderers=[plot_agg], tooltips=[("Cluster Number", "@labels")]))

show(plot_survey)

