In [1]:
# Imports and notebook-wide configuration.
import pandas as pd
# show every column when a DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)
import glob
import warnings
# NOTE(review): this silences ALL warnings for the whole session, including
# any emitted by the libraries used below — consider a narrower filter
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import cluster
from sklearn.manifold import TSNE
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sys
# extend the module search path so the querysuggestion and clustering modules
# imported below can be found (presumably they live in ../scripts/)
sys.path.append('../scripts/')
from querysuggestion import concat_suggestions, vectorize_suggestions
from clustering import kmeans_suggestions, dbscan_suggestions

In [2]:
# Disabled one-off preprocessing, kept for reference: glob the raw CSV files,
# pass them with a start/end date to concat_suggestions and print the covered
# date range. The pattern '*.csv' selects every CSV file in the directory.
# NOTE(review): presumably this produced the frame that was exported to the
# parquet file loaded in the next cell — confirm before deleting.
#path_to_csv = '../../data/BTW17_Suggestions/BTW_COMPLETE/*.csv'
#file_list = glob.glob(path_to_csv)

#start = '2017-05-29'
#end = '2017-10-09'
#suggestions_df = concat_suggestions(file_list, start, end)
#print(f'daterange: {suggestions_df["date"].min()}, {suggestions_df["date"].max()}')

In [3]:
# Load the preprocessed suggestions from the parquet file.
# The commented-out line is the disabled export to the same path (presumably
# the one-off step that created the file).
#suggestions_df.to_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
suggestions_df = pd.read_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')

In [4]:
# Build a token list per row by splitting each suggestion on single spaces.
def _to_tokens(text):
    """Return the space-separated tokens of ``text`` (coerced to ``str`` first)."""
    return str(text).split(' ')

suggestions_df['tokens'] = suggestions_df['suggestion'].map(_to_tokens)
suggestions_df.head(3)

Unnamed: 0,date,queryterm,ranking,suggestion,tokens
0,2017-05-29 05:00:01,doris ahnen,1.0,ministerin,[ministerin]
1,2017-05-29 05:00:01,doris ahnen,2.0,http://de.wikipedia.org/wiki/Doris_Ahnen,[http://de.wikipedia.org/wiki/Doris_Ahnen]
2,2017-05-29 05:00:01,doris ahnen,3.0,kinder,[kinder]


In [5]:
# Vectorize the suggestions with the project helper. It returns two values:
# `suggestions` (used further down as the per-point text of the output frame)
# and `vector_data` (the input to the t-SNE projection).
# NOTE(review): the exact types/shapes are defined in querysuggestion.py — confirm there.
suggestions, vector_data = vectorize_suggestions(suggestions_df)

  0%|          | 0/26274 [00:00<?, ?it/s]

In [6]:
# Project the suggestion vectors to 2-D with t-SNE; the embedding is used both
# for plotting and as the input of the clustering runs below.
# t-SNE is stochastic: without a fixed random_state every kernel restart yields
# a different embedding, so the cluster scores, the chosen number of clusters
# and the saved output could not be reproduced. The seed pins the result.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(vector_data)

In [7]:
# Evaluate k-means on the 2-D t-SNE embedding via the project helper and
# collect the returned scores in a DataFrame. The plotting cell below renames
# the columns 'num_cluster', 'silhouette_score' and 'calinski_harabasz_score'.
# NOTE(review): 10 and 100 look like the bounds of the cluster-count sweep
# (the progress bar reports 91 steps) — confirm in clustering.py.
kmeans_scores = pd.DataFrame(data=kmeans_suggestions(X_tsne, 10, 100))

  0%|          | 0/91 [00:00<?, ?it/s]

In [10]:
# Replace the raw score column names with the labels shown in the figure.
kmeans_display_names = {
    'calinski_harabasz_score': 'Calinski Harabasz Score',
    'silhouette_score': 'Silhouette Score',
    'num_cluster': 'Anzahl Cluster',
}
kmeans_scores.rename(columns=kmeans_display_names, inplace=True)

# One subplot per score: (score column, line colour, subplot column).
kmeans_panels = [
    ('Silhouette Score', 'rgb(133, 92, 117)', 1),
    ('Calinski Harabasz Score', 'rgb(217, 175, 107)', 2),
]

fig = make_subplots(1, 2)
for score_name, line_colour, panel_col in kmeans_panels:
    # score as a function of the number of clusters
    score_trace = go.Scatter(
        x=kmeans_scores['Anzahl Cluster'],
        y=kmeans_scores[score_name],
        name=score_name,
        line=dict(color=line_colour),
    )
    fig.add_trace(score_trace, row=1, col=panel_col)
    fig.update_yaxes(title_text=score_name, row=1, col=panel_col)
    fig.update_xaxes(title_text='Anzahl Cluster', row=1, col=panel_col)

fig.update_layout(
    template='simple_white',
    font=dict(family='Computer Modern', color='black', size=15),
)
fig.show()

In [18]:
# Enable IPython's autoreload extension in mode 2 and import
# dbscan_suggestions from the clustering module again.
# NOTE(review): presumably added so that edits to clustering.py are picked up
# without restarting the kernel; dbscan_suggestions is already imported in the
# first cell, so this cell could move to the top of the notebook.
%reload_ext autoreload
%autoreload 2
from clustering import dbscan_suggestions

In [19]:
# Evaluate DBSCAN on the 2-D t-SNE embedding via the project helper and
# collect the returned scores in a DataFrame. The plotting cell below renames
# the columns 'eps', 'silhouette_score', 'calinski_harabasz_score',
# 'num_cluster' and 'num_noise'.
# NOTE(review): the parameter grid is defined inside clustering.py (the
# progress bar reports 19 steps) — confirm there.
dbscan_scores = pd.DataFrame(data=dbscan_suggestions(X_tsne))

  0%|          | 0/19 [00:00<?, ?it/s]

In [21]:
# Replace the raw score column names with the labels shown in the figure.
dbscan_display_names = {
    'eps': 'Maximale Distanz',
    'silhouette_score': 'Silhouette Score',
    'calinski_harabasz_score': 'Calinski Harabasz Score',
    'num_cluster': 'Anzahl Cluster',
    'num_noise': 'Anzahl Rauschpunkte',
}
dbscan_scores.rename(columns=dbscan_display_names, inplace=True)

# One subplot per score: (score column, line colour, subplot column).
dbscan_panels = [
    ('Silhouette Score', 'rgb(133, 92, 117)', 1),
    ('Calinski Harabasz Score', 'rgb(217, 175, 107)', 2),
]

fig = make_subplots(1, 2)
for score_name, line_colour, panel_col in dbscan_panels:
    # score as a function of the maximum neighbourhood distance
    score_trace = go.Scatter(
        x=dbscan_scores['Maximale Distanz'],
        y=dbscan_scores[score_name],
        name=score_name,
        line=dict(color=line_colour),
    )
    fig.add_trace(score_trace, row=1, col=panel_col)
    fig.update_yaxes(title_text=score_name, row=1, col=panel_col)
    fig.update_xaxes(title_text='Maximale Distanz', row=1, col=panel_col)

fig.update_layout(
    template='simple_white',
    font=dict(family='Computer Modern', color='black', size=15),
)
fig.show()

Aktuell: Entscheidung anhand des Calinski-Harabasz-Scores für k-Means mit 94 Clustern.

In [24]:
from sklearn import cluster, metrics

# Final model: k-means with the number of clusters chosen from the score plots.
N_CLUSTERS = 94

# k-means starts from random centroids; a fixed random_state makes the labels
# (and the scores printed below) reproducible across kernel restarts.
kmeans = cluster.KMeans(n_clusters=N_CLUSTERS, random_state=42)

# fit_predict fits the model and returns the cluster index of every sample in
# one call, instead of fitting and then predicting on the same data.
labels = kmeans.fit_predict(X_tsne)

# Quality of the final clustering on the embedding it was fitted on.
silhouette = metrics.silhouette_score(X_tsne, labels)
calinski_harabasz = metrics.calinski_harabasz_score(X_tsne, labels)
print(f'Silhouette Score: {silhouette}\nCalinski Harabasz Score: {calinski_harabasz}')

Silhouette Score: 0.36478814482688904
Calinski Harabasz Score: 24311.570353318948


In [27]:
# Keep only the non-empty entries of the suggestion list.
# NOTE(review): this assumes the filtered list has exactly one entry per row
# of X_tsne; pandas raises a ValueError on the assignment below otherwise.
suggestions = [x for x in suggestions if x]

# Assemble one row per embedded suggestion. Every per-row column (text,
# cluster label, original vector) is attached BEFORE sorting: sort_values with
# ignore_index=True reorders the rows and resets the index, so a column
# assigned afterwards would be paired with the wrong suggestions.
output_df = pd.DataFrame(X_tsne, columns=['t-SNE(x)', 't-SNE(y)'])
output_df['suggestion'] = suggestions
output_df['cluster'] = labels
output_df['vector'] = list(vector_data)

# Sort by the numeric cluster label first, then convert the label to a string.
output_df.sort_values(by='cluster', inplace=True, ignore_index=True)
output_df['cluster'] = output_df['cluster'].apply(str)

# Persist the clustered suggestions.
output_df.to_json('../../data/BTW17_Suggestions/suggestions/cluster.json')

In [28]:
# Capitalise the two columns that appear in the figure (colour legend and
# hover label).
scatter_display_names = {'cluster': 'Cluster', 'suggestion': 'Suggestion'}
output_df.rename(columns=scatter_display_names, inplace=True)

# Scatter of the 2-D embedding, coloured by cluster, with the suggestion text
# shown on hover.
scatter_options = dict(
    x='t-SNE(x)',
    y='t-SNE(y)',
    color='Cluster',
    hover_name='Suggestion',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig = px.scatter(output_df, **scatter_options)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()