In [1]:
# import relevant modules
import pandas as pd
pd.set_option('display.max_columns', None)
import glob
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import cluster
from sklearn.manifold import TSNE
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sys
sys.path.append('../scripts/')
from querysuggestion import concat_suggestions, vectorize_suggestions
from clustering import kmeans_suggestions, dbscan_suggestions

In [2]:
# Disabled: rebuilds suggestions_df from the raw CSV exports via concat_suggestions
# for the given date window. The following cell reads a parquet copy instead, so
# these lines only need to be re-enabled when that copy has to be regenerated.
# set to *.csv to process all
#path_to_csv = '../../data/BTW17_Suggestions/BTW_COMPLETE/*.csv'
#file_list = glob.glob(path_to_csv)

#start = '2017-05-29'
#end = '2017-10-09'
#suggestions_df = concat_suggestions(file_list, start, end)
#print(f'daterange: {suggestions_df["date"].min()}, {suggestions_df["date"].max()}')

In [3]:
# Load the preprocessed suggestions from their parquet copy.
# To refresh that copy after rebuilding suggestions_df from CSV, run once:
#suggestions_df.to_parquet('../../data/BTW17_Suggestions/processed/suggestions.parquet')
suggestions_parquet = '../../data/BTW17_Suggestions/processed/suggestions.parquet'
suggestions_df = pd.read_parquet(suggestions_parquet)

In [4]:
# Split each suggestion on single spaces to obtain its token list.
# str() is applied first so that non-string entries are split by their text form.
suggestions_df['tokens'] = [str(text).split(' ') for text in suggestions_df['suggestion']]
suggestions_df.head(3)

Unnamed: 0,date,queryterm,ranking,suggestion,tokens
0,2017-05-29 05:00:01,doris ahnen,1.0,ministerin,[ministerin]
1,2017-05-29 05:00:01,doris ahnen,2.0,http://de.wikipedia.org/wiki/Doris_Ahnen,[http://de.wikipedia.org/wiki/Doris_Ahnen]
2,2017-05-29 05:00:01,doris ahnen,3.0,kinder,[kinder]


In [5]:
# vectorize_suggestions is imported from the project's querysuggestion module
# (found via the '../scripts/' path entry); its implementation is not shown here.
# It returns two objects: `vector_data` is what t-SNE is fitted on in the next cell,
# and `suggestions` is later filtered for empty entries and attached to the plot frame.
# NOTE(review): presumably one vector per suggestion, in matching order - confirm in the helper.
suggestions, vector_data = vectorize_suggestions(suggestions_df)

  0%|          | 0/26274 [00:00<?, ?it/s]

In [6]:
# t-SNE projection to 2D, used both for plotting and as the input to DBSCAN below.
# t-SNE is stochastic: without a fixed seed every run yields a different layout,
# and the DBSCAN parameters chosen further down (tuned on this embedding) would
# no longer reproduce the same clusters. Fix the seed so Restart & Run All is stable.
TSNE_SEED = 42
tsne = TSNE(n_components=2, random_state=TSNE_SEED)
X_tsne = tsne.fit_transform(vector_data)

In [54]:
%load_ext autoreload
%autoreload 2
from clustering import dbscan_suggestions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [55]:
# Run the project's DBSCAN parameter sweep on the 2D embedding and tabulate the result.
dbscan_grid_results = dbscan_suggestions(X_tsne)
dbscan_scores = pd.DataFrame(dbscan_grid_results)

  0%|          | 0/19 [00:00<?, ?it/s]

In [107]:
# Display labels (German) for the two DBSCAN parameters that span the grid.
EPS_LABEL = 'Maximale Distanz'
MIN_SAMPLES_LABEL = 'Minimale Anzahl Punkte pro Cluster'

# Map the raw sweep column names to the labels shown in the figure.
# Columns that are already renamed are left untouched, so the cell can be re-run.
dbscan_scores = dbscan_scores.rename(columns={'eps': EPS_LABEL, 'min_samples': MIN_SAMPLES_LABEL,
                                              'silhouette_score': 'Silhouette Score',
                                              'num_cluster': 'Anzahl Cluster',
                                              'num_noise': 'Anzahl Rauschpunkte'})


def _metric_grid(scores, metric):
    """Return the mean of `metric` per (eps, min_samples) pair as an eps x min_samples table."""
    return pd.crosstab(index=scores[EPS_LABEL], columns=scores[MIN_SAMPLES_LABEL],
                       values=scores[metric], aggfunc='mean')


def _grid_heatmap(grid, colorscale):
    """Build a heatmap trace for one metric grid.

    The tick values are taken from the grid's own index/columns. pd.crosstab sorts
    those labels, whereas Series.unique() returns values in order of appearance, so
    reading the ticks from the grid keeps every cell attached to its real parameters.
    """
    return go.Heatmap(z=grid.to_numpy(), x=grid.columns.to_numpy(), y=grid.index.to_numpy(),
                      colorscale=colorscale, showscale=False)


silhouette_df = _metric_grid(dbscan_scores, 'Silhouette Score')
nnoise_df = _metric_grid(dbscan_scores, 'Anzahl Rauschpunkte')
ncluster_df = _metric_grid(dbscan_scores, 'Anzahl Cluster')

fig = make_subplots(rows=1, cols=3, subplot_titles=('Silhouette Score', 'Anzahl Rauschpunkte', 'Anzahl Cluster'))

# One panel per metric; the silhouette panel uses the non-reversed scale as before.
panels = [(silhouette_df, px.colors.sequential.RdBu),
          (nnoise_df, px.colors.sequential.RdBu_r),
          (ncluster_df, px.colors.sequential.RdBu_r)]
for panel_col, (grid, colorscale) in enumerate(panels, start=1):
    fig.add_trace(_grid_heatmap(grid, colorscale), row=1, col=panel_col)

# Label the axes so the panels can be read without looking at the code.
fig.update_xaxes(title_text=MIN_SAMPLES_LABEL)
fig.update_yaxes(title_text=EPS_LABEL, row=1, col=1)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

In [None]:
# Disabled cell (never executed in this notebook): line plots of the sweep metrics
# against eps for min_samples == 1, laid out on a 2x2 grid.
# NOTE(review): it reads a 'Calinski Harabasz Score' column that is not among the
# columns renamed in the heatmap cell - confirm the sweep still returns it before
# re-enabling, otherwise remove this cell.
# tmp = dbscan_scores[dbscan_scores['Minimale Anzahl Punkte pro Cluster']==1]

# fig = make_subplots(2, 2)
# fig.add_trace(go.Scatter(x=tmp['Maximale Distanz'], y=tmp['Silhouette Score'],
#                          name='Silhouette Score',
#                          line=dict(color='rgb(133, 92, 117)')), row=1, col=1)
# fig.add_trace(go.Scatter(x=tmp['Maximale Distanz'], y=tmp['Calinski Harabasz Score'],
#                          name='Calinski Harabasz Score',
#                          line=dict(color='rgb(217, 175, 107)')), row=1, col=2)
# fig.add_trace(go.Scatter(x=tmp['Maximale Distanz'], y=tmp['Anzahl Cluster'],
#                          name='Anzahl Cluster',
#                          line=dict(color='rgb(175, 100, 88)')), row=2, col=1)
# fig.add_trace(go.Scatter(x=tmp['Maximale Distanz'], y=tmp['Anzahl Rauschpunkte'],
#                          name='Anzahl Rauschpunkte',
#                          line=dict(color='rgb(115, 111, 76)')), row=2, col=2)

# fig.update_layout(template='simple_white',
#                   font=dict(family='Computer Modern', color='black', size=15))
# fig.show()

In [83]:
from sklearn import cluster, metrics

# Final DBSCAN parameters used for the clustering that is plotted and saved below.
DBSCAN_EPS = 0.95
DBSCAN_MIN_SAMPLES = 11

dbscan = cluster.DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(X_tsne)
labels = dbscan.labels_

# Noise points carry the label -1; select the clustered points with a boolean
# mask instead of round-tripping the embedding through a DataFrame of lists.
is_clustered = labels != -1
labels_clean = labels[is_clustered].tolist()
vectors_clean = X_tsne[is_clustered]

n_clusters = len(set(labels_clean))
n_noise = int((~is_clustered).sum())

# silhouette_score raises unless at least two clusters remain, so report NaN
# instead of aborting when the chosen parameters collapse everything.
if n_clusters > 1:
    silhouette_clean = metrics.silhouette_score(vectors_clean, labels_clean)
else:
    silhouette_clean = float('nan')

print(f'Silhouette Score w/o noise points: {silhouette_clean}')
print(f'Estimated number of clusters: {n_clusters}')
print(f'Estimated number of noise points: {n_noise}')
print(f'Noise in percent: {n_noise/len(labels)*100}%')

Silhouette Score w/o noise points: 0.6000892728070688
Estimated number of clusters: 513
Estimated number of noise points: 12150
Noise in percent: 47.98388689230283%


In [84]:
# Drop empty entries from the suggestion list before attaching it to the frame.
suggestions = [x for x in suggestions if x]

# Assemble one row per embedded suggestion: 2D coordinates, text, cluster label
# and the original vector. Every per-row column is attached BEFORE sorting:
# the columns are filled positionally, so adding `vector` after the sort would
# pair each suggestion with the vector of a different row.
output_df = pd.DataFrame(X_tsne, columns=['t-SNE(x)', 't-SNE(y)'])
output_df['suggestion'] = suggestions
output_df['cluster'] = labels
output_df['vector'] = list(vector_data)

# Sort on the numeric label first, then convert it to text so that the plots
# treat the cluster as a categorical value.
output_df = output_df.sort_values(by='cluster', ignore_index=True)
output_df['cluster'] = output_df['cluster'].apply(str)

# save output df
output_df.to_json('../../data/BTW17_Suggestions/suggestions/cluster.json')

In [85]:
# Capitalise the two columns that appear in the legend and the hover labels.
output_df = output_df.rename(columns={'cluster':'Cluster', 'suggestion':'Suggestion'})

# Scatter of the 2D embedding, one colour per DBSCAN cluster.
scatter_font = dict(family='Computer Modern', color='black', size=15)
fig = px.scatter(
    output_df,
    x='t-SNE(x)',
    y='t-SNE(y)',
    color='Cluster',
    hover_name='Suggestion',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font=scatter_font)
fig.show()

In [86]:
# Count the members of every cluster once and tabulate label vs. size.
cluster_counts = output_df['Cluster'].value_counts()
tmp = pd.DataFrame({'Cluster': cluster_counts.index, 'Clustergröße': cluster_counts.values})

# Box plot of the cluster sizes, leaving out the noise pseudo-cluster '-1'.
without_noise = tmp[tmp['Cluster']!='-1']
fig = px.box(
    without_noise,
    y='Clustergröße',
    points='all',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()