In [1]:
# import relevant modules
import pandas as pd
# show every column when a DataFrame is displayed (no truncation of wide frames)
pd.set_option('display.max_columns', None)
import glob
import warnings
# NOTE(review): this silences ALL warnings for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
import numpy as np
from sklearn import cluster
from sklearn.manifold import TSNE
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import sys
# extend the module search path so the two project-local imports below resolve
# (presumably querysuggestion.py and clustering.py live in ../scripts/ -- verify)
sys.path.append('../scripts/')
from querysuggestion import concat_suggestions, vectorize_suggestions
from clustering import kmeans_suggestions, dbscan_suggestions

In [2]:
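# One-off preprocessing, kept for reference: glob every CSV export ('*.csv' = all files)
# and concatenate the suggestions that fall inside the start/end date range.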
#path_to_csv = '../../data/BTW17_Suggestions/BTW_COMPLETE/*.csv'
#file_list = glob.glob(path_to_csv)

#start = '2017-05-29'
#end = '2017-10-09'
#suggestions_df = concat_suggestions(file_list, start, end)
#print(f'daterange: {suggestions_df["date"].min()}, {suggestions_df["date"].max()}')

In [3]:
# Load the concatenated suggestions from the parquet cache.
# If the cache does not exist yet (e.g. on a fresh checkout), rebuild it once
# from the raw CSV exports and store it, so the notebook can be run top to
# bottom without un-commenting any code.
RAW_CSV_GLOB = '../../data/BTW17_Suggestions/BTW_COMPLETE/*.csv'
SUGGESTIONS_PARQUET = '../../data/BTW17_Suggestions/processed/suggestions.parquet'
DATE_START = '2017-05-29'
DATE_END = '2017-10-09'

try:
    suggestions_df = pd.read_parquet(SUGGESTIONS_PARQUET)
except FileNotFoundError:
    # project helper: concat_suggestions(file_list, start, end)
    suggestions_df = concat_suggestions(glob.glob(RAW_CSV_GLOB), DATE_START, DATE_END)
    suggestions_df.to_parquet(SUGGESTIONS_PARQUET)
    print(f'daterange: {suggestions_df["date"].min()}, {suggestions_df["date"].max()}')

In [4]:
# split every suggestion on single blanks; non-string values are stringified first
suggestions_df['tokens'] = [
    str(raw_suggestion).split(' ')
    for raw_suggestion in suggestions_df['suggestion']
]
suggestions_df.head(3)

Unnamed: 0,date,queryterm,ranking,suggestion,tokens
0,2017-05-29 05:00:01,doris ahnen,1.0,ministerin,[ministerin]
1,2017-05-29 05:00:01,doris ahnen,2.0,http://de.wikipedia.org/wiki/Doris_Ahnen,[http://de.wikipedia.org/wiki/Doris_Ahnen]
2,2017-05-29 05:00:01,doris ahnen,3.0,kinder,[kinder]


In [5]:
# Turn the suggestions into vectors with the project helper imported from
# `querysuggestion`. The next cell treats `suggestions` as a list and
# `vector_data` as a 2-D float array with one row per suggestion
# -- TODO confirm against the helper's implementation.
suggestions, vector_data = vectorize_suggestions(suggestions_df)

  0%|          | 0/26274 [00:00<?, ?it/s]

In [6]:
# retrieve unique suggestions and their vectors
#
# After this cell, suggestions[i] is described by vector_data[i]: both
# containers are always filtered with the same mask, so they cannot drift apart.
vector_data = np.asarray(vector_data)

# Drop empty / None suggestions. If vector_data still carries one row per
# unfiltered suggestion, drop the matching rows as well to stay index-aligned.
has_text = np.array([bool(s) for s in suggestions], dtype=bool)
if len(vector_data) == len(has_text):
    vector_data = vector_data[has_text]
suggestions = [s for s in suggestions if s]
if len(vector_data) != len(suggestions):
    raise ValueError(
        f'{len(suggestions)} suggestions but {len(vector_data)} vectors: '
        'they cannot be aligned by position'
    )

# Keep only the LAST occurrence of every suggestion. This is what the former
# count()-based loop did, but found in one O(n) pass instead of O(n^2).
last_position = {s: i for i, s in enumerate(suggestions)}
is_last = np.array([last_position[s] == i for i, s in enumerate(suggestions)], dtype=bool)

# Rows containing NaN are removed as before, but now from BOTH containers
# (previously only vector_data lost them, shifting every later suggestion).
has_nan = np.isnan(vector_data).any(axis=1)
keep = is_last & ~has_nan

suggestions = [s for s, keep_row in zip(suggestions, keep) if keep_row]
vector_data = vector_data[keep]

In [7]:
# project the suggestion vectors onto two dimensions with t-SNE (fixed seed)
tsne_settings = {'n_components': 2, 'random_state': 1410}
tsne = TSNE(**tsne_settings)
X_tsne = tsne.fit_transform(vector_data)

In [8]:
# Enable IPython's autoreload extension (mode 2) and import the DBSCAN helper
# again before it is used in the next cell.
# NOTE(review): dbscan_suggestions is already imported in the first cell; the
# autoreload magics would normally sit in a configuration cell at the top.
%reload_ext autoreload
%autoreload 2
from clustering import dbscan_suggestions

In [9]:
# run the project's DBSCAN helper on the 2-D embedding and tabulate what it returns
dbscan_grid_results = dbscan_suggestions(X_tsne)
dbscan_scores = pd.DataFrame(dbscan_grid_results)

  0%|          | 0/19 [00:00<?, ?it/s]

In [10]:
# Heatmaps of the DBSCAN results: one panel per metric, eps on the y axis
# and min_samples on the x axis.

# Switch to the German display labels; the cells below rely on these names.
dbscan_scores = dbscan_scores.rename(columns={
    'eps': 'Maximale Distanz',
    'min_samples': 'Minimale Anzahl Punkte pro Cluster',
    'silhouette_score': 'Silhouette Score',
    'num_cluster': 'Anzahl Cluster',
    'num_noise': 'Anzahl Rauschpunkte',
})

EPS_LABEL = 'Maximale Distanz'
MIN_SAMPLES_LABEL = 'Minimale Anzahl Punkte pro Cluster'


def _pivot_metric(scores, metric):
    """Return an (eps x min_samples) table with the mean of `metric` per grid cell."""
    return pd.crosstab(index=scores[EPS_LABEL], columns=scores[MIN_SAMPLES_LABEL],
                       values=scores[metric], aggfunc='mean')


silhouette_df = _pivot_metric(dbscan_scores, 'Silhouette Score')
nnoise_df = _pivot_metric(dbscan_scores, 'Anzahl Rauschpunkte')
ncluster_df = _pivot_metric(dbscan_scores, 'Anzahl Cluster')

# one entry per panel: (table, title, colour scale, x position of the colour bar)
heatmap_panels = [
    (silhouette_df, 'Silhouette Score', px.colors.sequential.RdBu, 0.233),
    (nnoise_df, 'Anzahl Rauschpunkte', px.colors.sequential.RdBu_r, 0.618),
    (ncluster_df, 'Anzahl Cluster', px.colors.sequential.RdBu_r, 1),
]

fig = make_subplots(rows=1, cols=len(heatmap_panels),
                    subplot_titles=tuple(title for _, title, _, _ in heatmap_panels),
                    shared_yaxes=True, horizontal_spacing=0.15)

for col, (table, title, colorscale, colorbar_x) in enumerate(heatmap_panels, start=1):
    # Take the tick values from the table itself: crosstab sorts its index and
    # columns and may drop all-NaN rows/columns, so the order of appearance in
    # dbscan_scores is not guaranteed to line up with the rows/columns of z.
    fig.add_trace(go.Heatmap(z=table.to_numpy(),
                             x=table.columns.to_numpy(),
                             y=table.index.to_numpy(),
                             colorscale=colorscale, colorbar_x=colorbar_x, name=title),
                  row=1, col=col)
    fig.update_xaxes(title='Mindestanzahl Punkte pro Cluster', row=1, col=col)

fig.update_traces(hovertemplate='%{z}')
fig.update_annotations(font_size=18)

# the y axis is shared, so only the first panel gets a title
fig.update_yaxes(title='Epsilon', row=1, col=1)

fig.update_layout(font=dict(family='Computer Modern', color='black', size=15))
fig.show()

In [39]:
# Method 1: keep the configurations whose noise points make up less than 30 %
# of all points, then order them by the number of clusters (ascending)
noise_limit = len(X_tsne)*0.3
within_noise_limit = dbscan_scores['Anzahl Rauschpunkte'] < noise_limit
dbscan_scores.loc[within_noise_limit].sort_values(by='Anzahl Cluster')

Unnamed: 0,Maximale Distanz,Minimale Anzahl Punkte pro Cluster,Silhouette Score,Anzahl Cluster,Anzahl Rauschpunkte
200,0.95,7,0.413889,766,6217
189,0.9,7,0.468986,812,6985
199,0.95,6,0.358504,856,4878
188,0.9,6,0.418926,915,5524
198,0.95,5,0.313208,957,3698
177,0.85,6,0.468428,970,6231
187,0.9,5,0.367601,1036,4176
166,0.8,6,0.51058,1037,6938
176,0.85,5,0.413622,1127,4729
165,0.8,5,0.467995,1217,5270


In [38]:
# Method 2: keep the configurations with fewer than 200 clusters,
# then order them by the number of noise points (ascending)
few_clusters = dbscan_scores['Anzahl Cluster'].lt(200)
dbscan_scores.loc[few_clusters].sort_values(by='Anzahl Rauschpunkte')

Unnamed: 0,Maximale Distanz,Minimale Anzahl Punkte pro Cluster,Silhouette Score,Anzahl Cluster,Anzahl Rauschpunkte
175,0.80,15,0.807362,194,19063
152,0.70,14,0.838935,182,19852
164,0.75,15,0.835114,167,19894
140,0.65,13,0.855534,192,19927
141,0.65,14,0.852699,162,20468
...,...,...,...,...,...
21,0.10,15,0.998876,4,23911
7,0.05,12,0.999134,4,23924
8,0.05,13,0.999384,3,23938
9,0.05,14,1.000000,1,23964


In [43]:
# NOTE(review): `metrics` is only imported here; it belongs in the import cell.
from sklearn import cluster, metrics

# DBSCAN parameters of the final model. In the "Methode 1" table this is the
# configuration with the fewest clusters among those below 30 % noise.
DBSCAN_EPS = 0.95
DBSCAN_MIN_SAMPLES = 7

dbscan = cluster.DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(X_tsne)
labels = dbscan.labels_

# DBSCAN labels noise points with -1; mask them out directly instead of
# round-tripping labels and vectors through a temporary DataFrame.
is_clustered = labels != -1
labels_clean = labels[is_clustered].tolist()
vectors_clean = np.asarray(X_tsne)[is_clustered].astype(float)

n_clusters = len(set(labels_clean))
n_noise = int(np.count_nonzero(~is_clustered))

# silhouette_score needs at least 2 clusters and fewer clusters than samples;
# report NaN instead of raising when that is not the case.
if 2 <= n_clusters < len(labels_clean):
    silhouette = metrics.silhouette_score(vectors_clean, labels_clean)
else:
    silhouette = float('nan')

print(f'Silhouette Score w/o noise points: {silhouette}')
print(f'Estimated number of clusters: {n_clusters}')
print(f'Estimated number of noise points: {n_noise}')
print(f'Noise in percent: {n_noise/len(labels)*100}%')

Silhouette Score w/o noise points: 0.41388864515738666
Estimated number of clusters: 766
Estimated number of noise points: 6217
Noise in percent: 25.927933939444493%


In [44]:
# create output df and plot
#
# Every per-point column (coordinates, suggestion, cluster label, vector) is
# attached BEFORE sorting. Previously `vector` was assigned after the sort, so
# the unsorted rows of X_tsne were paired with the sorted rows and the saved
# JSON contained vectors that did not belong to their suggestion.
output_df = (
    pd.DataFrame(X_tsne, columns=['t-SNE(x)', 't-SNE(y)'])
    .assign(suggestion=suggestions, cluster=labels, vector=list(X_tsne))
    .sort_values(by='cluster', ignore_index=True)
)

# cluster ids become strings so that plotly treats them as discrete categories
output_df['cluster'] = output_df['cluster'].map(str)

# save output df
output_df.to_json('../../data/BTW17_Suggestions/suggestions/cluster.json')

In [45]:
# capitalised column names double as legend / hover labels in the figure
display_names = {'cluster':'Cluster', 'suggestion':'Suggestion'}
output_df.rename(columns=display_names, inplace=True)

# scatter plot of the 2-D embedding, one colour per cluster
scatter_style = dict(
    x='t-SNE(x)',
    y='t-SNE(y)',
    color='Cluster',
    hover_name='Suggestion',
    template='simple_white',
    color_discrete_sequence=px.colors.qualitative.Antique,
)
fig = px.scatter(output_df, **scatter_style)
fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()

In [46]:
# number of suggestions per cluster, as a two-column frame
cluster_sizes = output_df['Cluster'].value_counts()
tmp = pd.DataFrame({'Cluster': cluster_sizes.index,
                    'Clustergröße': cluster_sizes.values})

# box plot of the cluster sizes, leaving out the noise "cluster" (label '-1')
without_noise = tmp[tmp['Cluster']!='-1']
fig = px.box(without_noise, y='Clustergröße', points='all',
             template='simple_white',
             color_discrete_sequence=px.colors.qualitative.Antique)
fig.update_layout(font={'family': 'Computer Modern', 'color': 'black', 'size': 15})
fig.show()