In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
import lib
import plotting

In [4]:
from multiprocessing import Pool
import pickle
from collections import defaultdict

In [5]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabaz_score
from sklearn.decomposition import PCA

In [6]:
from configparser import ConfigParser
# Project settings (directories, core count, k-means options) come from an
# INI-style file in the working directory. ConfigParser.read returns the list
# of files it managed to parse, which is what this cell displays.
config_path = "config.cfg"
cfg = ConfigParser()
cfg.read(config_path)

['config.cfg']

In [7]:
# We explicitly pick the notebook flavour of tqdm because the plain console
# bar struggles with JupyterLab.
try:
    # Public location of the notebook progress bar in current tqdm releases.
    from tqdm.notebook import tqdm
except ImportError:
    # Older tqdm releases only provide the underscore-prefixed module
    # (deprecated in newer ones, hence only used as a fallback).
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas() # this enables us to use progress_apply instead of apply

In [8]:
# Both input pickles live in the configured exchange directory.
exchange_dir = cfg.get("directory", "exchange")
sequence_file = os.path.join(exchange_dir, "[wikidata]004_sequences.p")
labels_file = os.path.join(exchange_dir, "[wikidata]004_labels.p")
# Prefix put in front of every artefact this notebook writes.
results_prefix = "[WIKIDATA]"

In [9]:
# One row per user with the recorded action sequence.
df = pd.read_pickle(sequence_file)
# Only the 'label' column of the labels pickle is needed, as a plain array.
label_frame = pd.read_pickle(labels_file)
labels = label_frame['label'].values
df.head()

Unnamed: 0,length,sequence,user_id
0,1523,"[CLAIM_CREATE, CLAIM_CREATE, BREAK, DESCRIPTIO...",1
1,8,"[SITELINK_ADD, ENTITY_CREATE, BREAK, MERGE, BR...",1000036
2,7,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...",1000078
3,8,"[SITELINK_REMOVE, SITELINK_ADD, BREAK, SITELIN...",100008
4,35,"[SITELINK_ADD, SITELINK_ADD, BREAK, SITELINK_A...",100012


In [10]:
def calc_dist(sequence):
    """Return the distribution vector of one action sequence as a NumPy array.

    Calls ``lib.calc_distribution`` with the notebook-level ``labels`` and the
    given ``sequence``. That helper returns a pair; only its first element is
    kept here (presumably the stationary distribution, judging by the
    ``stat_dist`` column it feeds -- confirm in ``lib``).

    Defined at top level so that it can be sent to ``multiprocessing`` workers.
    """
    distribution, _ = lib.calc_distribution(labels, sequence)
    return np.array(distribution)

# Compute one distribution vector per user in parallel worker processes.
with Pool(cfg.getint("core", "num_cores")) as processor_pool:
    # The progress bar wraps the *result* iterator: the pool reads its input
    # ahead of the workers, so a bar around the input would show how many
    # tasks were dispatched rather than how many are finished.
    finished = tqdm(processor_pool.imap(calc_dist, df['sequence']), total=len(df))
    # imap yields results in input order. Reusing df's own index keeps the
    # column aligned row by row even if df is not indexed 0..n-1.
    # The results are collected inside the with-block, while the pool is alive.
    df['stat_dist'] = pd.Series(list(finished), index=df.index)

HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))




In [11]:
# Quick look at the frame, now including the new 'stat_dist' column.
df.head(n=5)

Unnamed: 0,length,sequence,user_id,stat_dist
0,1523,"[CLAIM_CREATE, CLAIM_CREATE, BREAK, DESCRIPTIO...",1,"[0.022328090570691267, 0.01999355101471073, 0...."
1,8,"[SITELINK_ADD, ENTITY_CREATE, BREAK, MERGE, BR...",1000036,"[0.020189392589955717, 0.020189392589955717, 0..."
2,7,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...",1000078,"[0.01947397679104996, 0.019473976791049966, 0...."
3,8,"[SITELINK_REMOVE, SITELINK_ADD, BREAK, SITELIN...",100008,"[0.020283553057881, 0.020283553057881005, 0.02..."
4,35,"[SITELINK_ADD, SITELINK_ADD, BREAK, SITELINK_A...",100012,"[0.019736842105263157, 0.019736842105263153, 0..."


In [12]:
# Cluster counts to try: both configured bounds are included, hence the +1.
clusters_min = cfg.getint("kmeans", "clusters_min")
clusters_max = cfg.getint("kmeans", "clusters_max")
kernel_range = np.arange(clusters_min, clusters_max + 1)
kernel_range

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])

In [13]:
# Turn the per-user arrays held in the 'stat_dist' column into one array that
# is used as the clustering input.
stat_dist_rows = [np.array(row) for row in df['stat_dist'].values]
vectors = np.array(stat_dist_rows)

# Three principal components, used only for plotting the clusters.
pca = PCA(n_components=3)
plot_vectors = pca.fit_transform(vectors)

In [15]:
# Fit k-means once per cluster count in kernel_range, score each clustering,
# store the labels as a new 'kmeans_<n>' column of df and hand everything to
# the plotting helper.

# Settings that do not change during the sweep are read a single time.
centroids_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_centroids.p")
load_centroids = cfg.getboolean("kmeans", "load_centroids") and os.path.isfile(centroids_file)
store_centroids = cfg.getboolean("kmeans", "store_centroids")
results_dir = cfg.get("directory", "results")

# Maps cluster count -> centre array. Stored centres are read once up front
# instead of once per cluster count.
# NOTE(security): pickle.load can run arbitrary code contained in the file;
# only point centroids_file at files written by this project.
centroids = {}
if load_centroids:
    with open(centroids_file, "rb") as dump_file:
        centroids = pickle.load(dump_file)

for num_centroids in tqdm(kernel_range):
    print("Num Centroids: {n}".format(n=num_centroids))
    if num_centroids in centroids:
        # Start from the stored centres. With explicit centres scikit-learn
        # performs a single run anyway; n_init=1 states that explicitly and
        # avoids the warning emitted when n_init is left at its default.
        kmeans = KMeans(n_clusters=num_centroids, init=centroids[num_centroids], n_init=1)
    else:
        # Nothing stored for this cluster count (loading disabled, or the
        # stored sweep did not cover it): use the default initialisation
        # instead of failing with a KeyError.
        kmeans = KMeans(n_clusters=num_centroids)

    kmeans.fit(vectors)

    lbl = kmeans.labels_
    centroids[num_centroids] = kmeans.cluster_centers_
    if store_centroids:
        # Written after every fit so an interrupted sweep keeps its progress.
        with open(centroids_file, "wb") as dump_file:
            pickle.dump(centroids, dump_file)

    # The average silhouette is the mean of the per-sample values, so the
    # expensive per-sample computation is done once and reused for both.
    sample_silhouette_values = silhouette_samples(vectors, lbl)
    silhouette_avg = sample_silhouette_values.mean()
    print("SILHOUETTE", silhouette_avg)
    # NOTE(review): newer scikit-learn releases spell this function
    # calinski_harabasz_score; the name here has to match the notebook's
    # import cell, so it is left unchanged.
    calinski_score = calinski_harabaz_score(vectors, lbl)
    print("CALINSKI", calinski_score)

    store_path = os.path.join(results_dir, results_prefix + "kmeans[{n}]".format(n=num_centroids))
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)
    df[cluster_lbl] = lbl
    plotting.k_means(plot_vectors, num_centroids, lbl, sample_silhouette_values, silhouette_avg, store_path)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

Num Centroids: 2


  return_n_iter=True)


SILHOUETTE 0.24465049950794304
CALINSKI 26715.340842608774
data/results/[WIKIDATA]kmeans[2]_silhouette.png
data/results/[WIKIDATA]kmeans[2]_silhouette.pdf
data/results/[WIKIDATA]kmeans[2]_clusters.png
data/results/[WIKIDATA]kmeans[2]_clusters.pdf
data/results/[WIKIDATA]kmeans[2]_population.png
data/results/[WIKIDATA]kmeans[2]_population.pdf
Num Centroids: 3


  return_n_iter=True)


SILHOUETTE 0.312124083339527
CALINSKI 26486.152975234585
data/results/[WIKIDATA]kmeans[3]_silhouette.png
data/results/[WIKIDATA]kmeans[3]_silhouette.pdf
data/results/[WIKIDATA]kmeans[3]_clusters.png
data/results/[WIKIDATA]kmeans[3]_clusters.pdf
data/results/[WIKIDATA]kmeans[3]_population.png
data/results/[WIKIDATA]kmeans[3]_population.pdf
Num Centroids: 4


  return_n_iter=True)


SILHOUETTE 0.32273459723275877
CALINSKI 26742.256911854598
data/results/[WIKIDATA]kmeans[4]_silhouette.png
data/results/[WIKIDATA]kmeans[4]_silhouette.pdf
data/results/[WIKIDATA]kmeans[4]_clusters.png
data/results/[WIKIDATA]kmeans[4]_clusters.pdf
data/results/[WIKIDATA]kmeans[4]_population.png
data/results/[WIKIDATA]kmeans[4]_population.pdf
Num Centroids: 5


  return_n_iter=True)


SILHOUETTE 0.3341999833138369
CALINSKI 24810.875062398143
data/results/[WIKIDATA]kmeans[5]_silhouette.png
data/results/[WIKIDATA]kmeans[5]_silhouette.pdf
data/results/[WIKIDATA]kmeans[5]_clusters.png
data/results/[WIKIDATA]kmeans[5]_clusters.pdf
data/results/[WIKIDATA]kmeans[5]_population.png
data/results/[WIKIDATA]kmeans[5]_population.pdf
Num Centroids: 6


  return_n_iter=True)


SILHOUETTE 0.35230396559897215
CALINSKI 24219.38328431312
data/results/[WIKIDATA]kmeans[6]_silhouette.png
data/results/[WIKIDATA]kmeans[6]_silhouette.pdf
data/results/[WIKIDATA]kmeans[6]_clusters.png
data/results/[WIKIDATA]kmeans[6]_clusters.pdf
data/results/[WIKIDATA]kmeans[6]_population.png
data/results/[WIKIDATA]kmeans[6]_population.pdf
Num Centroids: 7


  return_n_iter=True)


SILHOUETTE 0.3656274167431354
CALINSKI 24545.213558298172
data/results/[WIKIDATA]kmeans[7]_silhouette.png
data/results/[WIKIDATA]kmeans[7]_silhouette.pdf
data/results/[WIKIDATA]kmeans[7]_clusters.png
data/results/[WIKIDATA]kmeans[7]_clusters.pdf
data/results/[WIKIDATA]kmeans[7]_population.png
data/results/[WIKIDATA]kmeans[7]_population.pdf
Num Centroids: 8


  return_n_iter=True)


SILHOUETTE 0.3790315186931245
CALINSKI 24358.32844088437
data/results/[WIKIDATA]kmeans[8]_silhouette.png
data/results/[WIKIDATA]kmeans[8]_silhouette.pdf
data/results/[WIKIDATA]kmeans[8]_clusters.png
data/results/[WIKIDATA]kmeans[8]_clusters.pdf
data/results/[WIKIDATA]kmeans[8]_population.png
data/results/[WIKIDATA]kmeans[8]_population.pdf
Num Centroids: 9


  return_n_iter=True)


SILHOUETTE 0.3823363999691437
CALINSKI 25264.940019563463
data/results/[WIKIDATA]kmeans[9]_silhouette.png
data/results/[WIKIDATA]kmeans[9]_silhouette.pdf
data/results/[WIKIDATA]kmeans[9]_clusters.png
data/results/[WIKIDATA]kmeans[9]_clusters.pdf
data/results/[WIKIDATA]kmeans[9]_population.png
data/results/[WIKIDATA]kmeans[9]_population.pdf
Num Centroids: 10


  return_n_iter=True)


SILHOUETTE 0.34621812520400574
CALINSKI 24065.056742387133
data/results/[WIKIDATA]kmeans[10]_silhouette.png
data/results/[WIKIDATA]kmeans[10]_silhouette.pdf
data/results/[WIKIDATA]kmeans[10]_clusters.png
data/results/[WIKIDATA]kmeans[10]_clusters.pdf
data/results/[WIKIDATA]kmeans[10]_population.png
data/results/[WIKIDATA]kmeans[10]_population.pdf



In [None]:
# Persist the frame, including the cluster label columns added above, in the
# exchange directory.
clustering_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_clustering.p")
df.to_pickle(clustering_file)

In [18]:
# For every cluster count and every cluster in it: pool the action
# transitions of all member users and plot the cluster's transition matrix.

# A user's (from, to) pairs depend only on the action sequence, not on the
# clustering, so they are built once and reused for every cluster count.
user_transitions = []
for sequence in tqdm(df['sequence'], total=len(df)):
    action_df = pd.DataFrame({"from": sequence})
    # Pair each action with its successor inside the same user. The last
    # action of a user has no successor and gets NaN, so no transition ever
    # spans two users.
    action_df["to"] = action_df["from"].shift(periods=-1)
    user_transitions.append(action_df)

results_dir = cfg.get("directory", "results")

for num_centroids in kernel_range:
    # member_count is not read in this cell; it is kept in case later cells
    # rely on it.
    member_count = defaultdict(int)
    cluster = defaultdict(list)
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)

    # Group the prepared frames by the user's cluster id. tolist() yields
    # plain Python ints, matching what row-wise iteration over df produced.
    for cluster_id, action_df in zip(df[cluster_lbl].tolist(), user_transitions):
        member_count[cluster_id] += 1
        cluster[cluster_id].append(action_df)

    for cluster_index in cluster:
        cluster_df = pd.concat(cluster[cluster_index])

        # Absolute transition counts, passed through lib.stretch_pivot together with labels.
        transition_count_pivot = lib.stretch_pivot(pd.crosstab(cluster_df['from'], cluster_df['to']), labels)
        # For a DataFrame this is the sum over each row, i.e. the number of
        # transitions leaving each 'from' action.
        unnormalized = transition_count_pivot.transpose().sum().transpose()
        # Same table with every row normalised to sum to one.
        pivot = lib.stretch_pivot(pd.crosstab(cluster_df['from'], cluster_df['to'], normalize="index"), labels)

        store_path = os.path.join(results_dir, results_prefix + "clusters[{n}_{i}]".format(n=num_centroids, i=cluster_index))
        plotting.transition_matrix(unnormalized, pivot, transition_count_pivot, cluster_index, store_path)
        

HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))


sending to plot 2
data/results/[WIKIDATA]clusters[2_0].png
data/results/[WIKIDATA]clusters[2_0].pdf
plot done 2
sending to plot 2
data/results/[WIKIDATA]clusters[2_1].png
data/results/[WIKIDATA]clusters[2_1].pdf
plot done 2


HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))


sending to plot 3
data/results/[WIKIDATA]clusters[3_2].png
data/results/[WIKIDATA]clusters[3_2].pdf
plot done 3
sending to plot 3
data/results/[WIKIDATA]clusters[3_1].png
data/results/[WIKIDATA]clusters[3_1].pdf
plot done 3
sending to plot 3
data/results/[WIKIDATA]clusters[3_0].png
data/results/[WIKIDATA]clusters[3_0].pdf
plot done 3


HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))


sending to plot 4
data/results/[WIKIDATA]clusters[4_1].png
data/results/[WIKIDATA]clusters[4_1].pdf
plot done 4
sending to plot 4
data/results/[WIKIDATA]clusters[4_3].png
data/results/[WIKIDATA]clusters[4_3].pdf
plot done 4
sending to plot 4
data/results/[WIKIDATA]clusters[4_0].png
data/results/[WIKIDATA]clusters[4_0].pdf
plot done 4
sending to plot 4
data/results/[WIKIDATA]clusters[4_2].png
data/results/[WIKIDATA]clusters[4_2].pdf
plot done 4


HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))


sending to plot 5
data/results/[WIKIDATA]clusters[5_4].png
data/results/[WIKIDATA]clusters[5_4].pdf
plot done 5
sending to plot 5
data/results/[WIKIDATA]clusters[5_0].png
data/results/[WIKIDATA]clusters[5_0].pdf
plot done 5
sending to plot 5
data/results/[WIKIDATA]clusters[5_1].png
data/results/[WIKIDATA]clusters[5_1].pdf
plot done 5
sending to plot 5
data/results/[WIKIDATA]clusters[5_3].png
data/results/[WIKIDATA]clusters[5_3].pdf
plot done 5
sending to plot 5
data/results/[WIKIDATA]clusters[5_2].png
data/results/[WIKIDATA]clusters[5_2].pdf
plot done 5


HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))


sending to plot 6
data/results/[WIKIDATA]clusters[6_3].png
data/results/[WIKIDATA]clusters[6_3].pdf
plot done 6
sending to plot 6
data/results/[WIKIDATA]clusters[6_2].png
data/results/[WIKIDATA]clusters[6_2].pdf
plot done 6
sending to plot 6
data/results/[WIKIDATA]clusters[6_1].png
data/results/[WIKIDATA]clusters[6_1].pdf
plot done 6
sending to plot 6
data/results/[WIKIDATA]clusters[6_5].png
data/results/[WIKIDATA]clusters[6_5].pdf
plot done 6
sending to plot 6
data/results/[WIKIDATA]clusters[6_4].png
data/results/[WIKIDATA]clusters[6_4].pdf
plot done 6
sending to plot 6
data/results/[WIKIDATA]clusters[6_0].png
data/results/[WIKIDATA]clusters[6_0].pdf
plot done 6


HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))


sending to plot 7
data/results/[WIKIDATA]clusters[7_5].png
data/results/[WIKIDATA]clusters[7_5].pdf
plot done 7
sending to plot 7
data/results/[WIKIDATA]clusters[7_3].png
data/results/[WIKIDATA]clusters[7_3].pdf
plot done 7
sending to plot 7
data/results/[WIKIDATA]clusters[7_4].png
data/results/[WIKIDATA]clusters[7_4].pdf
plot done 7
sending to plot 7
data/results/[WIKIDATA]clusters[7_0].png
data/results/[WIKIDATA]clusters[7_0].pdf
plot done 7
sending to plot 7
data/results/[WIKIDATA]clusters[7_2].png
data/results/[WIKIDATA]clusters[7_2].pdf
plot done 7
sending to plot 7
data/results/[WIKIDATA]clusters[7_1].png
data/results/[WIKIDATA]clusters[7_1].pdf
plot done 7
sending to plot 7
data/results/[WIKIDATA]clusters[7_6].png
data/results/[WIKIDATA]clusters[7_6].pdf
plot done 7


HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))


sending to plot 8
data/results/[WIKIDATA]clusters[8_6].png
data/results/[WIKIDATA]clusters[8_6].pdf
plot done 8
sending to plot 8
data/results/[WIKIDATA]clusters[8_4].png
data/results/[WIKIDATA]clusters[8_4].pdf
plot done 8
sending to plot 8
data/results/[WIKIDATA]clusters[8_0].png
data/results/[WIKIDATA]clusters[8_0].pdf
plot done 8
sending to plot 8
data/results/[WIKIDATA]clusters[8_3].png
data/results/[WIKIDATA]clusters[8_3].pdf
plot done 8
sending to plot 8
data/results/[WIKIDATA]clusters[8_2].png
data/results/[WIKIDATA]clusters[8_2].pdf
plot done 8
sending to plot 8
data/results/[WIKIDATA]clusters[8_7].png
data/results/[WIKIDATA]clusters[8_7].pdf
plot done 8
sending to plot 8
data/results/[WIKIDATA]clusters[8_1].png
data/results/[WIKIDATA]clusters[8_1].pdf
plot done 8
sending to plot 8
data/results/[WIKIDATA]clusters[8_5].png
data/results/[WIKIDATA]clusters[8_5].pdf
plot done 8


HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))


sending to plot 9
data/results/[WIKIDATA]clusters[9_3].png
data/results/[WIKIDATA]clusters[9_3].pdf
plot done 9
sending to plot 9
data/results/[WIKIDATA]clusters[9_1].png
data/results/[WIKIDATA]clusters[9_1].pdf
plot done 9
sending to plot 9
data/results/[WIKIDATA]clusters[9_2].png
data/results/[WIKIDATA]clusters[9_2].pdf
plot done 9
sending to plot 9
data/results/[WIKIDATA]clusters[9_7].png
data/results/[WIKIDATA]clusters[9_7].pdf
plot done 9
sending to plot 9
data/results/[WIKIDATA]clusters[9_0].png
data/results/[WIKIDATA]clusters[9_0].pdf
plot done 9
sending to plot 9
data/results/[WIKIDATA]clusters[9_5].png
data/results/[WIKIDATA]clusters[9_5].pdf
plot done 9
sending to plot 9
data/results/[WIKIDATA]clusters[9_6].png
data/results/[WIKIDATA]clusters[9_6].pdf
plot done 9
sending to plot 9
data/results/[WIKIDATA]clusters[9_8].png
data/results/[WIKIDATA]clusters[9_8].pdf
plot done 9
sending to plot 9
data/results/[WIKIDATA]clusters[9_4].png
data/results/[WIKIDATA]clusters[9_4].pdf
plo

HBox(children=(IntProgress(value=0, max=87480), HTML(value='')))


sending to plot 10
data/results/[WIKIDATA]clusters[10_2].png
data/results/[WIKIDATA]clusters[10_2].pdf
plot done 10
sending to plot 10
data/results/[WIKIDATA]clusters[10_4].png
data/results/[WIKIDATA]clusters[10_4].pdf
plot done 10
sending to plot 10
data/results/[WIKIDATA]clusters[10_1].png
data/results/[WIKIDATA]clusters[10_1].pdf
plot done 10
sending to plot 10
data/results/[WIKIDATA]clusters[10_3].png
data/results/[WIKIDATA]clusters[10_3].pdf
plot done 10
sending to plot 10
data/results/[WIKIDATA]clusters[10_0].png
data/results/[WIKIDATA]clusters[10_0].pdf
plot done 10
sending to plot 10
data/results/[WIKIDATA]clusters[10_8].png
data/results/[WIKIDATA]clusters[10_8].pdf
plot done 10
sending to plot 10
data/results/[WIKIDATA]clusters[10_6].png
data/results/[WIKIDATA]clusters[10_6].pdf
plot done 10
sending to plot 10
data/results/[WIKIDATA]clusters[10_5].png
data/results/[WIKIDATA]clusters[10_5].pdf
plot done 10
sending to plot 10
data/results/[WIKIDATA]clusters[10_7].png
data/resul