In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
import lib
import plotting

In [4]:
from multiprocessing import Pool
import pickle
from collections import defaultdict

In [5]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabaz_score
from sklearn.decomposition import PCA

In [6]:
from configparser import ConfigParser
cfg = ConfigParser()
# ConfigParser.read() does not raise for files it cannot open; it just leaves
# them out of the returned list. Check the result so that a missing config is
# reported here instead of as a missing section/option in a later cell.
loaded_configs = cfg.read("config.cfg")
if not loaded_configs:
    raise FileNotFoundError("config.cfg could not be read from the current working directory")
# Display the list of config files that were actually parsed.
loaded_configs

['config.cfg']

In [7]:
# The notebook progress bar is imported by hand from its submodule because the
# regular tqdm entry point struggles with JupyterLab.
# NOTE(review): `tqdm._tqdm_notebook` is an underscore-prefixed (private) module
# path -- confirm that the installed tqdm version still provides it.
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
# Enables `progress_apply` as a progress-reporting replacement for `apply`.
tqdm.pandas()

In [8]:
# Prefix prepended to every artefact this notebook writes.
results_prefix = cfg.get("results", "prefix")

# All pickled inputs live in the configured exchange directory.
exchange_dir = cfg.get("directory", "exchange")
sequence_file = os.path.join(exchange_dir, "[wikidata]004_sequences.p")
labels_file = os.path.join(exchange_dir, "[wikidata]004_labels_readable.p")
users_file = os.path.join(exchange_dir, "[wikidata]004_users.p")

In [9]:
# Per-user edit sequences.
df = pd.read_pickle(sequence_file)
# Only the 'label' column of the labels pickle is needed, as a NumPy array.
labels_frame = pd.read_pickle(labels_file)
labels = labels_frame['label'].values
df.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name
0,False,False,1517,1385,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...","[Revert Item, Break, Create Claim, Break, Crea...",1,Hoo man
1,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_ADD, BREAK, ME...","[Create Item, Break, Add Sitelink, Break, Merg...",1000036,JShenk
2,False,False,7,4,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",1000078,Egor-belikov
3,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_REMOVE, SITELI...","[Create Item, Break, Remove Sitelink, Add Site...",100008,Wars
4,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",100012,Kane 14


In [10]:
labels

array(['Add Description', 'Add Item Alias', 'Add Label', 'Add Qualifier',
       'Add Reference', 'Add Sitelink', 'Break', 'Create Claim',
       'Create Item', 'Create Property', 'Edit Alias', 'Edit Claim',
       'Edit Claim Value', 'Edit Description', 'Edit Item', 'Edit Label',
       'Edit Qualifier', 'Edit Reference', 'Edit Sitelink', 'Merge Items',
       'Override Item', 'Protect Item', 'Redirect Item', 'Remove Alias',
       'Remove Claim', 'Remove Description', 'Remove Item',
       'Remove Label', 'Remove Qualifier', 'Remove Reference',
       'Remove Sitelink', 'Revert Item'], dtype=object)

In [113]:
# Cluster counts (k) to evaluate.
# An explicit comma-separated list in [kmeans] clusters_range takes precedence.
# The contiguous clusters_min..clusters_max range (inclusive) is only used when
# no explicit list is configured; previously it was always computed and then
# immediately overwritten.
if cfg.has_option("kmeans", "clusters_range"):
    kernel_range = np.array([int(x) for x in cfg.get("kmeans", "clusters_range").split(",")])
else:
    kernel_range = np.arange(cfg.getint("kmeans", "clusters_min"), cfg.getint("kmeans", "clusters_max") + 1)
kernel_range

array([4, 8])

In [12]:
# Drop users with too few changes. This threshold is separate from the one
# applied during preprocessing.
print("Len unfiltered: {n}".format(n=len(df)))
min_num_changes = cfg.getint("kmeans", "min_num_changes")
has_enough_changes = df["length_nobreak"] >= min_num_changes
df = df.loc[has_enough_changes].reset_index(drop=True)
print("Len filtered: {n}".format(n=len(df)))

Len unfiltered: 88148
Len filtered: 38429


In [13]:
def calc_dist(sequence):
    """Return the distribution of one sequence as a NumPy array.

    Calls ``lib.calc_distribution`` with a copy of the module-level ``labels``
    array and the given sequence. That helper returns a pair; only its first
    element is kept and converted with ``np.array``.
    """
    distribution, _pivot = lib.calc_distribution(labels.copy(), sequence)
    return np.array(distribution)

# Compute one distribution vector per user in parallel.
# Pool.map() turns its input into a list before any work is done, so a progress
# bar wrapped around the *input* only tracks task submission. Wrapping the lazy,
# order-preserving imap() result iterator makes the bar advance as results
# actually arrive.
sequences = df['sequence_readable']
num_cores = cfg.getint("core", "num_cores")
# imap() defaults to chunksize=1; use the same ~4-chunks-per-worker batching
# that map() applies so inter-process overhead stays comparable.
chunksize = max(1, len(sequences) // max(1, num_cores * 4))
with Pool(num_cores) as processor_pool:
    results = processor_pool.imap(calc_dist, sequences, chunksize=chunksize)
    # Consume the iterator inside the `with` block: leaving it terminates the pool.
    df['stat_dist'] = list(tqdm(results, total=len(sequences)))

HBox(children=(IntProgress(value=0, max=38429), HTML(value='')))




In [14]:
df.tail()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist
38424,False,False,555,493,"[CLAIM_CREATE, CLAIM_UPDATE, CLAIM_CREATE, CLA...","[Create Claim, Edit Claim, Create Claim, Creat...",999054,Acebarry,"[0.026173285198555985, 0.02586206896551725, 0...."
38425,False,False,83,44,"[ENTITY_CREATE, BREAK, ENTITY_CREATE, SITELINK...","[Create Item, Break, Create Item, Edit Sitelin...",99933,Biagio2103,"[0.025903359039922972, 0.02590335903992298, 0...."
38426,False,False,85,50,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",99995,Gyimhu,"[0.027914614121510667, 0.025862068965517244, 0..."
38427,False,False,30,25,"[DESCRIPTION_ADD, DESCRIPTION_ADD, DESCRIPTION...","[Add Description, Add Description, Add Descrip...",999951,Mongrangvebet,"[0.1447681331747919, 0.025862068965517238, 0.0..."
38428,False,False,24,14,"[SITELINK_UPDATE, BREAK, SITELINK_UPDATE, BREA...","[Edit Sitelink, Break, Edit Sitelink, Break, E...",999994,Alena Pokorná,"[0.025961231375193083, 0.025961231375193083, 0..."


In [119]:
df['length'].mean()

1986.6067032709673

In [15]:
# Sanity check: select the rows whose `stat_dist` entry is NA.
df.loc[pd.isna(df['stat_dist'])]

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist


In [16]:
# Stack the per-user distribution vectors into one NumPy array.
vectors = np.array([np.array(x) for x in df['stat_dist'].values]) # ensure we have the data in numpy format
# Project onto three principal components; the projection is passed to
# plotting.k_means further down, while clustering runs on the full vectors.
# Depending on the scikit-learn version and the data shape, PCA's default solver
# may be a randomized one, so pass the configured seed to keep figures
# reproducible. Without a configured seed, fallback=None keeps PCA's default
# unseeded behaviour.
pca_seed = cfg.getint("kmeans", "random_state", fallback=None)
pca = PCA(n_components=3, random_state=pca_seed)
plot_vectors = pca.fit_transform(vectors)
plot_variance_ratios = pca.explained_variance_ratio_
# Cumulative share of variance captured by the first 1, 2 and 3 components.
print(plot_variance_ratios.cumsum())

[0.41699852 0.59538204 0.72375929]


In [85]:
# Fit one k-means model per k in `kernel_range`, score it, store the cluster
# assignment as column "kmeans_<k>" in `df`, and plot the result.
centroids_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_centroids.p")
load_centroids = cfg.getboolean("kmeans", "load_centroids") and os.path.isfile(centroids_file)

# Read the cached centroids once, before the loop. Re-reading the file on every
# iteration replaced `centroids` each time and thereby dropped the centres
# fitted in earlier iterations.
stored_centroids = {}
if load_centroids:
    # NOTE: unpickling can execute arbitrary code; only load centroid files
    # that were written by this notebook.
    with open(centroids_file, "rb") as dump_file:
        stored_centroids = pickle.load(dump_file)

# Start from the cached entries so that values of k outside `kernel_range`
# survive when the file is written back below.
centroids = dict(stored_centroids)

for num_centroids in tqdm(kernel_range):
    print("Num Centroids: {n}".format(n=num_centroids))
    if num_centroids in stored_centroids:
        # Explicit initial centres allow only a single initialisation; stating
        # n_init=1 avoids the warning scikit-learn emits when it has to
        # override the default n_init.
        kmeans = KMeans(n_clusters=num_centroids, init=stored_centroids[num_centroids], n_init=1)
    else:
        # No cached centres for this k (loading disabled, or k missing from the
        # cache): fit from scratch with the configured seed.
        kmeans = KMeans(n_clusters=num_centroids, random_state=cfg.getint("kmeans", "random_state"))

    lbl = kmeans.fit_predict(vectors)
    centroids[num_centroids] = kmeans.cluster_centers_

    silhouette_avg = silhouette_score(vectors, lbl)
    print("Average Silhouette Coefficience: {s}".format(s=silhouette_avg))
    calinski_score = calinski_harabaz_score(vectors, lbl)
    print("Calinksi-Harabaz Score: {s}".format(s=calinski_score))
    sample_silhouette_values = silhouette_samples(vectors, lbl)

    store_path = os.path.join(cfg.get("directory", "results"), results_prefix + "kmeans[{n}]".format(n=num_centroids))
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)
    df[cluster_lbl] = lbl
    plotting.k_means(plot_vectors, plot_variance_ratios, num_centroids, lbl, sample_silhouette_values, silhouette_avg, store_path)

# Optionally persist the centres so later runs can reproduce these clusters.
if cfg.getboolean("kmeans", "store_centroids"):
    with open(centroids_file, "wb") as dump_file:
        pickle.dump(centroids, dump_file)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

Num Centroids: 2


  return_n_iter=True)


Average Silhouette Coefficience: 0.33872547867114816
Calinksi-Harabaz Score: 18349.84667396173
data/results/[WIKIDATA_10]kmeans[2]_silhouette.png
data/results/[WIKIDATA_10]kmeans[2]_silhouette.pdf
data/results/[WIKIDATA_10]kmeans[2]_clusters.png
data/results/[WIKIDATA_10]kmeans[2]_clusters.pdf
data/results/[WIKIDATA_10]kmeans[2]_clusters_planar.png
data/results/[WIKIDATA_10]kmeans[2]_clusters_planar.pdf
data/results/[WIKIDATA_10]kmeans[2]_population.png
data/results/[WIKIDATA_10]kmeans[2]_population.pdf
Num Centroids: 3


  return_n_iter=True)


Average Silhouette Coefficience: 0.3213767432181204
Calinksi-Harabaz Score: 17168.74183263665
data/results/[WIKIDATA_10]kmeans[3]_silhouette.png
data/results/[WIKIDATA_10]kmeans[3]_silhouette.pdf
data/results/[WIKIDATA_10]kmeans[3]_clusters.png
data/results/[WIKIDATA_10]kmeans[3]_clusters.pdf
data/results/[WIKIDATA_10]kmeans[3]_clusters_planar.png
data/results/[WIKIDATA_10]kmeans[3]_clusters_planar.pdf
data/results/[WIKIDATA_10]kmeans[3]_population.png
data/results/[WIKIDATA_10]kmeans[3]_population.pdf
Num Centroids: 4


  return_n_iter=True)


Average Silhouette Coefficience: 0.3630671646489982
Calinksi-Harabaz Score: 17143.601574092856
data/results/[WIKIDATA_10]kmeans[4]_silhouette.png
data/results/[WIKIDATA_10]kmeans[4]_silhouette.pdf
data/results/[WIKIDATA_10]kmeans[4]_clusters.png
data/results/[WIKIDATA_10]kmeans[4]_clusters.pdf
data/results/[WIKIDATA_10]kmeans[4]_clusters_planar.png
data/results/[WIKIDATA_10]kmeans[4]_clusters_planar.pdf
data/results/[WIKIDATA_10]kmeans[4]_population.png
data/results/[WIKIDATA_10]kmeans[4]_population.pdf
Num Centroids: 5


  return_n_iter=True)


Average Silhouette Coefficience: 0.355519547705662
Calinksi-Harabaz Score: 15709.819307575593
data/results/[WIKIDATA_10]kmeans[5]_silhouette.png
data/results/[WIKIDATA_10]kmeans[5]_silhouette.pdf
data/results/[WIKIDATA_10]kmeans[5]_clusters.png
data/results/[WIKIDATA_10]kmeans[5]_clusters.pdf
data/results/[WIKIDATA_10]kmeans[5]_clusters_planar.png
data/results/[WIKIDATA_10]kmeans[5]_clusters_planar.pdf
data/results/[WIKIDATA_10]kmeans[5]_population.png
data/results/[WIKIDATA_10]kmeans[5]_population.pdf
Num Centroids: 6


  return_n_iter=True)


Average Silhouette Coefficience: 0.288789152558481
Calinksi-Harabaz Score: 14545.207617016478
data/results/[WIKIDATA_10]kmeans[6]_silhouette.png
data/results/[WIKIDATA_10]kmeans[6]_silhouette.pdf
data/results/[WIKIDATA_10]kmeans[6]_clusters.png
data/results/[WIKIDATA_10]kmeans[6]_clusters.pdf
data/results/[WIKIDATA_10]kmeans[6]_clusters_planar.png
data/results/[WIKIDATA_10]kmeans[6]_clusters_planar.pdf
data/results/[WIKIDATA_10]kmeans[6]_population.png
data/results/[WIKIDATA_10]kmeans[6]_population.pdf
Num Centroids: 7


  return_n_iter=True)


Average Silhouette Coefficience: 0.2915417479788477
Calinksi-Harabaz Score: 13472.952817917747
data/results/[WIKIDATA_10]kmeans[7]_silhouette.png
data/results/[WIKIDATA_10]kmeans[7]_silhouette.pdf
data/results/[WIKIDATA_10]kmeans[7]_clusters.png
data/results/[WIKIDATA_10]kmeans[7]_clusters.pdf
data/results/[WIKIDATA_10]kmeans[7]_clusters_planar.png
data/results/[WIKIDATA_10]kmeans[7]_clusters_planar.pdf
data/results/[WIKIDATA_10]kmeans[7]_population.png
data/results/[WIKIDATA_10]kmeans[7]_population.pdf
Num Centroids: 8


  return_n_iter=True)


Average Silhouette Coefficience: 0.2938757616138862
Calinksi-Harabaz Score: 12778.008198029946
data/results/[WIKIDATA_10]kmeans[8]_silhouette.png
data/results/[WIKIDATA_10]kmeans[8]_silhouette.pdf
data/results/[WIKIDATA_10]kmeans[8]_clusters.png
data/results/[WIKIDATA_10]kmeans[8]_clusters.pdf
data/results/[WIKIDATA_10]kmeans[8]_clusters_planar.png
data/results/[WIKIDATA_10]kmeans[8]_clusters_planar.pdf
data/results/[WIKIDATA_10]kmeans[8]_population.png
data/results/[WIKIDATA_10]kmeans[8]_population.pdf
Num Centroids: 9


  return_n_iter=True)


Average Silhouette Coefficience: 0.29066016614284335
Calinksi-Harabaz Score: 12347.531379953112
data/results/[WIKIDATA_10]kmeans[9]_silhouette.png
data/results/[WIKIDATA_10]kmeans[9]_silhouette.pdf
data/results/[WIKIDATA_10]kmeans[9]_clusters.png
data/results/[WIKIDATA_10]kmeans[9]_clusters.pdf
data/results/[WIKIDATA_10]kmeans[9]_clusters_planar.png
data/results/[WIKIDATA_10]kmeans[9]_clusters_planar.pdf
data/results/[WIKIDATA_10]kmeans[9]_population.png
data/results/[WIKIDATA_10]kmeans[9]_population.pdf
Num Centroids: 10


  return_n_iter=True)


Average Silhouette Coefficience: 0.294101342255147
Calinksi-Harabaz Score: 11947.400051660221
data/results/[WIKIDATA_10]kmeans[10]_silhouette.png
data/results/[WIKIDATA_10]kmeans[10]_silhouette.pdf
data/results/[WIKIDATA_10]kmeans[10]_clusters.png
data/results/[WIKIDATA_10]kmeans[10]_clusters.pdf
data/results/[WIKIDATA_10]kmeans[10]_clusters_planar.png
data/results/[WIKIDATA_10]kmeans[10]_clusters_planar.pdf
data/results/[WIKIDATA_10]kmeans[10]_population.png
data/results/[WIKIDATA_10]kmeans[10]_population.pdf


In [18]:
# Persist the frame, including the kmeans_<k> columns, in the exchange directory.
clustering_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_clustering.p")
df.to_pickle(clustering_file)

In [19]:
# Entry point for re-runs: load the stored clustering here to skip the
# calculations above if they have already been done.
clustering_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_clustering.p")
df = pd.read_pickle(clustering_file)

In [120]:
# Hand-picked subset of action labels used for the reduced ("top") figures.
# Two earlier candidate lists were assigned here and overwritten straight away;
# only the selection that actually takes effect is kept.
labels_filtered = [
    "Break",
    "Create Item",
    "Merge Items",
    "Create Claim",
    "Edit Claim",
    "Remove Claim",
    "Add Reference",
    "Add Label",
    "Add Description",
    "Add Sitelink",
    "Edit Sitelink",
]
labels_filtered

['Break',
 'Create Item',
 'Merge Items',
 'Create Claim',
 'Edit Claim',
 'Remove Claim',
 'Add Reference',
 'Add Label',
 'Add Description',
 'Add Sitelink',
 'Edit Sitelink']

In [121]:
df.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist,kmeans_4,kmeans_8,kmeans_2,kmeans_3,kmeans_5,kmeans_6,kmeans_7,kmeans_9,kmeans_10
0,False,False,1517,1385,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...","[Revert Item, Break, Create Claim, Break, Crea...",1,Hoo man,"[0.033592832488441825, 0.029154644609674315, 0...",0,3,0,0,3,3,1,0,0
1,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",100012,Kane 14,"[0.02586206896551723, 0.025862068965517227, 0....",2,1,1,1,1,0,5,1,1
2,False,False,17,12,"[LABEL_UPDATE, ALIAS_ADD, BREAK, CLAIM_CREATE,...","[Edit Label, Add Item Alias, Break, Create Cla...",1000197,Zaizone,"[0.026205469193709592, 0.07779210211701826, 0....",1,5,1,2,4,1,0,6,2
3,False,False,91,82,"[SITELINK_UPDATE, BREAK, LABEL_ADD, CLAIM_CREA...","[Edit Sitelink, Break, Add Label, Create Claim...",1001159,Gogo hr,"[0.04118566424353361, 0.03162035177772598, 0.0...",0,0,0,0,3,4,6,7,4
4,False,False,21,12,"[DESCRIPTION_ADD, BREAK, DESCRIPTION_UPDATE, D...","[Add Description, Break, Edit Description, Add...",1001194,Phylosofo,"[0.08620689655172406, 0.025862068965517255, 0....",1,7,1,2,0,5,4,8,5


In [122]:
# Separator token from the "preprocessing" config section. Further down it is
# appended to every user's sequence before the sequences of a cluster are
# concatenated.
sep_label = cfg.get("preprocessing", "separator_label")
sep_label

'SEP'

In [123]:
kernel_range

array([4, 8])

In [132]:
# For every k in `kernel_range` and every cluster of that k:
#   * build the action-to-action transition counts over all member sequences,
#   * plot them for the full and the reduced label set, log-scaled and linear,
#   * collect mean/std of the members' distribution vectors for the centroid plot.

# (name used in file names, labels to show, figure size passed to the plot)
label_sets = [("full", labels, (46, 60)), ("top", labels_filtered, (16, 22))]
# (file-name suffix, logscale flag); the log-scaled figure is written first.
scale_variants = [("_log", True), ("", False)]
results_dir = cfg.get("directory", "results")

for num_centroids in kernel_range:
    # NOTE: this rebinds the notebook-level name `centroids`; here it maps
    # cluster id -> (mean, std) of the members' distribution vectors.
    centroids = dict()
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)
    print("Current k: {k}".format(k=num_centroids))

    for cluster_id, cluster_members in tqdm(df.groupby(cluster_lbl)):
        # One long action stream for the cluster. The separator appended to each
        # sequence keeps the last action of one user from being counted as
        # transitioning directly into the first action of the next user.
        actions = np.concatenate([
            np.concatenate((seq, [sep_label]))
            for seq in cluster_members["sequence_readable"]
        ])

        # Per-label mean and standard deviation across the cluster members.
        stat_dists_df = pd.DataFrame(list(cluster_members["stat_dist"]), columns=labels)
        stat_dist_mean = stat_dists_df.mean()
        stat_dist_std = stat_dists_df.std()
        # Independent of the label set, so computed once per cluster.
        centroids[cluster_id] = (stat_dist_mean[labels].values, stat_dist_std[labels].values)

        # Pair every action with its successor in the stream.
        action_df = pd.DataFrame({'from': actions})
        action_df['to'] = action_df['from'].shift(periods=-1)

        # Absolute transition counts and row-normalised transition frequencies.
        pivot = pd.crosstab(action_df['from'], action_df['to'])
        pivot_norm = pd.crosstab(action_df['from'], action_df['to'], normalize="index")

        for lbl_set_name, lbl_set, plot_size in label_sets:
            # Copies are handed over because lib.stretch_pivot is opaque from
            # here and might modify its argument.
            pivot_full = lib.stretch_pivot(pivot.copy(), lbl_set)
            pivot_full_norm = lib.stretch_pivot(pivot_norm.copy(), lbl_set)
            # Row totals of the count matrix.
            counts = pivot_full.transpose().sum().transpose()
            # The third element (0.15) is passed through unchanged; its meaning
            # is defined by plotting.transition_matrix.
            stat_dist = (stat_dist_mean[lbl_set].values, stat_dist_std[lbl_set].values, 0.15)

            base_name = "[kmeans]clusters[{n}_{i}][{s}]".format(n=num_centroids, i=cluster_id, s=lbl_set_name)
            for suffix, logscale in scale_variants:
                store_path = os.path.join(results_dir, results_prefix + base_name + suffix)
                plotting.transition_matrix(plot_size, counts, pivot_full_norm, None, cluster_id, store_path, upper_limit=False, logscale=logscale, stat_dist=stat_dist)

    store_path = os.path.join(results_dir, results_prefix + "[kmeans]clusters[{n}]centroids".format(n=num_centroids))
    plotting.cluster_centroids(centroids, labels, store_path)

Current k: 4


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))



data/results/[WIKIDATA_10][kmeans]clusters[4_0][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[4_0][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_0][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_0][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_0][full].png
data/results/[WIKIDATA_10][kmeans]clusters[4_0][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_0][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_0][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_0][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[4_0][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_0][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_0][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_0][top].png
data/results/[WIKIDATA_10][kmeans]clusters[4_0][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_0][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_0][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_1][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[4_1][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_1][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_1][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_1][full].png
data/results/[WIKIDATA_10][kmeans]clusters[4_1][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_1][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_1][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_1][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[4_1][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_1][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_1][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_1][top].png
data/results/[WIKIDATA_10][kmeans]clusters[4_1][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_1][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_1][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_2][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[4_2][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_2][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_2][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_2][full].png
data/results/[WIKIDATA_10][kmeans]clusters[4_2][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_2][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_2][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_2][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[4_2][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_2][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_2][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_2][top].png
data/results/[WIKIDATA_10][kmeans]clusters[4_2][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_2][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_2][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_3][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[4_3][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_3][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_3][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_3][full].png
data/results/[WIKIDATA_10][kmeans]clusters[4_3][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_3][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_3][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_3][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[4_3][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_3][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_3][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[4_3][top].png
data/results/[WIKIDATA_10][kmeans]clusters[4_3][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[4_3][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[4_3][top]_trim.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4]centroids.png
data/results/[WIKIDATA_10][kmeans]clusters[4]centroids.pdf
data/results/[WIKIDATA_10][kmeans]clusters[4]centroids_legend.png
data/results/[WIKIDATA_10][kmeans]clusters[4]centroids_legend.pdf
Current k: 8


HBox(children=(IntProgress(value=0, max=8), HTML(value='')))



data/results/[WIKIDATA_10][kmeans]clusters[8_0][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_0][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_0][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_0][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_0][full].png
data/results/[WIKIDATA_10][kmeans]clusters[8_0][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_0][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_0][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_0][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_0][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_0][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_0][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_0][top].png
data/results/[WIKIDATA_10][kmeans]clusters[8_0][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_0][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_0][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_1][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_1][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_1][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_1][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_1][full].png
data/results/[WIKIDATA_10][kmeans]clusters[8_1][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_1][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_1][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_1][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_1][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_1][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_1][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_1][top].png
data/results/[WIKIDATA_10][kmeans]clusters[8_1][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_1][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_1][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_2][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_2][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_2][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_2][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_2][full].png
data/results/[WIKIDATA_10][kmeans]clusters[8_2][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_2][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_2][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_2][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_2][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_2][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_2][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_2][top].png
data/results/[WIKIDATA_10][kmeans]clusters[8_2][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_2][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_2][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_3][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_3][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_3][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_3][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_3][full].png
data/results/[WIKIDATA_10][kmeans]clusters[8_3][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_3][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_3][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_3][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_3][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_3][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_3][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_3][top].png
data/results/[WIKIDATA_10][kmeans]clusters[8_3][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_3][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_3][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_4][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_4][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_4][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_4][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_4][full].png
data/results/[WIKIDATA_10][kmeans]clusters[8_4][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_4][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_4][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_4][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_4][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_4][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_4][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_4][top].png
data/results/[WIKIDATA_10][kmeans]clusters[8_4][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_4][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_4][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_5][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_5][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_5][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_5][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_5][full].png
data/results/[WIKIDATA_10][kmeans]clusters[8_5][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_5][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_5][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_5][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_5][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_5][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_5][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_5][top].png
data/results/[WIKIDATA_10][kmeans]clusters[8_5][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_5][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_5][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_6][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_6][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_6][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_6][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_6][full].png
data/results/[WIKIDATA_10][kmeans]clusters[8_6][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_6][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_6][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_6][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_6][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_6][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_6][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_6][top].png
data/results/[WIKIDATA_10][kmeans]clusters[8_6][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_6][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_6][top]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_7][full]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_7][full]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_7][full]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_7][full]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_7][full].png
data/results/[WIKIDATA_10][kmeans]clusters[8_7][full].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_7][full]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_7][full]_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_7][top]_log.png
data/results/[WIKIDATA_10][kmeans]clusters[8_7][top]_log.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_7][top]_log_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_7][top]_log_trim.pdf




data/results/[WIKIDATA_10][kmeans]clusters[8_7][top].png
data/results/[WIKIDATA_10][kmeans]clusters[8_7][top].pdf
data/results/[WIKIDATA_10][kmeans]clusters[8_7][top]_trim.png
data/results/[WIKIDATA_10][kmeans]clusters[8_7][top]_trim.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8]centroids.png
data/results/[WIKIDATA_10][kmeans]clusters[8]centroids.pdf
data/results/[WIKIDATA_10][kmeans]clusters[8]centroids_legend.png
data/results/[WIKIDATA_10][kmeans]clusters[8]centroids_legend.pdf
