In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all
# modules before each cell runs so that edits to imported modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
import lib
import plotting

In [4]:
from multiprocessing import Pool
import pickle
from collections import defaultdict

In [5]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabaz_score
from sklearn.decomposition import PCA

In [6]:
from configparser import ConfigParser
# Parse the notebook settings. ConfigParser.read() silently skips files it
# cannot open, so fail fast here instead of surfacing a confusing
# NoSectionError in a later cell.
cfg = ConfigParser()
loaded_config_files = cfg.read("config.cfg")
if not loaded_config_files:
    raise FileNotFoundError("Could not read configuration file 'config.cfg'")
# Keep the list of parsed files as the cell output, as before.
loaded_config_files

['config.cfg']

In [7]:
# Use the notebook flavour of tqdm explicitly, as the normal one struggles
# with jupyterlab. Prefer the public module path; the private
# `tqdm._tqdm_notebook` path is only kept as a fallback for older releases.
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas() # this enables us to use progress_apply instead of apply

In [8]:
# Prefix that is prepended to the names of the result files written below.
results_prefix = cfg.get("results", "prefix")

# Input pickles, all located in the configured exchange directory.
sequence_file, labels_file, users_file = (
    os.path.join(cfg.get("directory", "exchange"), file_name)
    for file_name in (
        "[wikidata]004_sequences.p",
        "[wikidata]004_labels_readable.p",
        "[wikidata]004_users.p",
    )
)

In [9]:
# Load the sequence table and the array of readable action labels.
# NOTE: read_pickle unpickles arbitrary objects - only use it on trusted files.
df = pd.read_pickle(sequence_file)
labels = pd.read_pickle(labels_file)['label'].values
# Preview the first rows of the sequence table.
df.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name
0,False,False,1517,1385,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...","[Revert Item, Break, Create Claim, Break, Crea...",1,Hoo man
1,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_ADD, BREAK, ME...","[Create Item, Break, Add Sitelink, Break, Merg...",1000036,JShenk
2,False,False,7,4,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",1000078,Egor-belikov
3,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_REMOVE, SITELI...","[Create Item, Break, Remove Sitelink, Add Site...",100008,Wars
4,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",100012,Kane 14


In [10]:
# Display the readable action labels loaded from the labels file.
labels

array(['Add Description', 'Add Item Alias', 'Add Label', 'Add Qualifier',
       'Add Reference', 'Add Sitelink', 'Break', 'Create Claim',
       'Create Item', 'Create Property', 'Edit Alias', 'Edit Claim',
       'Edit Claim Value', 'Edit Description', 'Edit Item', 'Edit Label',
       'Edit Qualifier', 'Edit Reference', 'Edit Sitelink', 'Merge Items',
       'Override Item', 'Protect Item', 'Redirect Item', 'Remove Alias',
       'Remove Claim', 'Remove Description', 'Remove Item',
       'Remove Label', 'Remove Qualifier', 'Remove Reference',
       'Remove Sitelink', 'Revert Item'], dtype=object)

In [11]:
# Candidate cluster counts: every integer from the configured minimum up to
# and including the configured maximum.
clusters_min = cfg.getint("kmeans", "clusters_min")
clusters_max = cfg.getint("kmeans", "clusters_max")
kernel_range = np.arange(clusters_min, clusters_max + 1)
kernel_range

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])

In [11]:
# Drop users whose sequence has fewer non-break entries than the configured
# minimum. This is a different filter than the preprocessing one!
print("Len unfiltered: {n}".format(n=len(df)))
min_num_changes = cfg.getint("kmeans", "min_num_changes")
has_enough_changes = df["length_nobreak"] >= min_num_changes
df = df.loc[has_enough_changes].reset_index(drop=True)
print("Len filtered: {n}".format(n=len(df)))

Len unfiltered: 88148
Len filtered: 38429


In [13]:
def calc_dist(sequence):
    """Return the distribution vector of one action sequence as a numpy array.

    Delegates to ``lib.calc_distribution`` with a private copy of the global
    ``labels`` and keeps only the first element of the returned pair; the
    second element is not needed here.
    """
    distribution, _ = lib.calc_distribution(labels.copy(), sequence)
    return np.array(distribution)

# Compute one distribution vector per user in parallel and store the results
# in the new 'stat_dist' column (Pool.map preserves the input order).
# NOTE(review): tqdm wraps the *input* iterable, so the bar presumably tracks
# how fast tasks are handed to the pool rather than how many have finished -
# confirm if exact progress matters.
with Pool(cfg.getint("core", "num_cores")) as processor_pool:
    df['stat_dist'] = processor_pool.map(calc_dist, tqdm(df['sequence_readable']))

HBox(children=(IntProgress(value=0, max=38429), HTML(value='')))




In [14]:
# Inspect the last rows, including the freshly added 'stat_dist' column.
df.tail()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist
38424,False,False,555,493,"[CLAIM_CREATE, CLAIM_UPDATE, CLAIM_CREATE, CLA...","[Create Claim, Edit Claim, Create Claim, Creat...",999054,Acebarry,"[0.02617328519855593, 0.025862068965517272, 0...."
38425,False,False,83,44,"[ENTITY_CREATE, BREAK, ENTITY_CREATE, SITELINK...","[Create Item, Break, Create Item, Edit Sitelin...",99933,Biagio2103,"[0.025903359039922972, 0.02590335903992298, 0...."
38426,False,False,85,50,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",99995,Gyimhu,"[0.027914614121510663, 0.025862068965517248, 0..."
38427,False,False,30,25,"[DESCRIPTION_ADD, DESCRIPTION_ADD, DESCRIPTION...","[Add Description, Add Description, Add Descrip...",999951,Mongrangvebet,"[0.14476813317479206, 0.02586206896551725, 0.0..."
38428,False,False,24,14,"[SITELINK_UPDATE, BREAK, SITELINK_UPDATE, BREA...","[Edit Sitelink, Break, Edit Sitelink, Break, E...",999994,Alena Pokorná,"[0.025961231375193076, 0.025961231375193076, 0..."


In [15]:
# Sanity check: list rows whose 'stat_dist' is missing (expected to be empty).
df.loc[pd.isna(df['stat_dist'])]

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist


In [16]:
# Stack the per-user distribution vectors into one 2-D array for clustering.
vectors = np.array([np.array(x) for x in df['stat_dist'].values]) # ensure we have the data in numpy format
# Project onto three principal components; the projection is only used for
# the cluster plots. For inputs of this shape scikit-learn may pick the
# randomized SVD solver, so seed it to keep the plots reproducible.
pca = PCA(n_components=3, random_state=cfg.getint("kmeans", "random_state"))
plot_vectors = pca.fit_transform(vectors)
# Cumulative share of variance captured by the first 1, 2 and 3 components.
print(pca.explained_variance_ratio_.cumsum())

[0.41699852 0.59538204 0.72375929]


In [17]:
# Fit one k-means model per cluster count in `kernel_range`, report quality
# scores, store the labels as a `kmeans_<k>` column in `df` and plot the result.
centroids = {}
centroids_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_centroids.p")
load_centroids = cfg.getboolean("kmeans", "load_centroids") and os.path.isfile(centroids_file)

# Read previously stored centroids once, up front. Re-reading the file inside
# the loop would overwrite `centroids` on every iteration and thereby discard
# the centres fitted in earlier iterations.
initial_centroids = {}
if load_centroids:
    # NOTE: pickle.load can execute arbitrary code - only load trusted files.
    with open(centroids_file, "rb") as dump_file:
        initial_centroids = pickle.load(dump_file)
    # Keep stored entries for cluster counts that are not refitted below.
    centroids = dict(initial_centroids)

for num_centroids in tqdm(kernel_range):
    print("Num Centroids: {n}".format(n=num_centroids))
    if load_centroids:
        # With explicit initial centres only a single initialisation is
        # meaningful; say so explicitly to avoid the scikit-learn warning.
        kmeans = KMeans(n_clusters=num_centroids, init=initial_centroids[num_centroids], n_init=1)
    else:
        kmeans = KMeans(n_clusters=num_centroids, random_state=cfg.getint("kmeans", "random_state"))

    lbl = kmeans.fit_predict(vectors)
    centroids[num_centroids] = kmeans.cluster_centers_

    silhouette_avg = silhouette_score(vectors, lbl)
    print("SILHOUETTE", silhouette_avg)
    # NOTE(review): newer scikit-learn releases spell this function
    # `calinski_harabasz_score`; the import cell needs updating on upgrade.
    calinski_score = calinski_harabaz_score(vectors, lbl)
    print("CALINSKI", calinski_score)
    sample_silhouette_values = silhouette_samples(vectors, lbl)

    store_path = os.path.join(cfg.get("directory", "results"), results_prefix + "kmeans[{n}]".format(n=num_centroids))
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)
    df[cluster_lbl] = lbl
    plotting.k_means(plot_vectors, num_centroids, lbl, sample_silhouette_values, silhouette_avg, store_path)

# Optionally persist the fitted centres so a later run can start from them.
if cfg.getboolean("kmeans", "store_centroids"):
    with open(centroids_file, "wb") as dump_file:
        pickle.dump(centroids, dump_file)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

Num Centroids: 2
SILHOUETTE 0.33872871693269835
CALINSKI 18349.844050893935
data/results/WIKIDATA_10kmeans[2]_silhouette.png
data/results/WIKIDATA_10kmeans[2]_silhouette.pdf
data/results/WIKIDATA_10kmeans[2]_clusters.png
data/results/WIKIDATA_10kmeans[2]_clusters.pdf
data/results/WIKIDATA_10kmeans[2]_clusters_planar.png
data/results/WIKIDATA_10kmeans[2]_clusters_planar.pdf
data/results/WIKIDATA_10kmeans[2]_population.png
data/results/WIKIDATA_10kmeans[2]_population.pdf
Num Centroids: 3
SILHOUETTE 0.3213767432181129
CALINSKI 17168.74183263665
data/results/WIKIDATA_10kmeans[3]_silhouette.png
data/results/WIKIDATA_10kmeans[3]_silhouette.pdf
data/results/WIKIDATA_10kmeans[3]_clusters.png
data/results/WIKIDATA_10kmeans[3]_clusters.pdf
data/results/WIKIDATA_10kmeans[3]_clusters_planar.png
data/results/WIKIDATA_10kmeans[3]_clusters_planar.pdf
data/results/WIKIDATA_10kmeans[3]_population.png
data/results/WIKIDATA_10kmeans[3]_population.pdf
Num Centroids: 4
SILHOUETTE 0.3630589338466019
CALINSK

In [18]:
# Persist the table, including the 'stat_dist' and 'kmeans_<k>' columns, to the exchange directory.
df.to_pickle(os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_clustering.p"))

In [12]:
# Entry point for re-runs: load the stored clustering result here to skip the
# expensive calculations above if they have already been done.
# NOTE: read_pickle unpickles arbitrary objects - only use it on trusted files.
df = pd.read_pickle(os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_clustering.p"))

In [13]:
# Reduced label set used for the condensed ("top") transition plots,
# kept in alphabetical order.
labels_filtered = [
    'Add Description',
    'Add Label',
    'Add Qualifier',
    'Add Reference',
    'Add Sitelink',
    'Break',
    'Create Claim',
    'Create Item',
    'Edit Claim',
    'Remove Claim',
]
labels_filtered.sort()
labels_filtered

['Add Description',
 'Add Label',
 'Add Qualifier',
 'Add Reference',
 'Add Sitelink',
 'Break',
 'Create Claim',
 'Create Item',
 'Edit Claim',
 'Remove Claim']

In [14]:
# Preview the loaded clustering table with its 'kmeans_<k>' label columns.
df.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist,kmeans_2,kmeans_3,kmeans_4,kmeans_5,kmeans_6,kmeans_7,kmeans_8,kmeans_9,kmeans_10
0,False,False,1517,1385,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...","[Revert Item, Break, Create Claim, Break, Crea...",1,Hoo man,"[0.033592832488441825, 0.029154644609674315, 0...",0,0,0,3,3,1,3,0,0
1,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",100012,Kane 14,"[0.02586206896551723, 0.025862068965517227, 0....",1,1,2,1,0,5,1,1,1
2,False,False,17,12,"[LABEL_UPDATE, ALIAS_ADD, BREAK, CLAIM_CREATE,...","[Edit Label, Add Item Alias, Break, Create Cla...",1000197,Zaizone,"[0.026205469193709592, 0.07779210211701826, 0....",1,2,1,4,1,0,5,6,2
3,False,False,91,82,"[SITELINK_UPDATE, BREAK, LABEL_ADD, CLAIM_CREA...","[Edit Sitelink, Break, Add Label, Create Claim...",1001159,Gogo hr,"[0.04118566424353361, 0.03162035177772598, 0.0...",0,0,0,3,4,6,0,7,4
4,False,False,21,12,"[DESCRIPTION_ADD, BREAK, DESCRIPTION_UPDATE, D...","[Add Description, Break, Edit Description, Add...",1001194,Phylosofo,"[0.08620689655172406, 0.025862068965517255, 0....",1,2,1,0,5,4,7,8,5


In [15]:
# Separator label from the preprocessing config; it is appended after each
# user's sequence when sequences are concatenated per cluster below.
sep_label = cfg.get("preprocessing", "separator_label")
sep_label

'SEP'

In [None]:
# For every cluster count and every cluster: build the action transition
# matrix of all cluster members and plot it in log and linear scale, once for
# the full label set and once for the filtered one.
results_dir = cfg.get("directory", "results")
label_sets = [("full", labels), ("top", labels_filtered)]
plot_sizes = {"full": (50, 60), "top": (20, 22)}
scale_variants = [("_log", True), ("", False)]  # (file name suffix, logscale)

for num_centroids in kernel_range:
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)

    for cluster_id, cluster_members in tqdm(df.groupby(cluster_lbl)):
        stat_dists = list(cluster_members["stat_dist"])
        # Close every member's sequence with the separator label before
        # joining, so consecutive users are always separated by that label.
        actions = np.concatenate([
            np.concatenate((seq, [sep_label]))
            for seq in cluster_members["sequence_readable"]
        ])

        # Mean and standard deviation of the members' distribution vectors.
        stat_dists_df = pd.DataFrame(stat_dists, columns=labels)
        stat_dist_mean = stat_dists_df.mean()
        stat_dist_std = stat_dists_df.std()

        # Pair each action with its successor to count transitions.
        action_df = pd.DataFrame({'from': actions})
        action_df['to'] = action_df['from'].shift(periods=-1)

        pivot = pd.crosstab(action_df['from'], action_df['to'])
        pivot_norm = pd.crosstab(action_df['from'], action_df['to'], normalize="index")

        for lbl_set_name, lbl_set in label_sets:
            pivot_full = lib.stretch_pivot(pivot.copy(), lbl_set)
            pivot_full_norm = lib.stretch_pivot(pivot_norm.copy(), lbl_set)
            # presumably the total number of transitions per 'from' action - confirm against lib.stretch_pivot
            counts = pivot_full.transpose().sum().transpose()

            plot_size = plot_sizes.get(lbl_set_name, (1, 1))
            stat_dist = (stat_dist_mean[lbl_set].values, stat_dist_std[lbl_set].values)
            base_name = "[kmeans]clusters[{n}_{i}][{s}]".format(n=num_centroids, i=cluster_id, s=lbl_set_name)

            for suffix, logscale in scale_variants:
                store_path = os.path.join(results_dir, results_prefix + base_name + suffix)
                plotting.transition_matrix(plot_size, counts, pivot_full_norm, None, cluster_id, store_path, upper_limit=1e7, logscale=logscale, stat_dist=stat_dist)



HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

data/results/WIKIDATA_10[kmeans]clusters[2_0][full]_log.png
data/results/WIKIDATA_10[kmeans]clusters[2_0][full]_log.pdf
data/results/WIKIDATA_10[kmeans]clusters[2_0][full].png
data/results/WIKIDATA_10[kmeans]clusters[2_0][full].pdf
data/results/WIKIDATA_10[kmeans]clusters[2_0][top]_log.png
data/results/WIKIDATA_10[kmeans]clusters[2_0][top]_log.pdf
data/results/WIKIDATA_10[kmeans]clusters[2_0][top].png
data/results/WIKIDATA_10[kmeans]clusters[2_0][top].pdf
data/results/WIKIDATA_10[kmeans]clusters[2_1][full]_log.png
data/results/WIKIDATA_10[kmeans]clusters[2_1][full]_log.pdf
