In [1]:
# Reload imported modules before each cell runs, so edits to the local helper
# modules (lib, plotting) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
import lib
import plotting

In [4]:
from multiprocessing import Pool
import pickle
from collections import defaultdict

In [5]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score

try:
    # Historic (misspelled) name used throughout this notebook; it only exists
    # in older scikit-learn releases.
    from sklearn.metrics import calinski_harabaz_score
except ImportError:
    # Newer releases only ship the correctly spelled function; keep the old
    # name as an alias so the cells below run unchanged.
    from sklearn.metrics import calinski_harabasz_score as calinski_harabaz_score

In [6]:
from configparser import ConfigParser
# Load the project settings (directories, core count, k-means parameters).
cfg = ConfigParser()
# read() returns the list of files it managed to parse; a missing file is
# skipped silently, so an empty list here means config.cfg was not found.
cfg.read("config.cfg")

['config.cfg']

In [7]:
# Use the notebook-specific progress bar; the plain console one struggles with JupyterLab.
try:
    # Public module path in current tqdm releases.
    from tqdm.notebook import tqdm
except ImportError:
    # Older tqdm releases only expose the notebook bar under the private module.
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()  # registers progress_apply on pandas objects as a drop-in for apply

In [8]:
# Prefix put in front of every artefact this notebook writes.
results_prefix = "[WIKIDATA_PAPER]"

# All three input pickles live in the exchange directory named in config.cfg.
exchange_dir = cfg.get("directory", "exchange")
sequence_file, labels_file, users_file = (
    os.path.join(exchange_dir, file_name)
    for file_name in (
        "[wikidata]004_sequences.p",
        "[wikidata]004_labels_readable.p",
        "[wikidata]004_users.p",
    )
)

In [9]:
# Edit sequences, one row per user_id / user_name.
df = pd.read_pickle(sequence_file)

# Readable action names; only the 'label' column is needed, as a numpy array.
labels_frame = pd.read_pickle(labels_file)
labels = labels_frame['label'].values

df.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name
0,False,False,1517,1385,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...","[Revert Item, Break, Create Claim, Break, Crea...",1,Hoo man
1,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_ADD, BREAK, ME...","[Create Item, Break, Add Sitelink, Break, Merg...",1000036,JShenk
2,False,False,7,4,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",1000078,Egor-belikov
3,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_REMOVE, SITELI...","[Create Item, Break, Remove Sitelink, Add Site...",100008,Wars
4,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",100012,Kane 14


In [10]:
# Inspect the readable action labels read from labels_file.
labels

array(['Add Description', 'Add Item Alias', 'Add Label', 'Add Qualifier',
       'Add Reference', 'Add Sitelink', 'Break', 'Create Claim',
       'Create Item', 'Create Property', 'Edit Alias', 'Edit Claim',
       'Edit Claim Value', 'Edit Description', 'Edit Item', 'Edit Label',
       'Edit Qualifier', 'Edit Reference', 'Edit Sitelink', 'Merge Items',
       'Override Item', 'Protect Item', 'Redirect Item', 'Remove Alias',
       'Remove Claim', 'Remove Description', 'Remove Item',
       'Remove Label', 'Remove Qualifier', 'Remove Reference',
       'Remove Sitelink', 'Revert Item'], dtype=object)

In [12]:
def calc_dist(sequence):
    """Return the distribution vector of one user's action sequence.

    Thin wrapper around ``lib.calc_distribution`` so it can be handed to
    ``Pool.map``, which passes a single argument per item.

    Parameters
    ----------
    sequence : sequence of str
        Readable action labels of one user (a ``sequence_readable`` entry).

    Returns
    -------
    numpy.ndarray
        The first element returned by ``lib.calc_distribution``, converted
        to an array. Presumably one value per entry of ``labels``; confirm
        against ``lib``.
    """
    # Use the module-level `labels` array. The previously referenced
    # `labels_temp_backup` is never defined in this notebook and raised a
    # NameError on a fresh kernel.
    # The second return value (a pivot) is not needed here.
    dist, _ = lib.calc_distribution(labels, sequence)
    return np.array(dist)

# Compute one distribution per user in parallel, using the configured core count.
worker_count = cfg.getint("core", "num_cores")
with Pool(worker_count) as processor_pool:
    # tqdm wraps the *input* column, so the bar advances as sequences are
    # handed to the pool.
    sequences = tqdm(df['sequence_readable'])
    df['stat_dist'] = processor_pool.map(calc_dist, sequences)

HBox(children=(IntProgress(value=0, max=88148), HTML(value='')))




In [66]:
# Spot-check the last rows to confirm the 'stat_dist' column is populated.
df.tail()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist
88147,False,False,3,2,"[SITELINK_ADD, BREAK, SITELINK_ADD]","[Add Sitelink, Break, Add Sitelink]",999457,Մելքոնյան Սյուզաննա,"[0.025862068965517224, 0.025862068965517227, 0..."
88148,False,False,7,4,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Edi...",9999,Cjpark94,"[0.026256161551916037, 0.02625616155191603, 0...."
88149,False,False,85,50,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",99995,Gyimhu,"[0.02791461412151068, 0.02586206896551726, 0.0..."
88150,False,False,30,25,"[DESCRIPTION_ADD, DESCRIPTION_ADD, DESCRIPTION...","[Add Description, Add Description, Add Descrip...",999951,Mongrangvebet,"[0.14476813317479198, 0.025862068965517238, 0...."
88151,False,False,24,14,"[SITELINK_UPDATE, BREAK, SITELINK_UPDATE, BREA...","[Edit Sitelink, Break, Edit Sitelink, Break, E...",999994,Alena Pokorná,"[0.025961231375193086, 0.025961231375193093, 0..."


In [67]:
# Sanity check: list the rows without a distribution (expected to be empty).
df[df['stat_dist'].isna()]

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist


In [11]:
# Candidate cluster counts; both configured bounds are inclusive.
min_clusters = cfg.getint("kmeans", "clusters_min")
max_clusters = cfg.getint("kmeans", "clusters_max")
kernel_range = np.arange(min_clusters, max_clusters + 1)
kernel_range

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])

In [14]:
# Collect the per-user distributions into a single numpy array; this is the
# data the clustering runs on.
stat_dists = df['stat_dist'].values
vectors = np.array([np.array(dist) for dist in stat_dists])

# Three-component projection, used only as plot coordinates for the clusters.
pca = PCA(n_components=3)
plot_vectors = pca.fit_transform(vectors)

# Cumulative share of variance retained by the first 1, 2 and 3 components.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
print(cumulative_variance)

[0.28279355 0.4506194  0.57313887]


In [15]:
# Fit k-means once per candidate cluster count in `kernel_range`, report the
# silhouette and Calinski-Harabasz scores, store the assignment as a
# `kmeans_<n>` column on `df` and hand everything to `plotting.k_means`.
centroids_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_centroids.p")
load_centroids = cfg.getboolean("kmeans", "load_centroids") and os.path.isfile(centroids_file)

# Read the cached centroids once instead of re-opening the pickle on every
# iteration. Re-loading inside the loop also replaced `centroids` each time,
# discarding the centers fitted in earlier iterations.
stored_centroids = {}
if load_centroids:
    # NOTE(review): pickle.load can execute arbitrary code; only load centroid
    # files that were written by this notebook.
    with open(centroids_file, "rb") as dump_file:
        stored_centroids = pickle.load(dump_file)

# Start from the cached entries so cluster counts outside `kernel_range`
# survive a later store; every count fitted below overwrites its entry.
centroids = dict(stored_centroids)
results_dir = cfg.get("directory", "results")

for num_centroids in tqdm(kernel_range):
    print("Num Centroids: {n}".format(n=num_centroids))
    if num_centroids in stored_centroids:
        # Explicit start centers make repeated initialisations pointless, so
        # request exactly one run.
        kmeans = KMeans(n_clusters=num_centroids, init=stored_centroids[num_centroids], n_init=1)
    else:
        # No cached centers for this cluster count: fit from scratch with the
        # configured seed.
        kmeans = KMeans(n_clusters=num_centroids, random_state=cfg.getint("kmeans", "random_state"))

    lbl = kmeans.fit_predict(vectors)
    centroids[num_centroids] = kmeans.cluster_centers_

    # The silhouette score is the mean of the per-sample values, so the
    # expensive pairwise computation is done once and reused for both.
    sample_silhouette_values = silhouette_samples(vectors, lbl)
    silhouette_avg = sample_silhouette_values.mean()
    print("SILHOUETTE", silhouette_avg)
    calinski_score = calinski_harabaz_score(vectors, lbl)
    print("CALINSKI", calinski_score)

    store_path = os.path.join(results_dir, results_prefix + "kmeans[{n}]".format(n=num_centroids))
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)
    df[cluster_lbl] = lbl
    plotting.k_means(plot_vectors, num_centroids, lbl, sample_silhouette_values, silhouette_avg, store_path)

if cfg.getboolean("kmeans", "store_centroids"):
    with open(centroids_file, "wb") as dump_file:
        pickle.dump(centroids, dump_file)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

Num Centroids: 2
SILHOUETTE 0.24431148554008514
CALINSKI 23646.722495738464
data/results/[WIKIDATA_PAPER]kmeans[2]_silhouette.png
data/results/[WIKIDATA_PAPER]kmeans[2]_silhouette.pdf
data/results/[WIKIDATA_PAPER]kmeans[2]_clusters.png
data/results/[WIKIDATA_PAPER]kmeans[2]_clusters.pdf
data/results/[WIKIDATA_PAPER]kmeans[2]_clusters_planar.png
data/results/[WIKIDATA_PAPER]kmeans[2]_clusters_planar.pdf
data/results/[WIKIDATA_PAPER]kmeans[2]_population.png
data/results/[WIKIDATA_PAPER]kmeans[2]_population.pdf
Num Centroids: 3
SILHOUETTE 0.2627677983412566
CALINSKI 22754.753688287306
data/results/[WIKIDATA_PAPER]kmeans[3]_silhouette.png
data/results/[WIKIDATA_PAPER]kmeans[3]_silhouette.pdf
data/results/[WIKIDATA_PAPER]kmeans[3]_clusters.png
data/results/[WIKIDATA_PAPER]kmeans[3]_clusters.pdf
data/results/[WIKIDATA_PAPER]kmeans[3]_clusters_planar.png
data/results/[WIKIDATA_PAPER]kmeans[3]_clusters_planar.pdf
data/results/[WIKIDATA_PAPER]kmeans[3]_population.png
data/results/[WIKIDATA_PAPE

In [16]:
# Persist the frame, including the kmeans_<n> assignment columns, for later sessions.
clustering_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_clustering.p")
df.to_pickle(clustering_file)

In [12]:
# Load the stored clustering here to skip the previous calculations if the
# pickle already exists. The cells below still need `labels` and
# `kernel_range` from the earlier cells.
clustering_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_clustering.p")
df = pd.read_pickle(clustering_file)

In [13]:
# Label subset used for the "reduced" transition plots further down.
# Filtering is currently disabled: the full `labels` array is used, so the
# reduced plots cover the same labels as the full ones.
# NOTE(review): the commented-out list holds raw action identifiers (e.g.
# "BREAK"), whereas `labels` holds readable names (e.g. 'Break'); it would
# need translating before it could be re-enabled.
labels_filtered = labels#sorted(["BREAK", "DESCRIPTION_UPDATE", "CLAIM_CREATE", "REFERENCE_ADD", "ENTITY_OVERRIDE", "ENTITY_REDIRECT", "MERGE", "DESCRIPTION_ADD", "LABEL_ADD", "QUALIFIER_ADD", "CLAIM_UPDATE", "SITELINK_ADD", "SITELINK_UPDATE", ])
labels_filtered

array(['Add Description', 'Add Item Alias', 'Add Label', 'Add Qualifier',
       'Add Reference', 'Add Sitelink', 'Break', 'Create Claim',
       'Create Item', 'Create Property', 'Edit Alias', 'Edit Claim',
       'Edit Claim Value', 'Edit Description', 'Edit Item', 'Edit Label',
       'Edit Qualifier', 'Edit Reference', 'Edit Sitelink', 'Merge Items',
       'Override Item', 'Protect Item', 'Redirect Item', 'Remove Alias',
       'Remove Claim', 'Remove Description', 'Remove Item',
       'Remove Label', 'Remove Qualifier', 'Remove Reference',
       'Remove Sitelink', 'Revert Item'], dtype=object)

In [14]:
# Separator label from the [preprocessing] section of config.cfg.
# NOTE(review): not referenced by any other cell visible in this notebook.
sep_label = cfg.get("preprocessing", "separator_label")
sep_label

'SEP'

In [15]:
# Confirm the frame carries the kmeans_<n> cluster assignment columns.
df.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist,kmeans_2,kmeans_3,kmeans_4,kmeans_5,kmeans_6,kmeans_7,kmeans_8,kmeans_9,kmeans_10
0,False,False,1517,1385,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...","[Revert Item, Break, Create Claim, Break, Crea...",1,Hoo man,"[0.033592832488441825, 0.029154644609674315, 0...",0,2,1,2,0,4,6,2,2
1,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_ADD, BREAK, ME...","[Create Item, Break, Add Sitelink, Break, Merg...",1000036,JShenk,"[0.02643738030717696, 0.02643738030717696, 0.0...",1,0,2,4,3,3,1,1,1
2,False,False,7,4,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",1000078,Egor-belikov,"[0.025412586144359806, 0.025412586144359806, 0...",1,1,0,3,1,2,4,4,4
3,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_REMOVE, SITELI...","[Create Item, Break, Remove Sitelink, Add Site...",100008,Wars,"[0.027056268692473535, 0.02705626869247354, 0....",1,1,0,3,1,2,4,1,1
4,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",100012,Kane 14,"[0.025862068965517248, 0.025862068965517244, 0...",1,1,0,3,1,2,4,4,4


In [21]:
# Build and plot one action-transition matrix per cluster, for every
# clustering listed in `kernel_range`.

# A user's (from, to) action pairs do not depend on the clustering, so they
# are built once up front instead of once per cluster count.
transition_frames = []
for sequence in df['sequence_readable']:
    action_df = pd.DataFrame({"from": sequence})
    # Pair every action with its successor; the last action of a sequence has
    # no successor, so its "to" value is NaN.
    action_df["to"] = action_df["from"].shift(periods=-1)
    transition_frames.append(action_df)

results_dir = cfg.get("directory", "results")

for num_centroids in kernel_range:
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)

    # Group the prepared per-user frames by the cluster each user was assigned to.
    cluster = defaultdict(list)
    for cluster_id, action_df in zip(df[cluster_lbl], transition_frames):
        cluster[cluster_id].append(action_df)

    for cluster_index in sorted(cluster):
        cluster_df = pd.concat(cluster[cluster_index])

        # Absolute and row-normalised transition tables, each computed once.
        count_crosstab = pd.crosstab(cluster_df['from'], cluster_df['to'])
        normalized_crosstab = pd.crosstab(cluster_df['from'], cluster_df['to'], normalize="index")

        # lib.stretch_pivot presumably expands a table to the given label set
        # (confirm in lib). Whether it modifies its argument is not visible
        # here, so each call on the shared normalised table gets its own copy.
        transition_count_pivot = lib.stretch_pivot(count_crosstab, labels)
        unnormalized = transition_count_pivot.sum(axis=1)  # outgoing transitions per "from" label
        pivot = lib.stretch_pivot(normalized_crosstab.copy(), labels)

        store_path = os.path.join(results_dir, results_prefix + "clusters[{n}_{i}]".format(n=num_centroids, i=cluster_index))
        plotting.transition_matrix((50, 60), unnormalized, pivot, transition_count_pivot, cluster_index, store_path, upper_limit=1e7, logscale=True)

        # Same plot restricted to `labels_filtered`, drawn with a smaller first argument.
        transition_count_pivot_reduced = lib.stretch_pivot(transition_count_pivot, labels_filtered)
        unnormalized_reduced = transition_count_pivot_reduced.sum(axis=1)
        pivot_reduced = lib.stretch_pivot(normalized_crosstab.copy(), labels_filtered)

        store_path = os.path.join(results_dir, results_prefix + "clusters[{n}_{i}]reduced".format(n=num_centroids, i=cluster_index))
        plotting.transition_matrix((20, 25), unnormalized_reduced, pivot_reduced, transition_count_pivot_reduced, cluster_index, store_path, upper_limit=1e7, logscale=True)
print("Done...")

data/results/[WIKIDATA_PAPER]clusters[2_0].png
data/results/[WIKIDATA_PAPER]clusters[2_0].pdf
data/results/[WIKIDATA_PAPER]clusters[2_0]reduced.png
data/results/[WIKIDATA_PAPER]clusters[2_0]reduced.pdf
data/results/[WIKIDATA_PAPER]clusters[2_1].png
data/results/[WIKIDATA_PAPER]clusters[2_1].pdf
data/results/[WIKIDATA_PAPER]clusters[2_1]reduced.png
data/results/[WIKIDATA_PAPER]clusters[2_1]reduced.pdf
data/results/[WIKIDATA_PAPER]clusters[3_0].png
data/results/[WIKIDATA_PAPER]clusters[3_0].pdf
data/results/[WIKIDATA_PAPER]clusters[3_0]reduced.png
data/results/[WIKIDATA_PAPER]clusters[3_0]reduced.pdf
data/results/[WIKIDATA_PAPER]clusters[3_1].png
data/results/[WIKIDATA_PAPER]clusters[3_1].pdf
data/results/[WIKIDATA_PAPER]clusters[3_1]reduced.png
data/results/[WIKIDATA_PAPER]clusters[3_1]reduced.pdf
data/results/[WIKIDATA_PAPER]clusters[3_2].png
data/results/[WIKIDATA_PAPER]clusters[3_2].pdf
data/results/[WIKIDATA_PAPER]clusters[3_2]reduced.png
data/results/[WIKIDATA_PAPER]clusters[3_2]re

In [None]:
# Bare string expression: when run, the notebook shows it as the cell's output.
"done..."