In [1]:
# Reload imported modules automatically before each cell is executed, so that
# edits to the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
import lib
import plotting

In [4]:
from multiprocessing import Pool
import pickle
from collections import defaultdict

In [5]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score, calinski_harabaz_score
from sklearn.decomposition import PCA

In [22]:
from configparser import ConfigParser
# Settings used throughout this notebook (directories, core count, k-means
# parameters) are read from config.cfg in the current working directory.
cfg = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns only the
# names it actually parsed. Fail here with a clear message instead of with a
# NoSectionError in a later cell.
loaded_configs = cfg.read("config.cfg")
if not loaded_configs:
    raise FileNotFoundError("config.cfg not found in {d}".format(d=os.getcwd()))
loaded_configs

['config.cfg']

In [7]:
# Progress bars: use the notebook front end of tqdm.
# NOTE(review): `tqdm._tqdm_notebook` is a private module path; newer tqdm
# releases expose the same class as `tqdm.notebook.tqdm` - confirm against the
# installed version before upgrading.
from tqdm._tqdm_notebook import tqdm_notebook as tqdm # we manually import the notebook submodule as the normal one struggles with jupyterlab
tqdm.pandas() # this enables us to use progress_apply instead of apply

In [8]:
# Prefix put in front of the file names of everything this notebook writes.
results_prefix = "[WIKIDATA_NOBOT]"

# The three input pickles all live in the configured exchange directory.
exchange_dir = cfg.get("directory", "exchange")
sequence_file, labels_file, users_file = (
    os.path.join(exchange_dir, file_name)
    for file_name in (
        "[wikidata]004_sequences.p",
        "[wikidata]004_labels.p",
        "[wikidata]004_users.p",
    )
)

In [9]:
# Load the per-user sequence frame and the array of action labels
# (the 'label' column of the labels pickle).
df = pd.read_pickle(sequence_file)
labels_frame = pd.read_pickle(labels_file)
labels = labels_frame['label'].values
df.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,user_id,user_name
0,False,False,1529,1392,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...",1,Hoo man
1,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_ADD, BREAK, ME...",1000036,JShenk
2,False,False,7,4,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...",1000078,Egor-belikov
3,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_REMOVE, SITELI...",100008,Wars
4,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...",100012,Kane 14


In [13]:
# For each bot flag column, report how many users carry the flag and the
# summed `length` of their sequences.
for flag_column, count_label, length_label in (
    ("bot_name", "num_bots name", "len_bots name"),
    ("bot_sequence", "num_bots seq", "len_bots seq"),
):
    flagged_users = df.loc[df[flag_column]]
    print(count_label, len(flagged_users))
    print(length_label, flagged_users["length"].sum())

num_bots name 139
len_bots name 590735
num_bots seq 35
len_bots seq 60304109


In [16]:
# A user is treated as a bot when either flag column (`bot_name`, `bot_sequence`) is set.
df['is_bot'] = df['bot_name'] | df['bot_sequence']

In [17]:
# Keep the bot rows separately (for the statistics further down) and continue
# with the remaining users only; both frames get a fresh 0..n-1 index.
bot_mask = df['is_bot']
df_dropped = df[bot_mask].reset_index(drop=True)
df = df[~bot_mask].reset_index(drop=True)

In [21]:
# Users whose `length_nobreak` is exactly 1.
has_single_action = df['length_nobreak'] == 1
single_action = df[has_single_action]
print(single_action.shape[0])

42437


In [23]:
# Keep only users with more than the configured minimum of non-break actions;
# the user count is printed before and after the filter.
print(len(df))
min_changes = cfg.getint("kmeans", "min_changes")
has_enough_changes = df['length_nobreak'] > min_changes
df = df[has_enough_changes].reset_index(drop=True)
print(len(df))

130742
71607


In [24]:
# Inspect the users at or above the 98th percentile of sequence length,
# longest first. Nothing is removed from `df` here.
upper_bound = df['length'].quantile(0.98)
longest_users = df[df['length'] >= upper_bound]
longest_users.sort_values(by="length", ascending=False)

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,user_id,user_name,is_bot
43047,False,False,5172283,5172205,"[ENTITY_CREATE, ENTITY_CREATE, ENTITY_CREATE, ...",2912832,Artix Kreiger 2,False
6813,False,False,3025164,3024842,"[SITELINK_ADD, SITELINK_UPDATE, BREAK, SITELIN...",1518055,Sky xe,False
17302,False,False,1574249,1571616,"[SITELINK_ADD, SITELINK_ADD, SITELINK_UPDATE, ...",23475,Sporti,False
1484,False,False,1094842,1093281,"[ENTITY_CREATE, LABEL_ADD, LABEL_ADD, LABEL_AD...",1134,Romaine,False
22498,False,False,1079786,1077989,"[DESCRIPTION_UPDATE, CLAIM_CREATE, CLAIM_CREAT...",2727990,Anvilaquarius,False
63010,False,False,1053125,1052613,"[CLAIM_CREATE, SITELINK_ADD, CLAIM_CREATE, SIT...",38324,Olaf Kosinsky,False
64931,False,False,1005912,1003696,"[LABEL_ADD, LABEL_ADD, LABEL_ADD, LABEL_UPDATE...",4943,VIGNERON,False
15703,False,False,948091,948067,"[ALIAS_ADD, LABEL_ADD, ENTITY_OVERRIDE, CLAIM_...",220959,Fantasticfears,False
19634,False,False,863507,862762,"[SITELINK_ADD, SITELINK_ADD, SITELINK_ADD, SIT...",2580335,SR5,False
71319,False,False,861355,859060,"[ENTITY_CREATE, LABEL_ADD, LABEL_ADD, LABEL_AD...",9712,Benoît Prieur,False


In [25]:
# Manually identified bots are removed here in case the parser did not already
# exclude them; re-running the parser for every newly found bot would be wasteful.
manual_bots = os.path.join(cfg.get("directory", "dataset"), cfg.get("dataset", "wikidata_aux"), "[BOTS]manual.txt")
# Decode explicitly: user names in this dataset contain non-ASCII characters
# and the platform default encoding is not guaranteed to be UTF-8.
# NOTE(review): assumes the list is stored as UTF-8, one user name per line - confirm.
with open(manual_bots, "r", encoding="utf-8") as bot_file:
    stripped_names = (line.strip() for line in bot_file)
    # Skip blank lines so an empty string never ends up in the bot set.
    bots = {name for name in stripped_names if name}
bots

{'Artix Kreiger 2', 'Sky xe'}

In [26]:
# Remove the manually listed bot accounts; the user count is printed before
# and after so the number of dropped rows is visible.
print(len(df))
is_manual_bot = df["user_name"].isin(bots)
df = df[~is_manual_bot].reset_index(drop=True)
print(len(df))

71607
71605


In [27]:
# TODO: filter top and bottom 5 percentiles (placeholder cell - no filtering is applied here)

In [29]:
# Total sequence length kept for clustering vs. removed together with the bots.
num_actions, num_actions_dropped = (
    frame["length"].sum() for frame in (df, df_dropped)
)
for template, count in (
    ("Num actions: {n}", num_actions),
    ("Num actions dropped: {n}", num_actions_dropped),
):
    print(template.format(n=count))

Num actions: 78333501
Num actions dropped: 60894844


In [30]:
def calc_dist(sequence):
    """Return the distribution that lib.calc_distribution computes for one sequence.

    The helper is called with the module-level `labels` and returns a pair;
    only the first element is kept and converted to a numpy array.
    """
    distribution, _unused_pivot = lib.calc_distribution(labels, sequence)
    return np.array(distribution)

# Compute one distribution per user in a worker pool. imap yields results in
# input order, so they line up with the rows of `df`.
num_cores = cfg.getint("core", "num_cores")
with Pool(num_cores) as processor_pool:
    distributions = list(processor_pool.imap(calc_dist, tqdm(df['sequence'])))
df['stat_dist'] = pd.Series(distributions)

HBox(children=(IntProgress(value=0, max=71605), HTML(value='')))




In [31]:
# Preview the frame, which now includes the `stat_dist` column.
df.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,user_id,user_name,is_bot,stat_dist
0,False,False,1529,1392,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...",1,Hoo man,False,"[0.02231964475186247, 0.01999468254492219, 0.0..."
1,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_ADD, BREAK, ME...",1000036,JShenk,False,"[0.020070153072094386, 0.02007015307209438, 0...."
2,False,False,7,4,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...",1000078,Egor-belikov,False,"[0.01947397679104996, 0.019473976791049966, 0...."
3,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_REMOVE, SITELI...",100008,Wars,False,"[0.02042483183496783, 0.020424831834967825, 0...."
4,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...",100012,Kane 14,False,"[0.01973684210526316, 0.019736842105263157, 0...."


In [32]:
# Cluster counts to evaluate, taken from the config.
clusters_min = cfg.getint("kmeans", "clusters_min")
clusters_max = cfg.getint("kmeans", "clusters_max")
# np.arange excludes its stop value, hence the + 1 to include clusters_max.
kernel_range = np.arange(clusters_min, clusters_max + 1)
kernel_range

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])

In [33]:
# Stack the per-user distributions into one 2-D numpy array for clustering.
vectors = np.array(list(map(np.array, df['stat_dist'].values)))
# Three principal components are used for the cluster plots only; the
# clustering itself runs on the full `vectors`.
# NOTE(review): no random_state is passed to PCA - if scikit-learn selects its
# randomized solver the projection may vary slightly between runs.
pca = PCA(n_components=3)
plot_vectors = pca.fit_transform(vectors)
print(np.cumsum(pca.explained_variance_ratio_))

[0.29689487 0.46099277 0.58766319]


In [34]:
# Run k-means for every cluster count in `kernel_range`, print the silhouette
# and Calinski-Harabasz scores, store the labels as a `kmeans_<k>` column on
# `df`, and hand the results to plotting.k_means.
centroids_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_centroids.p")
load_centroids = cfg.getboolean("kmeans", "load_centroids") and os.path.isfile(centroids_file)
# Optional seed for the unseeded runs; when the option is missing the fallback
# of None keeps KMeans' default (non-deterministic) initialisation.
kmeans_seed = cfg.getint("kmeans", "random_state", fallback=None)
results_dir = cfg.get("directory", "results")

# Read previously stored centroids ONCE. Reloading inside the loop replaced
# `centroids` on every iteration and thereby threw away the centers fitted in
# earlier iterations before they could be stored.
centroids = {}
if load_centroids:
    # NOTE: unpickling can execute arbitrary code - only load centroid files
    # that were written by this notebook.
    with open(centroids_file, "rb") as dump_file:
        centroids = pickle.load(dump_file)

for num_centroids in tqdm(kernel_range):
    print("Num Centroids: {n}".format(n=num_centroids))
    if load_centroids and num_centroids in centroids:
        # Explicit start centers make repeated initialisations pointless;
        # n_init=1 states that and avoids the scikit-learn warning about it.
        kmeans = KMeans(n_clusters=num_centroids, init=centroids[num_centroids], n_init=1)
    else:
        # No stored centers for this k (or loading disabled): default init.
        kmeans = KMeans(n_clusters=num_centroids, random_state=kmeans_seed)

    lbl = kmeans.fit_predict(vectors)
    centroids[num_centroids] = kmeans.cluster_centers_

    # silhouette_score is the mean of silhouette_samples, so the expensive
    # pairwise computation is done once and averaged here.
    sample_silhouette_values = silhouette_samples(vectors, lbl)
    silhouette_avg = np.mean(sample_silhouette_values)
    print("SILHOUETTE", silhouette_avg)
    # NOTE(review): newer scikit-learn releases spell this function
    # calinski_harabasz_score and no longer ship the old name; adjust the
    # import when upgrading.
    calinski_score = calinski_harabaz_score(vectors, lbl)
    print("CALINSKI", calinski_score)

    store_path = os.path.join(results_dir, results_prefix + "kmeans[{n}]".format(n=num_centroids))
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)
    df[cluster_lbl] = lbl
    plotting.k_means(plot_vectors, num_centroids, lbl, sample_silhouette_values, silhouette_avg, store_path)

if cfg.getboolean("kmeans", "store_centroids"):
    with open(centroids_file, "wb") as dump_file:
        pickle.dump(centroids, dump_file)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

Num Centroids: 2


  return_n_iter=True)


SILHOUETTE 0.2522863929153511
CALINSKI 20304.2878004554
data/results/[WIKIDATA_NOBOT]kmeans[2]_silhouette.png
data/results/[WIKIDATA_NOBOT]kmeans[2]_silhouette.pdf
data/results/[WIKIDATA_NOBOT]kmeans[2]_clusters.png
data/results/[WIKIDATA_NOBOT]kmeans[2]_clusters.pdf
data/results/[WIKIDATA_NOBOT]kmeans[2]_clusters_planar.png
data/results/[WIKIDATA_NOBOT]kmeans[2]_clusters_planar.pdf
data/results/[WIKIDATA_NOBOT]kmeans[2]_population.png
data/results/[WIKIDATA_NOBOT]kmeans[2]_population.pdf
Num Centroids: 3


  return_n_iter=True)


SILHOUETTE 0.26854869696339745
CALINSKI 18170.364651461598
data/results/[WIKIDATA_NOBOT]kmeans[3]_silhouette.png
data/results/[WIKIDATA_NOBOT]kmeans[3]_silhouette.pdf
data/results/[WIKIDATA_NOBOT]kmeans[3]_clusters.png
data/results/[WIKIDATA_NOBOT]kmeans[3]_clusters.pdf
data/results/[WIKIDATA_NOBOT]kmeans[3]_clusters_planar.png
data/results/[WIKIDATA_NOBOT]kmeans[3]_clusters_planar.pdf
data/results/[WIKIDATA_NOBOT]kmeans[3]_population.png
data/results/[WIKIDATA_NOBOT]kmeans[3]_population.pdf
Num Centroids: 4


  return_n_iter=True)


SILHOUETTE 0.31272840656289347
CALINSKI 20483.15254468707
data/results/[WIKIDATA_NOBOT]kmeans[4]_silhouette.png
data/results/[WIKIDATA_NOBOT]kmeans[4]_silhouette.pdf
data/results/[WIKIDATA_NOBOT]kmeans[4]_clusters.png
data/results/[WIKIDATA_NOBOT]kmeans[4]_clusters.pdf
data/results/[WIKIDATA_NOBOT]kmeans[4]_clusters_planar.png
data/results/[WIKIDATA_NOBOT]kmeans[4]_clusters_planar.pdf
data/results/[WIKIDATA_NOBOT]kmeans[4]_population.png
data/results/[WIKIDATA_NOBOT]kmeans[4]_population.pdf
Num Centroids: 5


  return_n_iter=True)


SILHOUETTE 0.3392383083880852
CALINSKI 20559.535375746127
data/results/[WIKIDATA_NOBOT]kmeans[5]_silhouette.png
data/results/[WIKIDATA_NOBOT]kmeans[5]_silhouette.pdf
data/results/[WIKIDATA_NOBOT]kmeans[5]_clusters.png
data/results/[WIKIDATA_NOBOT]kmeans[5]_clusters.pdf
data/results/[WIKIDATA_NOBOT]kmeans[5]_clusters_planar.png
data/results/[WIKIDATA_NOBOT]kmeans[5]_clusters_planar.pdf
data/results/[WIKIDATA_NOBOT]kmeans[5]_population.png
data/results/[WIKIDATA_NOBOT]kmeans[5]_population.pdf
Num Centroids: 6


  return_n_iter=True)


SILHOUETTE 0.353501497521209
CALINSKI 19649.451429386274
data/results/[WIKIDATA_NOBOT]kmeans[6]_silhouette.png
data/results/[WIKIDATA_NOBOT]kmeans[6]_silhouette.pdf
data/results/[WIKIDATA_NOBOT]kmeans[6]_clusters.png
data/results/[WIKIDATA_NOBOT]kmeans[6]_clusters.pdf
data/results/[WIKIDATA_NOBOT]kmeans[6]_clusters_planar.png
data/results/[WIKIDATA_NOBOT]kmeans[6]_clusters_planar.pdf
data/results/[WIKIDATA_NOBOT]kmeans[6]_population.png
data/results/[WIKIDATA_NOBOT]kmeans[6]_population.pdf
Num Centroids: 7


  return_n_iter=True)


SILHOUETTE 0.3559297057711048
CALINSKI 19547.371719608604
data/results/[WIKIDATA_NOBOT]kmeans[7]_silhouette.png
data/results/[WIKIDATA_NOBOT]kmeans[7]_silhouette.pdf
data/results/[WIKIDATA_NOBOT]kmeans[7]_clusters.png
data/results/[WIKIDATA_NOBOT]kmeans[7]_clusters.pdf
data/results/[WIKIDATA_NOBOT]kmeans[7]_clusters_planar.png
data/results/[WIKIDATA_NOBOT]kmeans[7]_clusters_planar.pdf
data/results/[WIKIDATA_NOBOT]kmeans[7]_population.png
data/results/[WIKIDATA_NOBOT]kmeans[7]_population.pdf
Num Centroids: 8


  return_n_iter=True)


SILHOUETTE 0.36213452426919207
CALINSKI 18573.31047873947
data/results/[WIKIDATA_NOBOT]kmeans[8]_silhouette.png
data/results/[WIKIDATA_NOBOT]kmeans[8]_silhouette.pdf
data/results/[WIKIDATA_NOBOT]kmeans[8]_clusters.png
data/results/[WIKIDATA_NOBOT]kmeans[8]_clusters.pdf
data/results/[WIKIDATA_NOBOT]kmeans[8]_clusters_planar.png
data/results/[WIKIDATA_NOBOT]kmeans[8]_clusters_planar.pdf
data/results/[WIKIDATA_NOBOT]kmeans[8]_population.png
data/results/[WIKIDATA_NOBOT]kmeans[8]_population.pdf
Num Centroids: 9


  return_n_iter=True)


SILHOUETTE 0.31876715908836595
CALINSKI 18340.252075103064
data/results/[WIKIDATA_NOBOT]kmeans[9]_silhouette.png
data/results/[WIKIDATA_NOBOT]kmeans[9]_silhouette.pdf
data/results/[WIKIDATA_NOBOT]kmeans[9]_clusters.png
data/results/[WIKIDATA_NOBOT]kmeans[9]_clusters.pdf
data/results/[WIKIDATA_NOBOT]kmeans[9]_clusters_planar.png
data/results/[WIKIDATA_NOBOT]kmeans[9]_clusters_planar.pdf
data/results/[WIKIDATA_NOBOT]kmeans[9]_population.png
data/results/[WIKIDATA_NOBOT]kmeans[9]_population.pdf
Num Centroids: 10


  return_n_iter=True)


SILHOUETTE 0.33059131450815116
CALINSKI 18010.059175859664
data/results/[WIKIDATA_NOBOT]kmeans[10]_silhouette.png
data/results/[WIKIDATA_NOBOT]kmeans[10]_silhouette.pdf
data/results/[WIKIDATA_NOBOT]kmeans[10]_clusters.png
data/results/[WIKIDATA_NOBOT]kmeans[10]_clusters.pdf
data/results/[WIKIDATA_NOBOT]kmeans[10]_clusters_planar.png
data/results/[WIKIDATA_NOBOT]kmeans[10]_clusters_planar.pdf
data/results/[WIKIDATA_NOBOT]kmeans[10]_population.png
data/results/[WIKIDATA_NOBOT]kmeans[10]_population.pdf



In [35]:
# Persist the frame, including the cluster label columns, in the exchange directory.
clustering_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_clustering.p")
df.to_pickle(clustering_file)

In [36]:
# Entry point for re-runs: load the stored clustering result here to skip the
# calculations above if they have already been done.
clustering_file = os.path.join(cfg.get("directory", "exchange"), results_prefix + "001_clustering.p")
df = pd.read_pickle(clustering_file)

In [37]:
# Subset of action labels used for the reduced transition matrices, kept in
# alphabetical order.
labels_filtered = [
    "BREAK",
    "DESCRIPTION_UPDATE",
    "CLAIM_CREATE",
    "REFERENCE_ADD",
    "ENTITY_OVERRIDE",
    "ENTITY_REDIRECT",
    "MERGE",
    "DESCRIPTION_ADD",
    "LABEL_ADD",
    "QUALIFIER_ADD",
    "CLAIM_UPDATE",
    "SITELINK_ADD",
    "SITELINK_UPDATE",
]
labels_filtered.sort()
labels_filtered

['BREAK',
 'CLAIM_CREATE',
 'CLAIM_UPDATE',
 'DESCRIPTION_ADD',
 'DESCRIPTION_UPDATE',
 'ENTITY_OVERRIDE',
 'ENTITY_REDIRECT',
 'LABEL_ADD',
 'MERGE',
 'QUALIFIER_ADD',
 'REFERENCE_ADD',
 'SITELINK_ADD',
 'SITELINK_UPDATE']

In [42]:
# For every cluster count and every cluster, pool the action transitions of
# the member users and plot the full and the reduced transition matrix.

# The per-user from/to tables do not depend on the cluster count, so they are
# built once here instead of once per entry of `kernel_range`.
user_transitions = []
for sequence in df['sequence']:
    action_df = pd.DataFrame({"from": sequence})
    # Each action is paired with its successor; the last action of a sequence
    # has no successor and gets NaN in "to".
    action_df["to"] = action_df["from"].shift(periods=-1)
    user_transitions.append(action_df)

results_dir = cfg.get("directory", "results")
for num_centroids in kernel_range:
    cluster_lbl = "kmeans_{n}".format(n=num_centroids)

    # Group the per-user tables by the cluster assigned for this cluster count.
    cluster = defaultdict(list)
    for cluster_id, action_df in zip(df[cluster_lbl], user_transitions):
        cluster[cluster_id].append(action_df)

    for cluster_index in sorted(cluster):
        cluster_df = pd.concat(cluster[cluster_index])

        # Absolute and row-normalised transition tables, each computed once.
        transition_counts = pd.crosstab(cluster_df['from'], cluster_df['to'])
        transition_shares = pd.crosstab(cluster_df['from'], cluster_df['to'], normalize="index")

        transition_count_pivot = lib.stretch_pivot(transition_counts, labels)
        # Row totals, i.e. outgoing transitions per "from" action.
        unnormalized = transition_count_pivot.sum(axis=1)
        # lib.stretch_pivot is defined outside this notebook; pass a copy so the
        # shared table cannot be altered before it is reused for the reduced plot.
        pivot = lib.stretch_pivot(transition_shares.copy(), labels)

        # NOTE(review): the recorded run of this cell stopped with
        # "TypeError: transition_matrix() got an unexpected keyword argument
        # 'upper_limit'". The calls below are unchanged; reconcile them with the
        # current signature of plotting.transition_matrix.
        store_path = os.path.join(results_dir, results_prefix + "clusters[{n}_{i}]".format(n=num_centroids, i=cluster_index))
        plotting.transition_matrix((50, 60), unnormalized, pivot, transition_count_pivot, cluster_index, store_path, upper_limit=1e7, logscale=True)

        # Same plot restricted to the labels in `labels_filtered`.
        transition_count_pivot_reduced = lib.stretch_pivot(transition_count_pivot, labels_filtered)
        unnormalized_reduced = transition_count_pivot_reduced.sum(axis=1)
        pivot_reduced = lib.stretch_pivot(transition_shares, labels_filtered)

        store_path = os.path.join(results_dir, results_prefix + "clusters[{n}_{i}]reduced".format(n=num_centroids, i=cluster_index))
        plotting.transition_matrix((20, 25),unnormalized_reduced, pivot_reduced, transition_count_pivot_reduced, cluster_index, store_path, upper_limit=1e7, logscale=True)
print("Done...")

TypeError: transition_matrix() got an unexpected keyword argument 'upper_limit'