In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import numpy as np
import pandas as pd

In [3]:
import lib
import plotting
import markov_chain as mc
import markov_tools as mt

In [4]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison

In [5]:
from configparser import ConfigParser
# Parse the notebook configuration. ConfigParser.read() returns the list of
# files it parsed successfully and silently skips missing ones, so the
# displayed result should be ['config.cfg'].
cfg = ConfigParser()
cfg.read("config.cfg")

['config.cfg']

In [6]:
# Import the notebook flavour of tqdm explicitly, as the plain console one
# struggles with JupyterLab. The public home of that class is `tqdm.notebook`;
# `tqdm._tqdm_notebook` is a private, deprecated alias, so it is only used as a
# fallback for older tqdm releases that do not ship `tqdm.notebook` yet.
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()  # this enables us to use progress_apply instead of apply

In [7]:
# Prefix prepended to the clustering pickle's file name.
results_prefix = "[WIKIDATA_PAPER]"

# Both input files are located in the "exchange" directory from the config.
exchange_dir = cfg.get("directory", "exchange")
sequence_name = results_prefix + "001_clustering.p"
labels_name = "[wikidata]004_labels_readable.p"

sequence_file = os.path.join(exchange_dir, sequence_name)
labels_file = os.path.join(exchange_dir, labels_name)

In [8]:
# Load the pickled DataFrame stored at sequence_file and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(sequence_file)
df.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,sequence_readable,user_id,user_name,stat_dist,kmeans_2,kmeans_3,kmeans_4,kmeans_5,kmeans_6,kmeans_7,kmeans_8,kmeans_9,kmeans_10
0,False,False,1517,1385,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...","[Revert Item, Break, Create Claim, Break, Crea...",1,Hoo man,"[0.033592832488441825, 0.029154644609674315, 0...",0,2,1,2,0,4,6,2,2
1,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_ADD, BREAK, ME...","[Create Item, Break, Add Sitelink, Break, Merg...",1000036,JShenk,"[0.02643738030717696, 0.02643738030717696, 0.0...",1,0,2,4,3,3,1,1,1
2,False,False,7,4,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",1000078,Egor-belikov,"[0.025412586144359806, 0.025412586144359806, 0...",1,1,0,3,1,2,4,4,4
3,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_REMOVE, SITELI...","[Create Item, Break, Remove Sitelink, Add Site...",100008,Wars,"[0.027056268692473535, 0.02705626869247354, 0....",1,1,0,3,1,2,4,1,1
4,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...","[Add Sitelink, Break, Add Sitelink, Break, Add...",100012,Kane 14,"[0.025862068965517248, 0.025862068965517244, 0...",1,1,0,3,1,2,4,4,4


In [9]:
# Take the 'label' column of the pickled labels table as a plain array.
# Presumably ordered like the entries of each stat_dist vector (the Tukey loop
# indexes both with the same position) -- TODO confirm.
labels = pd.read_pickle(labels_file)['label'].values

We test only one clustering (number of clusters) at a time.  
For each label/action we conduct a pairwise Tukey's HSD test to highlight the actions that show a significant difference between clusters.

Note: as this is a pairwise comparison for each single label this generates a lot of output!

In [10]:
# Length of the first row's stat_dist vector (compared with len(labels) below).
len(df.iloc[0]['stat_dist'])

32

In [11]:
# Number of labels; the per-label loop below relies on one label per stat_dist entry.
len(labels)

32

In [12]:
# Number of clusters of the clustering under test; it determines which
# "kmeans_<n>" column of df is used as the grouping variable.
num_clusters = 4
selector_template = "kmeans_{n}"
df_selector = selector_template.format(n=num_clusters)

In [13]:
# For every label, run Tukey's HSD across the clusters on that label's entry of
# stat_dist and keep the labels for which *every* pairwise cluster comparison
# is rejected at alpha = 0.01.
lbls = df[df_selector]  # cluster assignment per row; identical for every label
sig_labels = []
for i, label in enumerate(labels):
    # i-th component of each row's stat_dist vector
    vals = df['stat_dist'].apply(lambda x: x[i])
    comp_object = MultiComparison(vals, lbls)
    hsd = comp_object.tukeyhsd(alpha=0.01)

    # hsd.reject holds one boolean per cluster pair. Taking the total from it
    # (instead of a literal 6, which is only right for 4 clusters) keeps the
    # report and the threshold correct when num_clusters changes.
    n_rejected = int(hsd.reject.sum())
    n_pairs = len(hsd.reject)
    print("{l}: {s} / {t}".format(l=label, s=n_rejected, t=n_pairs))
    # Use print(hsd.summary()) here to inspect the full pairwise table.

    if n_rejected == n_pairs:
        sig_labels.append(label)

Add Description: 5 / 6
Add Item Alias: 5 / 6
Add Label: 6 / 6
Add Qualifier: 5 / 6
Add Reference: 5 / 6
Add Sitelink: 6 / 6
Break: 6 / 6
Create Claim: 5 / 6
Create Item: 6 / 6
Create Property: 5 / 6
Edit Alias: 5 / 6
Edit Claim: 5 / 6
Edit Claim Value: 6 / 6
Edit Description: 5 / 6
Edit Item: 5 / 6
Edit Label: 6 / 6
Edit Qualifier: 5 / 6
Edit Reference: 5 / 6
Edit Sitelink: 6 / 6
Merge Items: 5 / 6
Override Item: 5 / 6
Protect Item: 5 / 6
Redirect Item: 5 / 6
Remove Alias: 5 / 6
Remove Claim: 5 / 6
Remove Description: 5 / 6
Remove Item: 5 / 6
Remove Label: 5 / 6
Remove Qualifier: 5 / 6
Remove Reference: 5 / 6
Remove Sitelink: 6 / 6
Revert Item: 5 / 6


In [14]:
# Labels for which all pairwise cluster comparisons were rejected.
sig_labels

['Add Label',
 'Add Sitelink',
 'Break',
 'Create Item',
 'Edit Claim Value',
 'Edit Label',
 'Edit Sitelink',
 'Remove Sitelink']

### TOP Labels

In [15]:
# Group the rows by their cluster assignment in the selected k-means column.
cluster_grp = df.groupby(df_selector)

In [16]:
from collections import defaultdict

In [32]:
# Flatten the readable action sequences of all rows, cluster by cluster, into
# one Series.
# The per-cluster arrays are collected first and joined once: appending with
# np.concatenate inside the loop re-copies the growing array on every pass, and
# seeding it with np.array([]) (float64) mixes a numeric dtype into string data.
seq_parts = [
    np.concatenate(grp['sequence_readable'].values)
    for c_id, grp in cluster_grp
]
if seq_parts:
    full_seq = pd.Series(np.concatenate(seq_parts))
else:
    # np.concatenate rejects an empty list; keep the "no groups" case working.
    full_seq = pd.Series([], dtype=object)

In [37]:
# Occurrences of each action over all sequences (value_counts sorts by
# descending frequency). full_seq is already a Series, so the pd.Series(...)
# wrap is redundant but harmless.
counts = pd.Series(full_seq).value_counts()

In [39]:
# The ten most frequent actions, listed alphabetically.
top_actions = counts.head(10).index
sorted(top_actions)

['Add Description',
 'Add Label',
 'Add Qualifier',
 'Add Reference',
 'Add Sitelink',
 'Break',
 'Create Claim',
 'Create Item',
 'Edit Claim',
 'Remove Claim']