In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd

In [3]:
from multiprocessing.pool import ThreadPool
from concurrent.futures import ThreadPoolExecutor

In [28]:
from configparser import ConfigParser
cfg = ConfigParser()
cfg.read("config.cfg")

['config.cfg']

In [5]:
from tqdm._tqdm_notebook import tqdm_notebook as tqdm # we manually import the notebook submodule as the normal one struggles with jupyterlab
tqdm.pandas() # this enables us to use progress_apply instead of apply

In [6]:
import experimental

In [7]:
df_revision = pd.read_pickle(os.path.join(cfg.get("directory", "exchange"), "[wikidata]003_assign_labels.p"))
df_revision.head()

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name,action_base,action_digit,action_language,label,label_readable
0,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid,wbsetdescription-add,1,ar,DESCRIPTION_ADD,Add Description
1,"/* wbsetlabel-add:1|sl */ Gerhard Raff, #quick...",wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2018-01-29 13:38:28+00:00,23475,,Sporti,wbsetlabel-add,1,sl,LABEL_ADD,Add Label
2,/* wbsetsitelink-add:1|enwiki */ Wolfram Wette,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-01-04 17:08:50+00:00,17848,,Assayer,wbsetsitelink-add,1,enwiki,SITELINK_ADD,Add Sitelink
3,/* wbsetreference-add:2| */ [[Property:P937]]:...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 08:21:58+00:00,38324,,Olaf Kosinsky,wbsetreference-add,2,,REFERENCE_ADD,Add Reference
4,/* wbcreateclaim-create:1| */ [[Property:P1412...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 14:59:40+00:00,38324,,Olaf Kosinsky,wbcreateclaim-create,1,,CLAIM_CREATE,Create Claim


In [9]:
counts = df_revision['user_id'].value_counts()

In [10]:
counts.head()

2883061    11910499
133116     10056842
2912832     5172205
1822        3818472
78009       3522006
Name: user_id, dtype: int64

In [11]:
df_revision['user_name'].value_counts().head()

Renamerr           11910499
Harej              10056842
Artix Kreiger 2     5172205
Nikosguard          3818472
Ghuron              3522006
Name: user_name, dtype: int64

In [12]:
grouped_users = df_revision.groupby("user_id")

In [13]:
break_label = cfg.get("preprocessing", "break_label")
break_delta = pd.Timedelta(cfg.get("preprocessing", "break_min_delta"))

In [22]:
# use this implementation for singlecore
users = []
with tqdm(desc="Inner Progress") as inner_progress: # a small hack to have an inner progress bar that resets every iteration
    for user_id, revisions in tqdm(grouped_users, desc="Total Progress"):
        revisions_sorted = revisions.sort_values("timestamp")
        user_name = revisions_sorted['user_name'].iloc[0]
        sequence = []
        sequence_nobreak = []
        last_ts = None
        len_rev = len(revisions_sorted)
        inner_progress.total = len_rev
        for i, (timestamp, label) in enumerate(revisions_sorted[["timestamp", "label"]].values): # does the same as iterrows below but much much faster 
        # for i, (_, r) in enumerate(revisions_sorted.iterrows()):
            # timestamp = r['timestamp']
            # label = r['label']
            inner_progress.update()
            if i > 0:
                if timestamp > last_ts + break_delta:
                    sequence.append(break_label)
            sequence.append(label)
            sequence_nobreak.append(label)
            last_ts = timestamp

        bot_sequence = experimental.detect_bot(revisions_sorted)
        bot_name = user_name.lower().endswith("bot") or user_name.lower().endswith("bot*")
        user_package = {
            "user_id": user_id, 
            "user_name": user_name, 
            "length": len(sequence), 
            "length_nobreak": len(sequence_nobreak),
            "sequence": sequence, 
            "bot_sequence": bot_sequence,
            "bot_name": bot_name
        }
        users.append(user_package)
        inner_progress.n = 0
        inner_progress.last_print_n = 0

HBox(children=(IntProgress(value=1, bar_style='info', description='Inner Progress', max=1, style=ProgressStyle…

HBox(children=(IntProgress(value=0, description='Total Progress', max=88298, style=ProgressStyle(description_w…

In [15]:
print("done sequencing....")

done sequencing....


In [23]:
df_user = pd.DataFrame(users)

In [24]:
df_user.head()

Unnamed: 0,bot_name,bot_sequence,length,length_nobreak,sequence,user_id,user_name
0,False,False,1517,1385,"[REVERT_MANUAL, BREAK, CLAIM_CREATE, BREAK, CL...",1,Hoo man
1,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_ADD, BREAK, ME...",1000036,JShenk
2,False,False,7,4,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...",1000078,Egor-belikov
3,False,False,9,6,"[ENTITY_CREATE, BREAK, SITELINK_REMOVE, SITELI...",100008,Wars
4,False,False,44,23,"[SITELINK_ADD, BREAK, SITELINK_ADD, BREAK, SIT...",100012,Kane 14


### Drop detected bots

In [25]:
print("Num Bots via name:  {n}".format(n=sum(df_user['bot_name'])))
print("Num Bots via sequence:  {n}".format(n=sum(df_user['bot_sequence'])))
bot_selector = (df_user['bot_name']) | (df_user['bot_sequence'])
print("Num affected actions: {n}".format(n=df_user.loc[bot_selector, "length_nobreak"].sum()))

Num Bots via name:  109
Num Bots via sequence:  37
Num affected actions: 61887872


In [26]:
print("Length before: {l}".format(l=len(df_user)))
df_user = df_user.loc[~bot_selector].reset_index(drop=True) # actually remove em from the df
print("Length before: {l}".format(l=len(df_user)))

Length before: 88298
Length before: 88152


In [None]:
#list the most active users
display(df_user.sort_values("length_nobreak", ascending=False).head())

### Remove manual bots

In [None]:
bot_file = os.path.join(cfg.get("directory", "dataset"), cfg.get("dataset", "wikidata_aux"), "[BOTS]manual.txt")
print(bot_file)

### Store data

In [14]:
df_user.to_pickle(os.path.join(cfg.get("directory", "exchange"), "[wikidata]004_sequences.p"))

In [29]:
all_labels = set(df_revision['label'].unique())
all_labels.add(break_label)
all_labels = sorted(all_labels)
all_labels_readable = sorted(set(df_revision['label_readable'].unique()))
all_labels_readable.add(cfg.get("preprocessing", "break_label_readable"))
df_label = pd.DataFrame({"label": all_labels, "label_readable": all_labels_readable})
# df_label

SyntaxError: invalid syntax (<ipython-input-29-54922b201ce4>, line 6)

In [16]:
df_label.to_pickle(os.path.join(cfg.get("directory", "exchange"), "[wikidata]004_labels.p"))

In [17]:
"complete..."

'complete...'