In [1]:
import os
import pandas as pd

In [2]:
import re

In [3]:
# Project settings; the cells below read the "directory", "dataset" and "preprocessing" sections.
from configparser import ConfigParser
cfg = ConfigParser()
# read() returns the list of files it managed to parse and silently skips missing ones,
# so the displayed list should contain 'config.cfg'.
cfg.read("config.cfg")

['config.cfg']

In [4]:
# Force the notebook (widget) progress bar: per the original author, the plain one struggles with JupyterLab.
# `tqdm.notebook` is the public location of this class; the private `tqdm._tqdm_notebook`
# path is deprecated and only kept as a fallback for older tqdm releases.
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()  # registers progress_apply on pandas objects (apply with a progress bar)

### Load previous data

In [5]:
# Revision table written by the previous step (file prefix 001) into the exchange directory.
previous_step_path = os.path.join(cfg.get("directory", "exchange"), "[wikidata]001_dataframe_full.p")
df = pd.read_pickle(previous_step_path)

In [6]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name
0,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid
1,"/* wbsetlabel-add:1|sl */ Gerhard Raff, #quick...",wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2018-01-29 13:38:28+00:00,23475,,Sporti
2,/* wbsetsitelink-add:1|enwiki */ Wolfram Wette,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-01-04 17:08:50+00:00,17848,,Assayer
3,/* wbsetreference-add:2| */ [[Property:P937]]:...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 08:21:58+00:00,38324,,Olaf Kosinsky
4,/* wbcreateclaim-create:1| */ [[Property:P1412...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 14:59:40+00:00,38324,,Olaf Kosinsky


### Load manual label matchers

In [7]:
# Manually curated "match" -> "label" table; split_comment falls back to it for comments
# that the structured-comment regex cannot parse.
manual_labels_path = os.path.join(
    cfg.get("directory", "dataset"),
    cfg.get("dataset", "wikidata_aux"),
    cfg.get("dataset", "wikidata_manual_labels"),
)
df_manual_labels = pd.read_csv(manual_labels_path, engine='python')
# Pad every pattern with one space on each side so the later substring test only hits whole words.
# NOTE(review): split_comment lower-cases the comment and removes ":" before testing, so a pattern
# containing upper-case letters or ":" can never match -- confirm the CSV is already normalised.
df_manual_labels['match'] = df_manual_labels['match'].map(lambda pattern: f" {pattern.strip()} ")

In [8]:
comment_regex = re.compile(r"\/\* (.*?):(\d*)\|(.*?)( \*\/|\|)")

def split_comment(comment):
    """Break a revision comment into its action components.

    Comments starting with ``/* <action>:<digits>|<language> */`` (or with a
    further ``|`` after the language field) are parsed with ``comment_regex``.
    Any other non-empty comment is lower-cased, stripped of colons and searched
    for the space-padded patterns in ``df_manual_labels['match']``; the most
    frequent ``label`` among the matching rows becomes the base action.

    Returns a dict with the keys ``action_base``, ``action_digit`` and
    ``action_language``. Each value is a string, and is empty when it could not
    be determined (empty or missing comment, or no pattern matched).
    """
    action, number, language = "", "", ""

    if not (comment == "" or pd.isna(comment)):
        parsed = comment_regex.match(comment)
        if parsed is not None:
            action, number, language = (parsed.group(i).strip() for i in (1, 2, 3))
        else:
            # Not a structured comment: fall back to the manual label patterns.
            # Padding with spaces lets the padded patterns match at the very start or end too.
            normalized = " " + comment.lower().replace(":", "").strip() + " "
            is_hit = df_manual_labels['match'].apply(lambda pattern: pattern in normalized)
            hits = df_manual_labels.loc[is_hit]
            if len(hits) > 0:
                # several patterns may match; keep the label that occurs most often among them
                action = hits['label'].value_counts().idxmax()

    return {
        "action_base": action,
        "action_digit": number,
        "action_language": language
    }

### Extract labels

In [9]:
# Parse every comment (progress_apply is tqdm's apply with a progress bar) and append the
# extracted action_base / action_digit / action_language fields as new columns.
# The parsed frame is given df's own index: concat(axis=1) aligns rows on index labels, so a
# fresh 0..n-1 index would silently misalign (or NaN-pad) rows if df's index were not the default.
df_actions = pd.concat(
    [df, pd.DataFrame(df['comment'].progress_apply(split_comment).tolist(), index=df.index)],
    axis=1,
)

HBox(children=(IntProgress(value=0, max=136771513), HTML(value='')))




In [10]:
# Check that the three action_* columns were appended and filled as expected.
df_actions.head()

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name,action_base,action_digit,action_language
0,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid,wbsetdescription-add,1,ar
1,"/* wbsetlabel-add:1|sl */ Gerhard Raff, #quick...",wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2018-01-29 13:38:28+00:00,23475,,Sporti,wbsetlabel-add,1,sl
2,/* wbsetsitelink-add:1|enwiki */ Wolfram Wette,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-01-04 17:08:50+00:00,17848,,Assayer,wbsetsitelink-add,1,enwiki
3,/* wbsetreference-add:2| */ [[Property:P937]]:...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 08:21:58+00:00,38324,,Olaf Kosinsky,wbsetreference-add,2,
4,/* wbcreateclaim-create:1| */ [[Property:P1412...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 14:59:40+00:00,38324,,Olaf Kosinsky,wbcreateclaim-create,1,


In [11]:
# Revisions for which neither the regex nor the manual labels produced an action.
df_lookup = df_actions[df_actions["action_base"].eq("")]

In [12]:
# Sample of unlabelled revisions whose comment is not the empty string.
df_lookup[df_lookup["comment"].ne("")].head()

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name,action_base,action_digit,action_language
15073,"""Label"" aktualisiert",wikidatawiki-20190101-pages-meta-history1.xml-...,wikitext,,102766,1198,Translations:Wikidata:Glossary/31/de,2017-05-21 14:13:40+00:00,8206,,JakobVoss,,,
30718,"Replaced content with ""mw.loader.using(['media...",wikidatawiki-20190101-pages-meta-history1.xml-...,javascript,,104205,2,User:Danmichaelo/common.js,2017-07-27 04:43:00+00:00,4107,,Danmichaelo,,,
36098,/* Angående betjener (P121) */ new section,wikidatawiki-20190101-pages-meta-history1.xml-...,wikitext,,104609,3,User talk:Danmichaelo,2017-07-07 10:00:52+00:00,1910369,,Pmt,,,
36099,/* Angående betjener (P121) */,wikidatawiki-20190101-pages-meta-history1.xml-...,wikitext,,104609,3,User talk:Danmichaelo,2017-07-07 10:48:32+00:00,4107,,Danmichaelo,,,
36101,/* Angående betjener (P121) */,wikidatawiki-20190101-pages-meta-history1.xml-...,wikitext,,104609,3,User talk:Danmichaelo,2017-07-07 10:52:15+00:00,1910369,,Pmt,,,


### Remove unmatched revisions

In [13]:
# Keep only the revisions that received an action label; report the row count on both sides.
print("Pre unmatched comments", df_actions.shape[0])
df_actions = df_actions[df_actions["action_base"].ne("")].reset_index(drop=True)
print("Post unmatched comments", df_actions.shape[0])

Pre unmatched comments 136771513
Post unmatched comments 136418103


In [14]:
# Number of distinct action labels left after the filtering above.
num_labels = df_actions['action_base'].nunique(dropna=False)
print(f"Num Labels {num_labels}")

Num Labels 69


### Remove all users with a low number of changes (below threshold)

In [15]:
# Minimum number of revisions a user needs in order to be kept (read as an int from the config).
change_threshold = cfg.getint("preprocessing", "min_num_changes")

In [16]:
# Revisions per user_id, sorted descending; value_counts() leaves out missing user_ids by default.
df_num_changes = df_actions["user_id"].value_counts()

In [17]:
# Look at the users and their number of changes (most active first).
# We will try to detect bots and filter for them later!
df_num_changes

2883061    11911579
133116     10056849
1822        3818507
78009       3522027
110062      3085793
609373      2891172
862070      2767516
2886706     2499565
24057       2162074
1062        2018819
171448      1983772
145693      1913672
4115        1598006
23475       1574515
44949       1570616
115508      1330309
1134        1098495
2727990     1078322
38324       1052621
4943        1014233
7150         982441
220959       948067
2580335      862762
9712         859188
609198       841028
3361         836339
2701887      828695
768608       781368
3402         744663
2029201      712463
             ...   
2964115           1
2817099           1
2845311           1
3038553           1
2953803           1
1165888           1
2858180           1
2868421           1
2800226           1
1603539           1
2810815           1
2709799           1
2947527           1
3053121           1
2999125           1
3011466           1
2964516           1
2847982           1
2997439           1


In [18]:
# Drop all revisions made by users with fewer than `change_threshold` changes.
# Filtering is done by exclusion: df_num_changes comes from value_counts(), which skips missing
# user_ids by default, so rows without a user_id are never in the exclusion set and are kept.
print("# Revisions before: {n}".format(n=len(df_actions)))
low_activity_users = df_num_changes.loc[df_num_changes < change_threshold].index
df_actions = df_actions.loc[~df_actions['user_id'].isin(low_activity_users)].reset_index(drop=True)
print("# Revisions after: {n}".format(n=len(df_actions)))

# Revisions before: 136418103
# Revisions before: 136418103


### Store Data

In [19]:
# Persist the labelled, filtered revisions to the exchange directory.
df_actions.to_pickle(os.path.join(cfg.get("directory", "exchange"), "[wikidata]002_extract_actions.p"))