In [1]:
import os
import pandas as pd

In [2]:
# Read project settings (directories, dataset file names, preprocessing thresholds)
# from config.cfg in the current working directory.
from configparser import ConfigParser
cfg = ConfigParser()
# read() returns the list of files it could parse; a missing file is skipped
# silently, so an empty list here means config.cfg was not found.
cfg.read("config.cfg")

['config.cfg']

In [3]:
# Import the notebook progress bar explicitly, as the plain console one struggles
# with jupyterlab. `tqdm.notebook` is the public module; the private
# `tqdm._tqdm_notebook` path is deprecated and only kept as a fallback for old
# tqdm releases that do not ship `tqdm.notebook` yet.
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas() # this enables us to use progress_apply instead of apply

In [4]:
# Load the revision table from the exchange directory
# (presumably written by the 002 "extract actions" step — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files this pipeline wrote itself.
df = pd.read_pickle(os.path.join(cfg.get("directory", "exchange"), "[wikidata]002_extract_actions.p"))

In [5]:
# Preview the first rows of the loaded revisions.
df.head()

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name,action_base,action_digit,action_language
0,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid,wbsetdescription-add,1,ar
1,"/* wbsetlabel-add:1|sl */ Gerhard Raff, #quick...",wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2018-01-29 13:38:28+00:00,23475,,Sporti,wbsetlabel-add,1,sl
2,/* wbsetsitelink-add:1|enwiki */ Wolfram Wette,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-01-04 17:08:50+00:00,17848,,Assayer,wbsetsitelink-add,1,enwiki
3,/* wbsetreference-add:2| */ [[Property:P937]]:...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 08:21:58+00:00,38324,,Olaf Kosinsky,wbsetreference-add,2,
4,/* wbcreateclaim-create:1| */ [[Property:P1412...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 14:59:40+00:00,38324,,Olaf Kosinsky,wbcreateclaim-create,1,


### Load mapping if we have one

In [6]:
# Load the previously stored action_base -> label mapping, if there is one.
label_dict = {}
mapping_file = os.path.join(cfg.get("directory", "dataset"), cfg.get("dataset", "wikidata_aux"), cfg.get("dataset", "wikidata_full_labels"))
if os.path.isfile(mapping_file): # load it manually if we have a mapping already
    df_label = pd.read_csv(mapping_file)
    # Build the lookup straight from the two columns instead of walking the frame
    # row by row with iterrows(); on duplicate action_base values the last row wins,
    # exactly as it did with the loop.
    label_dict = dict(zip(df_label['action_base'], df_label['label']))
else:
    # No mapping stored yet: start with an empty table that has the expected columns.
    df_label = pd.DataFrame({"action_base": [], "label": [], "label_readable": []})
# label_dict

In [7]:
# Number of revisions per raw action identifier, most frequent first.
action_counts = df['action_base'].value_counts()

### Manual mapping for undefined actions

Wikidata API reference: https://www.mediawiki.org/wiki/Wikibase/API/en#wblinktitles

In [15]:
# Show the current action_base -> label mapping table.
df_label

Unnamed: 0,action_base,label,label_readable
0,wbcreateclaim-create,CLAIM_CREATE,Create Claim
1,wbsetdescription-add,DESCRIPTION_ADD,Add Description
2,wbsetreference-add,REFERENCE_ADD,Add Reference
3,wbsetclaim-create,CLAIM_CREATE,Create Claim
4,wbsetlabel-add,LABEL_ADD,Add Label
5,wbsetdescription-set,DESCRIPTION_UPDATE,Edit Description
6,wbeditentity-update,ENTITY_UPDATE,Edit Item
7,wbsetqualifier-add,QUALIFIER_ADD,Add Qualifier
8,wbsetsitelink-add,SITELINK_ADD,Add Sitelink
9,wbsetclaim-update,CLAIM_UPDATE,Edit Claim


In [9]:
# Interactively ask for a label for every action that has no mapping yet.
# Actions are visited in order of decreasing frequency (order of action_counts).
known_actions = set(df_label['action_base'])
new_label_rows = []
for a, c in action_counts.items():  # .items() replaces iteritems(), which pandas 2.x removed
    if a in known_actions:
        continue
    print("({c}) Action: {a}".format(a=a, c=c))
    label = input("Abstract Label")
    label_readable = input("Human Readable Label")
    new_label_rows.append({'action_base' : a, "label": label, "label_readable": label_readable})

# DataFrame.append returned a new frame that was never assigned, so the answers
# were lost. Collect the rows and extend df_label once; this also avoids
# DataFrame.append, which pandas 2.x removed.
if new_label_rows:
    df_label = pd.concat([df_label, pd.DataFrame(new_label_rows)], ignore_index=True, sort=False)

In [10]:
# Make sure every abstract label carries exactly one human readable label:
#   - exactly one distinct readable value  -> copy it to all rows of that label
#   - several distinct readable values     -> show the conflict and ask
#   - no readable value at all             -> ask
for label in df_label['label'].unique():
    print("#"*50)
    print(label, end=": ")
    rows_with_label = df_label['label'] == label
    readable = df_label.loc[rows_with_label, 'label_readable']
    # A missing readable label is "" when it was just typed in, but NaN once the
    # mapping went through to_csv/read_csv. Treat both as missing, otherwise a
    # label whose only value is NaN would be "resolved" to NaN and never asked for.
    filled = readable[readable.notna() & (readable != "")]
    distinct = filled.unique()
    if len(distinct) == 1:
        df_label.loc[rows_with_label, "label_readable"] = distinct[0]
        print(distinct[0])
        continue
    elif len(distinct) > 1:
        print("Conflicting labels:")
        print(filled.values)

    label_readable = input("Readable Label:")
    df_label.loc[rows_with_label, "label_readable"] = label_readable

##################################################
CLAIM_CREATE: Create Claim
##################################################
DESCRIPTION_ADD: Add Description
##################################################
REFERENCE_ADD: Add Reference
##################################################
LABEL_ADD: Add Label
##################################################
DESCRIPTION_UPDATE: Edit Description
##################################################
ENTITY_UPDATE: Edit Item
##################################################
QUALIFIER_ADD: Add Qualifier
##################################################
SITELINK_ADD: Add Sitelink
##################################################
CLAIM_UPDATE: Edit Claim
##################################################
CLAIM_REMOVE: Remove Claim
##################################################
ENTITY_CREATE: Create Item
##################################################
LABEL_UPDATE: Edit Label
##################################################
ALIAS

In [14]:
# Drop the "Unnamed: 0" column if a save/load round trip introduced it;
# errors="ignore" makes this a no-op when the column is absent.
df_label = df_label.drop(columns=["Unnamed: 0"], errors="ignore")

In [16]:
# Store the mapping at mapping_file so later runs can reuse it;
# index=False keeps the row index out of the CSV.
df_label.to_csv(mapping_file, index=False)

### Add label to revision

In [17]:
# Number of revisions before the labels are merged in.
len(df)

144308536

In [18]:
# Attach label / label_readable to every revision via its action_base.
# how="left" keeps revisions whose action has no mapping (their label columns are NaN).
# validate="many_to_one" raises pandas.errors.MergeError if df_label lists an
# action_base more than once, which would otherwise silently duplicate revisions.
df = df.merge(right=df_label, how="left", on="action_base", validate="many_to_one")

In [19]:
# Preview the merged frame; label and label_readable are the newly added columns.
df.head()

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name,action_base,action_digit,action_language,label,label_readable
0,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid,wbsetdescription-add,1,ar,DESCRIPTION_ADD,Add Description
1,"/* wbsetlabel-add:1|sl */ Gerhard Raff, #quick...",wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2018-01-29 13:38:28+00:00,23475,,Sporti,wbsetlabel-add,1,sl,LABEL_ADD,Add Label
2,/* wbsetsitelink-add:1|enwiki */ Wolfram Wette,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-01-04 17:08:50+00:00,17848,,Assayer,wbsetsitelink-add,1,enwiki,SITELINK_ADD,Add Sitelink
3,/* wbsetreference-add:2| */ [[Property:P937]]:...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 08:21:58+00:00,38324,,Olaf Kosinsky,wbsetreference-add,2,,REFERENCE_ADD,Add Reference
4,/* wbcreateclaim-create:1| */ [[Property:P1412...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 14:59:40+00:00,38324,,Olaf Kosinsky,wbcreateclaim-create,1,,CLAIM_CREATE,Create Claim


In [20]:
# Collect the labels that occur at most `min_label_occurrency` times (threshold from config).
label_frequency = df['label'].value_counts()
occurrence_threshold = cfg.getint("preprocessing", "min_label_occurrency")
is_too_rare = label_frequency <= occurrence_threshold
labels_to_remove = label_frequency.index[is_too_rare].tolist()
labels_to_remove

['SITELINK_BADGE']

In [21]:
# Drop every revision whose label is in labels_to_remove and report the row count
# before and after. Rows without a label (NaN) are not in the list and are kept.
report = "Num revisions {when} removal: {n}"
print(report.format(when="before", n=len(df)))
keep_row = ~df['label'].isin(labels_to_remove)
df = df[keep_row].reset_index(drop=True)
print(report.format(when="after", n=len(df)))

Num revisions before removal: 144308536
Num revisions after removal: 144308535


In [22]:
# Count the distinct labels left after filtering.
# Note: unique() includes NaN, so unlabeled revisions would add one to this number.
print("Num unique labels: {n}".format(n=len(df['label'].unique())))

Num unique labels: 32


In [23]:
# Revisions per label, most frequent first (NaN labels are not listed by value_counts).
df['label'].value_counts()

CLAIM_CREATE          47888513
DESCRIPTION_ADD       29853091
REFERENCE_ADD         13134584
LABEL_ADD             11397616
DESCRIPTION_UPDATE     8473641
ENTITY_UPDATE          5942594
QUALIFIER_ADD          4845621
SITELINK_ADD           4604349
CLAIM_UPDATE           3210528
CLAIM_REMOVE           2994509
ENTITY_CREATE          2979071
ALIAS_ADD              1687516
LABEL_UPDATE           1678385
MERGE                  1508571
SITELINK_UPDATE        1321034
SITELINK_REMOVE         720938
ENTITY_REDIRECT         616856
CLAIM_UPDATEVALUE       257965
REVERT                  225706
ENTITY_OVERRIDE         202625
DESCRIPTION_REMOVE      184359
ALIAS_UPDATE            177824
REVERT_MANUAL           128532
LABEL_REMOVE             89942
ALIAS_REMOVE             89086
QUALIFIER_REMOVE         42720
REFERENCE_REMOVE         22849
QUALIFIER_UPDATE         22463
REFERENCE_UPDATE          5092
PROTECT_MANUAL            1490
PROPERTY_CREATE            452
REMOVE_MANUAL               13
Name: la

In [25]:
# Same overview for the human readable labels. Several labels may share one
# readable name, so this count can be lower than the number of labels.
print("Num unique labels (readable): {n}".format(n=len(df['label_readable'].unique())))
df['label_readable'].value_counts()

Num unique labels (readable): 31


Create Claim          47888513
Add Description       29853091
Add Reference         13134584
Add Label             11397616
Edit Description       8473641
Edit Item              5942594
Add Qualifier          4845621
Add Sitelink           4604349
Edit Claim             3210528
Remove Claim           2994509
Create Item            2979071
Add Item Alias         1687516
Edit Label             1678385
Merge Items            1508571
Edit Sitelink          1321034
Remove Sitelink         720938
Redirect Item           616856
Revert Item             354238
Edit Claim Value        257965
Override Item           202625
Remove Description      184359
Edit Alias              177824
Remove Label             89942
Remove Alias             89086
Remove Qualifier         42720
Remove Reference         22849
Edit Qualifier           22463
Edit Reference            5092
Protect Item              1490
Create Property            452
Remove Item                 13
Name: label_readable, dtype: int64

### Store data

In [24]:
# Write the labeled revisions to the exchange directory
# (presumably for the next notebook in the pipeline — confirm).
df.to_pickle(os.path.join(cfg.get("directory", "exchange"), "[wikidata]003_assign_labels.p"))