In [1]:
import os
import pandas as pd

In [2]:
# Load the project configuration; later cells read directory and dataset
# settings from `cfg`.
from configparser import ConfigParser
cfg = ConfigParser()
# read() returns the list of files it managed to parse, which is what this
# cell displays.
cfg.read("config.cfg")

['config.cfg']

In [3]:
# The notebook progress bar is imported explicitly because, per the original
# author, the default tqdm bar struggles with JupyterLab.
# NOTE(review): `_tqdm_notebook` is a private module (leading underscore);
# newer tqdm releases expose the same class via `tqdm.notebook` -- confirm the
# installed version before switching.
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
# Registers `progress_apply` on pandas objects as a progress-reporting
# alternative to `apply`.
tqdm.pandas()

In [4]:
# Load the pickled revisions frame from the configured exchange directory.
# NOTE(review): judging by the file name this is the output of the
# "002_extract_actions" step -- confirm. Unpickling executes arbitrary code,
# so only load files produced by a trusted run.
df = pd.read_pickle(os.path.join(cfg.get("directory", "exchange"), "[wikidata]002_extract_actions.p"))

In [5]:
# Preview the first rows of the loaded revisions.
df.head()

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name,action_base,action_digit,action_language
0,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid,wbsetdescription-add,1,ar
1,"/* wbsetlabel-add:1|sl */ Gerhard Raff, #quick...",wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2018-01-29 13:38:28+00:00,23475,,Sporti,wbsetlabel-add,1,sl
2,/* wbsetsitelink-add:1|enwiki */ Wolfram Wette,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-01-04 17:08:50+00:00,17848,,Assayer,wbsetsitelink-add,1,enwiki
3,/* wbsetreference-add:2| */ [[Property:P937]]:...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 08:21:58+00:00,38324,,Olaf Kosinsky,wbsetreference-add,2,
4,/* wbcreateclaim-create:1| */ [[Property:P1412...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 14:59:40+00:00,38324,,Olaf Kosinsky,wbcreateclaim-create,1,


### Load mapping if we have one

In [1]:
# Path of the CSV that stores the manual action_base -> label mapping.
mapping_file = os.path.join(
    cfg.get("directory", "dataset"),
    cfg.get("dataset", "wikidata_aux"),
    cfg.get("dataset", "wikidata_full_labels"),
)

if os.path.isfile(mapping_file):
    # A mapping from an earlier run exists: reuse it.
    df_label = pd.read_csv(mapping_file)
else:
    # First run: start with an empty mapping that the cells below fill in.
    df_label = pd.DataFrame({"action_base": [], "label": [], "label_readable": []})

# Plain-dict view of the mapping (action_base -> label). If an action_base
# occurs more than once, the last row wins.
label_dict = dict(zip(df_label["action_base"], df_label["label"]))

NameError: name 'os' is not defined

In [7]:
# Number of revisions per distinct action_base value, most frequent first
# (value_counts sorts in descending order by default).
action_counts = df['action_base'].value_counts()

### Manual mapping for undefined actions

Wikidata API reference: https://www.mediawiki.org/wiki/Wikibase/API/en#wblinktitles

In [8]:
# Interactively ask for labels of every action that is not in the mapping yet.
# Membership is tested against the column *values*; `x in series` would test
# the index instead.
known_actions = set(df_label["action_base"])

# Collect the answers first and extend the frame once at the end: appending
# row by row is slow, and DataFrame.append returned a new frame (it never
# modified df_label in place) before being removed in pandas 2.0.
new_rows = []
for action, count in action_counts.items():
    if action in known_actions:
        continue
    print("({c}) Action: {a}".format(a=action, c=count))
    label = input("Abstract Label")
    label_readable = input("Human Readable Label")
    new_rows.append({"action_base": action, "label": label, "label_readable": label_readable})

if new_rows:
    df_label = pd.concat([df_label, pd.DataFrame(new_rows)], ignore_index=True, sort=False)

In [None]:
# Give every abstract label exactly one human-readable label.
# This works on the mapping frame (df_label); the revisions frame has no
# label columns until the merge further down.

# Older mapping files may lack the column, and blanks come back from CSV as
# NaN, so normalise to empty strings before comparing against "".
if "label_readable" not in df_label.columns:
    df_label["label_readable"] = ""
df_label["label_readable"] = df_label["label_readable"].fillna("")

for label in df_label["label"].dropna().unique():
    print("#"*50)
    print(label)
    same_label = df_label["label"] == label
    # Distinct readable labels already assigned to this abstract label.
    candidates = df_label.loc[same_label & (df_label["label_readable"] != ""), "label_readable"].unique()
    if len(candidates) == 1:
        # Unambiguous: propagate the known readable label to all rows.
        df_label.loc[same_label, "label_readable"] = candidates[0]
        continue
    elif len(candidates) > 1:
        print("Conflicting labels:")
        print(candidates)

    # No readable label yet, or several different ones: ask the user.
    label_readable = input("Readable Label:")
    df_label.loc[same_label, "label_readable"] = label_readable

In [10]:
# Persist the label mapping to the mapping CSV so later runs can reload it;
# index=False keeps the row index out of the file.
df_label.to_csv(mapping_file, index=False)

### Add label to revision

In [11]:
# Number of revisions before the label mapping is merged in.
len(df)

143923897

In [12]:
# Left-join the label mapping onto the revisions by action_base; revisions
# whose action_base is missing from df_label keep NaN in the mapping columns.
# NOTE(review): duplicate action_base rows in df_label would multiply the
# matching revisions -- consider passing validate="many_to_one".
df = df.merge(right=df_label, how="left", on="action_base")

In [13]:
# Preview the first rows to check that the merge added the label column(s).
df.head()

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name,action_base,action_digit,action_language,label
0,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid,wbsetdescription-add,1,ar,DESCRIPTION_ADD
1,"/* wbsetlabel-add:1|sl */ Gerhard Raff, #quick...",wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2018-01-29 13:38:28+00:00,23475,,Sporti,wbsetlabel-add,1,sl,LABEL_ADD
2,/* wbsetsitelink-add:1|enwiki */ Wolfram Wette,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-01-04 17:08:50+00:00,17848,,Assayer,wbsetsitelink-add,1,enwiki,SITELINK_ADD
3,/* wbsetreference-add:2| */ [[Property:P937]]:...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 08:21:58+00:00,38324,,Olaf Kosinsky,wbsetreference-add,2,,REFERENCE_ADD
4,/* wbcreateclaim-create:1| */ [[Property:P1412...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 14:59:40+00:00,38324,,Olaf Kosinsky,wbcreateclaim-create,1,,CLAIM_CREATE


In [14]:
# Labels that occur at most `min_label_occurrency` times (threshold read from
# the config) are collected for removal.
label_counts = df["label"].value_counts()
min_occurrences = cfg.getint("preprocessing", "min_label_occurrency")
labels_to_remove = label_counts[label_counts <= min_occurrences].index.tolist()
labels_to_remove

['SITELINK_BADGE']

In [15]:
# Drop every revision whose label was selected for removal and report the row
# count on both sides of the filter.
print("Num revisions before removal: {n}".format(n=df.shape[0]))
df = df[~df["label"].isin(labels_to_remove)].reset_index(drop=True)
print("Num revisions after removal: {n}".format(n=df.shape[0]))

Num revisions before removal: 143923897
Num revisions after removal: 143923896


In [16]:
# Count distinct labels; dropna=False keeps a missing label counted as its
# own value.
n_labels = df["label"].nunique(dropna=False)
print("Num unique labels: {n}".format(n=n_labels))

Num unique labels: 43


In [17]:
# Revisions per label after the removal step, most frequent first.
df['label'].value_counts()

CLAIM_CREATE          47566547
DESCRIPTION_ADD       29834716
REFERENCE_ADD         13066714
LABEL_ADD             11371639
DESCRIPTION_UPDATE     8470215
ENTITY_UPDATE          5940111
QUALIFIER_ADD          4798372
SITELINK_ADD           4618082
CLAIM_UPDATE           3195914
CLAIM_REMOVE           2984527
ENTITY_CREATE          2941530
ALIAS_ADD              1681944
LABEL_UPDATE           1669801
MERGE                  1499961
SITELINK_UPDATE        1326459
SITELINK_REMOVE         720635
ENTITY_REDIRECT         609020
CLAIM_UPDATEVALUE       257923
REVERT                  225718
ENTITY_OVERRIDE         202361
DESCRIPTION_REMOVE      184173
ALIAS_UPDATE            177752
REVERT_MANUAL           133913
CREATE_MANUAL           108871
LABEL_REMOVE             89778
ALIAS_REMOVE             88994
QUALIFIER_REMOVE         42604
FORM_ADD                 23802
REFERENCE_REMOVE         22849
QUALIFIER_UPDATE         22463
REMOVE_MANUAL            12520
LEXEME_CREATE             9502
SENSE_AD

### Store data

In [18]:
# Write the labelled revisions to the configured exchange directory as a
# pickle file.
df.to_pickle(os.path.join(cfg.get("directory", "exchange"), "[wikidata]003_assign_labels.p"))