In [1]:
import os
import pandas as pd

In [2]:
from configparser import ConfigParser

# Parse the project settings; the list of files that could be read is the cell output.
cfg = ConfigParser()
cfg.read(["config.cfg"])

['config.cfg']

In [3]:
# Use the notebook-aware progress bar, as the plain console one struggles with JupyterLab.
try:
    # Public import path offered by newer tqdm releases.
    from tqdm.notebook import tqdm
except ImportError:
    # Older tqdm releases only expose the notebook bar through the private module.
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()  # this enables us to use progress_apply instead of apply

In [4]:
# Load the pickled frame from the exchange directory.
# NOTE: unpickling executes arbitrary code, so only load files produced by this pipeline.
actions_pickle = os.path.join(cfg.get("directory", "exchange"), "[wikidata]002_extract_actions.p")
df = pd.read_pickle(actions_pickle)

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name,action_base,action_digit,action_language
0,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid,wbsetdescription-add,1,ar
1,"/* wbsetlabel-add:1|sl */ Gerhard Raff, #quick...",wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2018-01-29 13:38:28+00:00,23475,,Sporti,wbsetlabel-add,1,sl
2,/* wbsetsitelink-add:1|enwiki */ Wolfram Wette,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-01-04 17:08:50+00:00,17848,,Assayer,wbsetsitelink-add,1,enwiki
3,/* wbsetreference-add:2| */ [[Property:P937]]:...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 08:21:58+00:00,38324,,Olaf Kosinsky,wbsetreference-add,2,
4,/* wbcreateclaim-create:1| */ [[Property:P1412...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 14:59:40+00:00,38324,,Olaf Kosinsky,wbcreateclaim-create,1,


### Load mapping if we have one

In [6]:
# Mapping action_base -> label; stays empty when no mapping file exists yet.
label_dict = {}
mapping_file = os.path.join(
    cfg.get("directory", "dataset"),
    cfg.get("dataset", "wikidata_aux"),
    cfg.get("dataset", "wikidata_full_labels"),
)
if os.path.isfile(mapping_file):  # load it manually if we have a mapping already
    df_labels_load = pd.read_csv(mapping_file)
    # Build the dict in one pass over the two columns instead of iterating row by row;
    # as before, a later row wins if an action_base appears more than once.
    label_dict = dict(zip(df_labels_load['action_base'], df_labels_load['label']))

In [7]:
# Number of revisions per action_base, most frequent first.
action_counts = df.loc[:, 'action_base'].value_counts()

### Manual mapping for undefined actions

Wikidata API reference: https://www.mediawiki.org/wiki/Wikibase/API/en#wblinktitles

In [8]:
# Ask for a label for every action that has no mapping yet, in the order of action_counts.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same (index, value) pairs.
for action, count in action_counts.items():
    if action not in label_dict:
        print(action, count)
        # Strip surrounding whitespace so a stray space does not create a separate label.
        label_dict[action] = input().strip()

In [9]:
# Turn the mapping into a two-column frame: one row per (action_base, label) pair.
mapping_items = list(label_dict.items())
df_label = pd.DataFrame({
    "action_base": [action for action, _label in mapping_items],
    "label": [label for _action, label in mapping_items],
})

In [10]:
# Write the action_base -> label mapping to the mapping file, without the index column.
df_label.to_csv(path_or_buf=mapping_file, index=False)

### Add label to revision

In [11]:
# Number of revisions before the labels are merged in.
df.shape[0]

141651850

In [12]:
# Attach the label that belongs to each revision's action_base.
# validate="many_to_one" makes pandas raise a MergeError if df_label ever contains an
# action_base more than once, instead of silently duplicating revisions in the left join.
df = df.merge(right=df_label, how="left", on="action_base", validate="many_to_one")

In [13]:
# Preview the first five rows, now including the merged label column.
df.head(n=5)

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name,action_base,action_digit,action_language,label
0,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid,wbsetdescription-add,1,ar,DESCRIPTION_ADD
1,"/* wbsetlabel-add:1|sl */ Gerhard Raff, #quick...",wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2018-01-29 13:38:28+00:00,23475,,Sporti,wbsetlabel-add,1,sl,LABEL_ADD
2,/* wbsetsitelink-add:1|enwiki */ Wolfram Wette,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-01-04 17:08:50+00:00,17848,,Assayer,wbsetsitelink-add,1,enwiki,SITELINK_ADD
3,/* wbsetreference-add:2| */ [[Property:P937]]:...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 08:21:58+00:00,38324,,Olaf Kosinsky,wbsetreference-add,2,,REFERENCE_ADD
4,/* wbcreateclaim-create:1| */ [[Property:P1412...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101384,0,Q98774,2017-06-06 14:59:40+00:00,38324,,Olaf Kosinsky,wbcreateclaim-create,1,,CLAIM_CREATE


In [14]:
# Collect the labels that occur at most as often as the configured minimum.
label_counts = df['label'].value_counts()
min_occurrences = cfg.getint("preprocessing", "min_label_occurrency")
labels_to_remove = label_counts[label_counts <= min_occurrences].index.tolist()
labels_to_remove

['SITELINK_BADGE']

In [15]:
# Drop every revision whose label is in labels_to_remove and report the row count around it.
n_before = len(df)
print("Num revisions before removal: {n}".format(n=n_before))
keep_mask = ~df['label'].isin(labels_to_remove)
df = df.loc[keep_mask].reset_index(drop=True)
n_after = len(df)
print("Num revisions after removal: {n}".format(n=n_after))

Num revisions before removal: 141651850
Num revisions after removal: 141651849


In [16]:
# Count the distinct label values; dropna=False keeps a missing label counted as one value,
# matching what len(unique()) reports.
n_unique_labels = df['label'].nunique(dropna=False)
print("Num unique labels: {n}".format(n=n_unique_labels))

Num unique labels: 43


In [17]:
# Revisions per label after the removal step, most frequent first.
label_frequencies = df['label'].value_counts()
label_frequencies

CLAIM_CREATE          46757724
DESCRIPTION_ADD       29508789
REFERENCE_ADD         12821614
LABEL_ADD             11260870
DESCRIPTION_UPDATE     8327748
ENTITY_UPDATE          5824128
QUALIFIER_ADD          4742348
SITELINK_ADD           4561653
CLAIM_UPDATE           3116188
CLAIM_REMOVE           2941439
ENTITY_CREATE          2795814
LABEL_UPDATE           1653396
ALIAS_ADD              1650484
MERGE                  1488812
SITELINK_UPDATE        1308941
SITELINK_REMOVE         715639
ENTITY_REDIRECT         600534
CLAIM_UPDATEVALUE       257311
REVERT                  216891
ENTITY_OVERRIDE         201525
DESCRIPTION_REMOVE      183477
ALIAS_UPDATE            175176
REVERT_MANUAL           128003
CREATE_MANUAL            97442
LABEL_REMOVE             88956
ALIAS_REMOVE             87856
QUALIFIER_REMOVE         40284
REFERENCE_REMOVE         22487
QUALIFIER_UPDATE         22429
FORM_ADD                 20429
SENSE_ADD                 7446
LEXEME_CREATE             7110
REFERENC

### Store data

In [18]:
# Pickle the labelled frame into the exchange directory.
labelled_pickle = os.path.join(cfg.get("directory", "exchange"), "[wikidata]003_assign_labels.p")
df.to_pickle(labelled_pickle)