In [1]:
import os
import pandas as pd

In [2]:
from configparser import ConfigParser

# Project settings (directories, dataset file names, preprocessing thresholds)
# are looked up in this INI-style file by the cells below.
CONFIG_FILE = "config.cfg"

cfg = ConfigParser()
# Left as the cell's last expression so the list of files that were actually
# parsed is displayed.
cfg.read(CONFIG_FILE)

['config.cfg']

In [None]:
# Use the notebook (widget) progress bar explicitly, since the plain console bar
# struggles with JupyterLab. `tqdm.notebook` is the public module; the private
# `tqdm._tqdm_notebook` shim is deprecated and scheduled for removal.
from tqdm.notebook import tqdm

# Registers `progress_apply` on pandas objects as a drop-in for `apply`.
tqdm.pandas()

In [None]:
# Load the frame pickled under the "exchange" directory from the config.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
exchange_dir = cfg.get("directory", "exchange")
df = pd.read_pickle(os.path.join(exchange_dir, "[wikidata]002_extract_actions.p"))

In [None]:
# Preview the first rows of the loaded frame as a sanity check.
df.head()

### Load mapping if we have one

In [None]:
# Mapping action_base -> label. Starts empty and is seeded from the CSV written
# by a previous run of this notebook, so only new actions need manual labelling.
label_dict = {}
mapping_file = os.path.join(
    cfg.get("directory", "dataset"),
    cfg.get("dataset", "wikidata_aux"),
    cfg.get("dataset", "wikidata_full_labels"),
)
if os.path.isfile(mapping_file):  # reuse the mapping if one has been stored already
    df_labels_load = pd.read_csv(mapping_file)
    # Build the dict column-wise; iterating with iterrows() constructs a Series
    # per row and is needlessly slow. For a repeated action_base the last row wins.
    label_dict.update(zip(df_labels_load['action_base'], df_labels_load['label']))

In [None]:
# Number of rows per distinct action_base value; drives the labelling loop below.
action_counts = df['action_base'].value_counts()

### Manual mapping for undefined actions

Wikidata API reference: https://www.mediawiki.org/wiki/Wikibase/API/en#wblinktitles

In [None]:
# Interactively ask for a label for every action that is not in the mapping yet.
# The action name and its row count are printed first, then the typed label is read.
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent that
# works on both old and new pandas versions.
for a, c in action_counts.items():
    if a not in label_dict:
        print(a, c)
        label = input()
        label_dict[a] = label

In [None]:
# Turn the mapping into a two-column frame (one row per action) so it can be
# written to CSV and joined onto the revisions.
mapped_actions = list(label_dict)
mapped_labels = list(label_dict.values())
df_label = pd.DataFrame({"action_base": mapped_actions, "label": mapped_labels})

In [None]:
# Store the labels: write the mapping to mapping_file as CSV without the row
# index, overwriting whatever is at that path.
df_label.to_csv(mapping_file, index=False)

### Add label to revision

In [None]:
# Row count before the labels are joined on, for comparison afterwards.
len(df)

In [None]:
# Attach the label of each row's action_base via a left join, so every row is kept.
# - Dropping a pre-existing "label" column makes the cell safe to re-run; otherwise
#   a second run would produce label_x / label_y and break the cells below.
# - validate="many_to_one" raises if df_label has duplicate action_base keys,
#   which would silently duplicate rows.
df = (
    df
    .drop(columns=["label"], errors="ignore")
    .merge(right=df_label, how="left", on="action_base", validate="many_to_one")
)

In [None]:
# Preview the first rows after the merge to check the label column.
df.head()

In [None]:
# Collect the labels that are too rare to keep: those whose row count is at or
# below the configured threshold (the comparison is inclusive).
label_counts = df['label'].value_counts()
min_occurrences = cfg.getint("preprocessing", "min_label_occurrency")
labels_to_remove = label_counts[label_counts <= min_occurrences].index.tolist()
labels_to_remove

In [None]:
# Drop every row whose label is in labels_to_remove and report the row count
# before and after; the index is rebuilt so it is contiguous again.
n_before = len(df)
print("Num revisions before removal: {n}".format(n=n_before))
keep_mask = ~df['label'].isin(labels_to_remove)
df = df.loc[keep_mask].reset_index(drop=True)
n_after = len(df)
print("Num revisions after removal: {n}".format(n=n_after))

In [None]:
# Report how many distinct values the label column holds after the removal.
unique_labels = df['label'].unique()
print("Num unique labels: {n}".format(n=len(unique_labels)))

In [None]:
# Show the number of rows per remaining label.
df['label'].value_counts()

### Store data

In [None]:
# Pickle the labelled frame into the "exchange" directory from the config.
exchange_dir = cfg.get("directory", "exchange")
output_path = os.path.join(exchange_dir, "[wikidata]003_assign_labels.p")
df.to_pickle(output_path)