In [None]:
import os
import pandas as pd

In [None]:
import re

In [None]:
from configparser import ConfigParser
cfg = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of with a
# NoSectionError at the first cfg.get() call further down.
if not cfg.read("config.cfg"):
    raise FileNotFoundError("Could not read configuration file 'config.cfg' from the working directory")

In [None]:
# Use the notebook flavour of tqdm explicitly, as the normal one struggles with jupyterlab.
# `tqdm.notebook` is the public module; `tqdm._tqdm_notebook` is a deprecated
# private shim kept only as a fallback for old tqdm releases.
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas() # this enables us to use progress_apply instead of apply

### Load previous data

In [None]:
# Load the full revision dataframe from the exchange directory.
input_path = os.path.join(cfg.get("directory", "exchange"), "[wikidata]001_dataframe_full.p")
df = pd.read_pickle(input_path)

In [None]:
# Preview the first rows of the loaded dataframe.
df.head()

### Load manual label matchers

In [None]:
# Location of the hand-curated (match -> label) table, assembled from the config.
manual_labels_path = os.path.join(
    cfg.get("directory", "dataset"),
    cfg.get("dataset", "wikidata_aux"),
    cfg.get("dataset", "wikidata_manual_labels"),
)
df_manual_labels = pd.read_csv(manual_labels_path, engine='python')
# Surround every pattern with single spaces so that the substring test in
# split_comment only hits whole words.
# NOTE(review): split_comment lower-cases the comment before matching, so a
# pattern containing upper-case characters can never match; confirm the CSV
# is all lower-case.
df_manual_labels['match'] = df_manual_labels['match'].apply(
    lambda pattern: " {} ".format(pattern.strip())
)

In [None]:
# Matches the structured prefix of an edit comment of the form
# "/* <action>:<digits>|<text> */" (or "...|<text>|..."):
#   group 1 = everything between "/* " and the first matching ":",
#   group 2 = the (possibly empty) run of digits after the colon,
#   group 3 = the text between the following "|" and the next " */" or "|".
comment_regex = re.compile(r"\/\* (.*?):(\d*)\|(.*?)( \*\/|\|)")

def split_comment(comment):
    """Split a revision comment into its action components.

    Parameters
    ----------
    comment : str or missing value
        Raw revision comment. Empty strings and missing values
        (None, NaN, pd.NA) are treated as "no comment".

    Returns
    -------
    dict
        Keys "action_base", "action_digit" and "action_language". All three
        are empty strings when the comment is empty/missing or nothing
        matched. When the structured prefix is absent, "action_base" is taken
        from the manual label table (``df_manual_labels``) if one of its
        patterns occurs in the comment, and the other two stay empty.
    """
    base_action = ""
    digit = ""
    language_code = ""

    # Test for missing values *before* comparing with "": `pd.NA == ""` is
    # pd.NA, whose truth value is ambiguous and raises TypeError.
    if not (pd.isna(comment) or comment == ""):
        match = comment_regex.match(comment)
        if match is not None:
            base_action = match.group(1).strip()
            digit = match.group(2).strip()
            language_code = match.group(3).strip()
        else:
            # the re did not match. let's see if we can match anything from the manual labels
            comment_lower = comment.lower().replace(":", "").strip()
            # pad with spaces so the space-padded patterns also hit the first and last word
            comment_lower = " " + comment_lower + " "
            matches = df_manual_labels.loc[df_manual_labels['match'].apply(lambda x: x in comment_lower)] # look for words
            if len(matches) > 0:
                base_action = matches['label'].value_counts().idxmax() # take the most common label from the matches

    return {
        "action_base": base_action,
        "action_digit": digit,
        "action_language": language_code
    }

### Extract labels

In [None]:
# Parse every revision comment into its action components (one dict per row).
action_records = df['comment'].progress_apply(split_comment)
# Build the new columns on df's own index so that the column-wise concat below
# lines rows up even when df does not carry a default 0..n-1 index. Naming the
# columns explicitly keeps them present when df is empty.
df_action_parts = pd.DataFrame(
    action_records.tolist(),
    index=df.index,
    columns=["action_base", "action_digit", "action_language"],
)
df_actions = pd.concat([df, df_action_parts], axis=1)

In [None]:
# Preview the first rows, now including the action_* columns.
df_actions.head()

### Remove unmatched revisions

In [None]:
# Keep only revisions whose action_base is not the empty string, i.e. those
# for which split_comment found a label.
print("Pre unmatched comments", len(df_actions))
has_action = df_actions["action_base"] != ""
df_actions = df_actions.loc[has_action].reset_index(drop=True)
print("Post unmatched comments", len(df_actions))

In [None]:
# Number of distinct action labels left; dropna=False counts a missing value
# as its own label, exactly like len(unique()) does.
num_labels = df_actions['action_base'].nunique(dropna=False)
print("Num Labels {n}".format(n=num_labels))

### Remove all users with a low number of changes (below threshold)

In [None]:
# Minimum number of revisions a user must have to be kept; read as an int
# from the [preprocessing] section of the config.
change_threshold = cfg.getint("preprocessing", "min_num_changes")

In [None]:
# Number of (remaining) revisions per user_id. value_counts() leaves out NaN
# by default, so rows without a user_id do not appear in this series.
df_num_changes = df_actions["user_id"].value_counts()

In [None]:
# Users whose revision count is below the configured threshold.
infrequent_users = df_num_changes.loc[df_num_changes < change_threshold].index
print("# Revisions before: {n}".format(n=len(df_actions)))
# Drop every revision made by one of those users.
df_actions = df_actions.loc[~df_actions['user_id'].isin(infrequent_users)].reset_index(drop=True)
# This count is taken after filtering; it was previously mislabelled "before".
print("# Revisions after: {n}".format(n=len(df_actions)))

### Store Data

In [None]:
# Write the filtered, labelled revisions to the exchange directory.
output_path = os.path.join(cfg.get("directory", "exchange"), "[wikidata]002_extract_actions.p")
df_actions.to_pickle(output_path)