### Load Config

In [1]:
import os
import pandas as pd

In [2]:
from configparser import ConfigParser
# Parser holding all settings used by this notebook (directories, dataset names, preprocessing bounds).
cfg = ConfigParser()
# read() returns the list of files it parsed successfully and silently skips missing ones;
# it is kept as the last expression so the notebook shows which files were actually loaded.
cfg.read("config.cfg")

['config.cfg']

In [3]:
# Use the notebook-specific progress bar, as the plain one struggles with JupyterLab.
try:
    # Public location of the notebook progress bar in current tqdm releases.
    from tqdm.notebook import tqdm
except ImportError:
    # Older tqdm releases only ship the private module the notebook originally used.
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas()  # registers progress_apply on pandas objects as a drop-in for apply

In [None]:
# Directory containing the pickled dataframes, as configured under [directory] pickles.
pickle_path = cfg.get("directory", "pickles")

# Walk the directory in sorted name order (the progress bar covers every entry) and
# load each regular file whose name contains "df_revisions".
dataframes = []
for entry in tqdm(sorted(os.listdir(pickle_path))):
    candidate = os.path.join(pickle_path, entry)
    if "df_revisions" in entry and os.path.isfile(candidate):
        dataframes.append(pd.read_pickle(candidate))

HBox(children=(IntProgress(value=0, max=561), HTML(value='')))

In [None]:
# Stack the per-file dataframes into a single one; ignore_index=True drops the
# per-file row labels and renumbers the rows consecutively.
df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Preview the first rows of the combined dataframe (rendered as the cell output).
df.head()

### Filter for observation window

In [None]:
# Lower and upper bound of the observation window, read from the [preprocessing]
# config section and parsed as UTC timestamps.
ts_min, ts_max = (
    pd.to_datetime(cfg.get("preprocessing", option), utc=True)
    for option in ("timestamp_from", "timestamp_to")
)

In [None]:
# Report the row count, keep only rows whose timestamp lies inside
# [ts_min, ts_max] (both ends inclusive), then report the count again.
print("# entries total: {n}".format(n=len(df)))
in_window = df['timestamp'].between(ts_min, ts_max)
df = df[in_window].reset_index(drop=True)
print("# entries obs window: {n}".format(n=len(df)))

### Remove Anonymous Users

i.e., users who have no username (`np.NaN`) but technically do have an IP address

In [None]:
# Number of rows whose user_id is missing (shown as the cell output).
int(df['user_id'].isna().sum())

In [None]:
# Report the row count, drop every row without a user_id, renumber the index,
# and report the row count again.
print("# entries with anonymous users: {n}".format(n=len(df)))
has_user_id = df['user_id'].notna()
df = df[has_user_id].reset_index(drop=True)
print("# entries without anonymous users: {n}".format(n=len(df)))

### Remove Bots

In [None]:
# Collect the names of known bot accounts from the auxiliary dataset files
# and merge them into one set.

# Auxiliary directory: both path components come from the config file.
aux_files = os.path.join(cfg.get("directory", "dataset"), cfg.get("dataset", "wikidata_aux"))
# Every directory entry whose name contains the "[BOTS]" marker is read as a bot list.
bot_files = [os.path.join(aux_files, f) for f in os.listdir(aux_files) if "[BOTS]" in f]

bots = set()

for f in bot_files:
    # NOTE(review): the encoding is pinned to UTF-8 so non-ASCII names do not depend
    # on the platform's locale default -- confirm the list files are UTF-8 encoded.
    with open(f, "r", encoding="utf-8") as bot_file:
        # Presumably one account name per line. Surrounding whitespace is stripped and
        # blank lines are skipped so that "" never ends up counted as a bot.
        bots.update(name for name in (line.strip() for line in bot_file) if name)

print("# bots: {n}".format(n=len(bots)))

In [None]:
# Drop every row whose user_name is in the bot set and renumber the index
# (the index reset is meant to speed up later processing), reporting the
# row count before and after plus the number of distinct remaining user ids.
print("# entries with bots: {n}".format(n=len(df)))
is_bot = df['user_name'].isin(bots)
df = df[~is_bot].reset_index(drop=True)
print("# entries without bots: {n}".format(n=len(df)))
print("# users excluding bots: {n}".format(n=df['user_id'].nunique(dropna=False)))

### Store Data

In [None]:
# Persist the filtered dataframe as a pickle inside the configured exchange directory.
output_path = os.path.join(cfg.get("directory", "exchange"), "[wikidata]001_dataframe_full.p")
df.to_pickle(output_path)

In [None]:
# Completion marker printed by the final cell of the notebook.
print("done...")