### Load Config

In [1]:
import os
import pandas as pd

In [2]:
from configparser import ConfigParser

# ConfigParser.read() silently skips files it cannot open and only returns the
# names of the files it actually parsed. Check that result here so a missing
# config fails immediately instead of surfacing later as a NoSectionError in
# one of the cfg.get(...) calls.
cfg = ConfigParser()
loaded_configs = cfg.read("config.cfg")
if not loaded_configs:
    raise FileNotFoundError("config.cfg not found or not readable in the working directory")
loaded_configs  # cell output: list of config files that were successfully parsed

['config.cfg']

In [3]:
# We explicitly import the notebook flavour of tqdm, as the normal one struggles
# with jupyterlab. Prefer the public `tqdm.notebook` module; the underscore
# module is private/deprecated and is only kept as a fallback for older tqdm
# installations that do not ship `tqdm.notebook` yet.
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm._tqdm_notebook import tqdm_notebook as tqdm
tqdm.pandas() # this enables us to use progress_apply instead of apply

In [4]:
# Directory containing the pickled revision dataframes (read from the config).
pickle_path = cfg.get("directory", "pickles")

# Walk the sorted directory listing with a progress bar and load every regular
# file whose name contains "df_revisions".
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
dataframes = []
for fname in tqdm(sorted(os.listdir(pickle_path))):
    fpath = os.path.join(pickle_path, fname)
    if "df_revisions" in fname and os.path.isfile(fpath):
        dataframes.append(pd.read_pickle(fpath))

HBox(children=(IntProgress(value=0, max=561), HTML(value='')))




In [5]:
# Stack the per-file frames into a single dataframe; ignore_index=True discards
# the individual indices and assigns a fresh 0..n-1 range.
df = pd.concat(dataframes, ignore_index=True)

In [6]:
# Preview the first rows of the combined dataframe.
df.head()

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name
0,/* wbsetentity */ Bötli: Ygfüegt: [[de:Gerhard...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2012-11-24 07:37:30+00:00,3280,,MerlIwBot
1,,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2013-03-15 07:36:57+00:00,5312,,BeneBot*
2,,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2013-03-15 07:36:58+00:00,5312,,BeneBot*
3,,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2013-03-22 19:45:27+00:00,18825,,Legobot
4,/* wbsetlabel-set:1|nl */ Gerhard Raff,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2013-04-18 08:00:49+00:00,157561,,RobotMichiel1972


In [11]:
# Convert this because the concat sometimes makes an object type out of it.
# The builtin `int` is used on purpose: `np.int` was merely an alias for it
# (deprecated in NumPy 1.20, removed in 1.24), and `np` is not imported in the
# visible cells of this notebook, so the old spelling fails on a fresh kernel.
df['page_ns'] = df['page_ns'].astype(int)

In [12]:
# Show how many rows fall into each page namespace.
print(df['page_ns'].value_counts())

0      396992005
120       291592
Name: page_ns, dtype: int64


### Filter for observation window

In [8]:
# Bounds of the observation window, read from the "preprocessing" config
# section and parsed as UTC timestamps (lower bound first, then upper bound).
ts_min, ts_max = (
    pd.to_datetime(cfg.get("preprocessing", option), utc=True)
    for option in ("datetime_from", "datetime_to")
)

In [9]:
# Keep only revisions whose timestamp lies inside [ts_min, ts_max]
# (both bounds inclusive) and report the row count before and after.
print("# entries total: {n}".format(n=len(df)))
df = df[df['timestamp'].between(ts_min, ts_max)].reset_index(drop=True)
print("# entries obs window: {n}".format(n=len(df)))

# entries total: 817730117
# entries obs window: 399831902


### Filter for target namespaces

In [10]:
# Namespaces kept for the analysis (namespace id -> kind of page).
target_ns = {0: "item", 120: "property"}
print("# in all NS: {n}".format(n=len(df)))
df = df[df["page_ns"].isin(list(target_ns))].reset_index(drop=True)
print("# entries in target NS: {n}".format(n=len(df)))

# in all NS: 399831902
# entries in target NS: 397283597


### Remove Anonymous Users

These are users who have no username (`np.NaN`) but technically do have an IP address.  
Here, we ignore these users, but future work may try to reconstruct users and sessions from the IP and other available data.

In [13]:
# Number of revisions without a user id, counted directly on the null mask.
int(df['user_id'].isna().sum())

1416586

In [14]:
# Drop all revisions that carry no user id and report the row count
# before and after the removal.
print("# entries with anonymous users: {n}".format(n=len(df)))
df = df[df['user_id'].notna()].reset_index(drop=True)
print("# entries without anonymous users: {n}".format(n=len(df)))

# entries with anonymous users: 397283597
# entries without anonymous users: 395867011


### Remove Bots

In [None]:
# fetch files and merge lists

# Auxiliary dataset directory (from the config); every file whose name contains
# "[BOTS]" is treated as a bot list.
aux_files = os.path.join(cfg.get("directory", "dataset"), cfg.get("dataset", "wikidata_aux"))
bot_files = [os.path.join(aux_files, f) for f in os.listdir(aux_files) if "[BOTS]" in f]

bots = set()

for f in bot_files:
    print("Bot file: {f}".format(f=f), end="...")
    # str.endswith (the original `endswitch` does not exist and raised AttributeError)
    if f.endswith("[BOTS]manual.txt"): # we want to filter those later
        print("skipping")
        continue
    with open(f, "r") as bot_file:
        # every line, stripped of surrounding whitespace, becomes one entry;
        # update() extends the set in place instead of rebuilding it per file
        bots.update(line.strip() for line in bot_file)
    print("done!")


print("# bots: {n}".format(n=len(bots)))

In [None]:
# Remove every revision whose user name is in the bot set; the index is reset
# afterwards to make subsequent steps go faster.
print("# entries with bots: {n}".format(n=len(df)))
df = df[~df['user_name'].isin(bots)].reset_index(drop=True)
print("# entries without bots: {n}".format(n=len(df)))
# dropna=False counts a missing id as its own value, exactly like len(unique())
print("# users excluding bots: {n}".format(n=df['user_id'].nunique(dropna=False)))

### Store Data

In [None]:
# Persist the filtered dataframe as a pickle in the configured exchange directory.
exchange_dir = cfg.get("directory", "exchange")
output_path = os.path.join(exchange_dir, "[wikidata]001_dataframe_full.p")
df.to_pickle(output_path)