### Load Config

In [1]:
import os
import pandas as pd

In [2]:
from configparser import ConfigParser

# ConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed. Check that list here so a missing config
# fails immediately instead of surfacing later as a NoSectionError.
cfg = ConfigParser()
loaded_configs = cfg.read("config.cfg")
if not loaded_configs:
    raise FileNotFoundError("config.cfg could not be read from the current working directory")
# Keep the list of parsed files as the cell output.
loaded_configs

['config.cfg']

In [3]:
# Directory holding the pickled revision frames (from the config file).
pickle_path = cfg.get("directory", "pickles")

# Pick the revision pickles first (sorted by file name), then load each one.
revision_files = [name for name in sorted(os.listdir(pickle_path)) if "df_revisions" in name]
dataframes = [pd.read_pickle(os.path.join(pickle_path, name)) for name in revision_files]

In [4]:
# Stack all partial frames into one and renumber the rows 0..n-1.
df = pd.concat(objs=dataframes, axis=0, ignore_index=True)

In [5]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,comment,dataset_file,model,note,page_id,page_ns,page_title,timestamp,user_id,user_ip,user_name
0,/* wbeditentity-update:0| */ BOT - Adding desc...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-02-16 08:23:44+00:00,6811,,Emijrpbot
1,/* wbsetdescription-add:1|ar */ مؤرخ ألماني,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-06-23 19:37:19+00:00,2437879,,Benseid Seid
2,/* wbeditentity-update:0| */ BOT - Adding desc...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-07-01 23:05:50+00:00,6811,,Emijrpbot
3,/* wbeditentity-update:0| */ BOT - Adding desc...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-09-15 09:33:31+00:00,6811,,Emijrpbot
4,/* wbcreateclaim-create:1| */ [[Property:P373]...,wikidatawiki-20190101-pages-meta-history1.xml-...,wikibase-item,,101383,0,Q98773,2017-09-21 22:22:39+00:00,2439297,,JhealdBatch


In [6]:
# Count the rows whose user_ip is anything other than the empty string.
has_ip = ~df['user_ip'].eq("")
int(has_ip.sum())

1381279

### Remove Anonymous Users

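Anonymous users are users without a registered account, recorded with an IP address only; the cell below drops every row whose `user_id` is missing (`NaN`).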

In [7]:
# Report the row count before and after dropping rows with a missing user_id.
print("# entries with anonymous users: {n}".format(n=len(df)))
has_user_id = df['user_id'].notna()
df = df.loc[has_user_id].reset_index(drop=True)
print("# entries without anonymous users: {n}".format(n=len(df)))

# entries with anonymous users: 391642390
# entries without anonymous users: 390261111


### Remove Bots

In [8]:
# Collect bot account names from every "[BOTS]" file in the auxiliary
# dataset directory and merge them into one set.

aux_files = os.path.join(cfg.get("directory", "dataset"), cfg.get("dataset", "wikidata_aux"))
bot_files = [os.path.join(aux_files, f) for f in os.listdir(aux_files) if "[BOTS]" in f]

bots = set()

for f in bot_files:
    # Explicit UTF-8 so non-ASCII account names decode the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(f, "r", encoding="utf-8") as bot_file:
        # Each stripped line is treated as one bot name. Blank lines are
        # skipped so the empty string never ends up in the set (it would
        # inflate the count and match rows with an empty user_name).
        bots.update(name for name in (line.strip() for line in bot_file) if name)

print("# bots: {n}".format(n=len(bots)))

# bots: 424


In [9]:
# Drop every revision whose user_name is in the bot set, then renumber the
# rows so later positional access works on a compact index.
print("# entries with bots: {n}".format(n=len(df)))
is_bot = df['user_name'].isin(bots)
df = df.loc[~is_bot].reset_index(drop=True)
print("# entries without bots: {n}".format(n=len(df)))
# dropna=False counts a missing id as its own value, like len(unique()) does.
print("# users excluding bots: {n}".format(n=df['user_id'].nunique(dropna=False)))

# entries with bots: 390261111
# entries without bots: 141998966
# users excluding bots: 130494


### Store Data

In [11]:
# Write the filtered frame as a pickle into the configured exchange directory.
exchange_dir = cfg.get("directory", "exchange")
df.to_pickle(os.path.join(exchange_dir, "[wikidata]001_dataframe_full.p"))