### Load Config

In [None]:
import os
import pandas as pd

In [None]:
from configparser import ConfigParser

# Parse the notebook settings from "config.cfg" (path is relative to the
# current working directory). Later cells read the "directory" and "dataset"
# sections from this parser.
cfg = ConfigParser()
# NOTE: ConfigParser.read() returns the list of files it managed to parse and
# does not raise when the file is missing; an empty list here means the
# cfg.get(...) calls in later cells will fail.
cfg.read("config.cfg")

In [None]:
# Directory holding the pickled revision frames, taken from the config file.
pickle_path = cfg.get("directory", "pickles")

# Keep only the revision pickles, in sorted (deterministic) file-name order.
revision_files = sorted(
    name for name in os.listdir(pickle_path) if "df_revisions" in name
)

# Load every selected pickle into its own DataFrame.
# NOTE(review): unpickling executes arbitrary code, so this directory must
# only contain trusted files.
dataframes = [
    pd.read_pickle(os.path.join(pickle_path, name)) for name in revision_files
]

In [None]:
# Stack the per-file frames into a single DataFrame; ignore_index=True drops
# the individual indexes and renumbers the rows consecutively.
df = pd.concat(dataframes, ignore_index=True)

In [None]:
# Preview the first rows of the combined frame as a quick sanity check.
df.head()

In [None]:
# Number of rows whose user_ip is anything other than the empty string.
len(df[df['user_ip'] != ""])

### Remove Anonymous Users

Anonymous users are those with no username (`np.NaN`) but an IP address; the cell below identifies them by a missing `user_id`.

In [None]:
# Report the row count before filtering.
print("# entries with anonymous users: {n}".format(n=df.shape[0]))

# Keep only the rows that carry a user_id, then renumber the index from 0.
has_user_id = df['user_id'].notna()
df = df[has_user_id].reset_index(drop=True)

# Report the row count after filtering.
print("# entries without anonymous users: {n}".format(n=df.shape[0]))

### Remove Bots

In [None]:
# Locate every bot list inside the auxiliary dataset directory and merge
# their entries into one set of names.

aux_files = os.path.join(cfg.get("directory", "dataset"), cfg.get("dataset", "wikidata_aux"))
bot_files = [
    os.path.join(aux_files, name)
    for name in os.listdir(aux_files)
    if "[BOTS]" in name
]

bots = set()

for list_path in bot_files:
    with open(list_path, "r") as handle:
        # One entry per line; surrounding whitespace and newlines are removed.
        bots.update(line.strip() for line in handle)

print("# bots: {n}".format(n=len(bots)))

In [None]:
# Drop every row whose user_name appears in the bot set, renumber the index,
# and report the sizes before and after.
print("# entries with bots: {n}".format(n=df.shape[0]))

is_bot = df['user_name'].isin(bots)
df = df[~is_bot].reset_index(drop=True)

print("# entries without bots: {n}".format(n=df.shape[0]))
# dropna=False counts a missing user_id as one distinct value, like unique().
print("# users excluding bots: {n}".format(n=df['user_id'].nunique(dropna=False)))

### Remove Users with a single edit

In [None]:
# Number of rows (edits) per user_id, sorted from most to fewest.
# NOTE(review): this cell only computes the counts; no rows are removed from
# df here, and `counts` is not used by the remaining visible cells, so
# single-edit users are still present in the stored frame. Confirm whether
# the filtering step was dropped on purpose.
counts = df['user_id'].value_counts()

### Store Data

In [None]:
# Write the filtered frame into the exchange directory named in the config.
exchange_dir = cfg.get("directory", "exchange")
output_path = os.path.join(exchange_dir, "[wikidata]001_dataframe_full.p")
df.to_pickle(output_path)