In [1]:
%load_ext ipycache

  from IPython.utils.traitlets import Unicode


In [2]:
# import core packages
import io
from IPython.core.interactiveshell import InteractiveShell
import itertools
import json
import multiprocessing as mul
import requests
from zipfile import ZipFile

# import third party dependencies
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import init_notebook_mode, iplot
import synapseclient
from synapseclient import Activity, File

# initialize plotly to use offline mode
init_notebook_mode(connected=True)

# adjust the notebook to print everything in the cell: with 'all', every bare
# expression statement in a cell is displayed instead of only the last one
# (cells that end with something like `t1; t2` depend on this setting)
InteractiveShell.ast_node_interactivity = 'all'

# login to my datastore (Synapse); no credentials are passed in this cell
# NOTE(review): presumably login() falls back to locally stored credentials - confirm
syn = synapseclient.Synapse()
syn.login()

  return f(*args, **kwds)
  return f(*args, **kwds)


Welcome, Luke Waninger!



### Data Load
To create these tables, we will draw from two downloadable data sources (the article quality predictions are retrieved later from the ORES API):
1. Wikipedia Article Data found on [Figshare](https://figshare.com/articles/Untitled_Item/5513449).
2. Population Data found at a random [DropBox](https://www.dropbox.com/s/5u7sy1xt7g0oi2c/WPDS_2018_data.csv?dl=0) location.

#### Figshare - Wikipedia Articles

In [3]:
# download the data from figshare
figshare = 'https://ndownloader.figshare.com/files/9614893'
r = requests.get(figshare)

# stop immediately if the request failed; nothing below can run without the archive
# (requests exposes the HTTP code as `status_code` - a Response has no `status` attribute)
if not r.ok:
    raise RuntimeError(f'failed to download page data: {r.status_code}')

# feed the downloaded bytes into a ZipFile; the context manager closes the archive when done
stream = io.BytesIO(r.content)
with ZipFile(stream) as zf:
    # locate the csv file within the list of files embedded in the archive,
    # making sure to not include the files within the 'MAC OS' directory
    # (endswith also matches a csv stored at the archive root, which `find(...) > 0` missed)
    candidates = [
        f for f in zf.filelist
        if f.filename.endswith('page_data.csv') and 'MAC' not in f.filename
    ]
    if not candidates:
        raise FileNotFoundError('page_data.csv was not found in the figshare archive')
    file = candidates[0]

    # extract the csv file into the working directory and read it into a pandas dataframe
    page_data = pd.read_csv(zf.extract(file))

page_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


#### Population Data - DropBox

In [4]:
# download the data from Drop Box
dropbox = 'https://www.dropbox.com/s/5u7sy1xt7g0oi2c/WPDS_2018_data.csv?dl=1'
r = requests.get(dropbox)

# stop immediately if the request failed; the cells below all depend on `pop`
# (requests exposes the HTTP code as `status_code` - a Response has no `status` attribute)
if not r.ok:
    raise RuntimeError(f'failed to download population data: {r.status_code}')

# this time, feed the csv byte stream into a pandas dataframe directly
stream = io.BytesIO(r.content)
pop = pd.read_csv(stream)

pop.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


Right at the start we see inconsistencies in the format of our joining keys - Geography and country. I run some quick string tools to ensure they're consistent through both datasets. The columns are renamed in order to make the joining easier to read. Additionally, I verify that the joining keys are, in fact, keys then perform an inner join to include only the countries that have data in both sets.

In [5]:
# explicitly rename the columns so both frames share the joining key name `country`
pop = pop.rename(columns={
    'Geography': 'country',
    'Population mid-2018 (millions)': 'population'
})

page_data = page_data.rename(columns={
    'page': 'article_name',
    'rev_id': 'revision_id'
})

# enforce string format consistency on the joining key
# (the .str accessor leaves missing values alone instead of raising like apply(str.title))
pop['country'] = pop.country.str.title()
page_data['country'] = page_data.country.str.title()

# double check the 'keys' in each dataframe are in fact joinable keys
assert pop.country.is_unique, 'country is not a unique key in the population data'
assert not page_data.duplicated(subset=['article_name', 'country']).any(), \
    '(article_name, country) is not a unique key in the page data'

# convert the population to a more appropriate data type: strip thousands separators
# and parse as float; casting to str first keeps this working when the column is
# already numeric (e.g. when the cell is re-run)
pop['population'] = pd.to_numeric(
    pop.population.astype(str).str.replace(',', '', regex=False)
).astype(float)

# merge the data frames, keeping only the countries present in both
df = pop.merge(page_data, on='country', how='inner')

df.head()

Unnamed: 0,country,population,article_name,revision_id
0,Algeria,42.7,Template:Algeria-politician-stub,544347736
1,Algeria,42.7,Template:Algeria-diplomat-stub,567620838
2,Algeria,42.7,Template:AlgerianPres,665948270
3,Algeria,42.7,Ali Fawzi Rebaine,686269631
4,Algeria,42.7,Ahmed Attaf,705910185


### Make API calls to get articles predictions

Define a function we can use to make the API requests.

In [6]:
def get_ores_data(revision_ids):
    """Request ORES quality predictions for one batch of revision ids.

    Parameters
    ----------
    revision_ids : iterable
        Revision ids to score in a single API call. Each id is converted with
        ``str`` to build the query string and to look up its score.

    Returns
    -------
    list of tuple
        On success, one ``(revision_id, value)`` tuple per requested id, in the
        order given. ``value`` is the predicted class, or the error type reported
        by the API for that revision. An empty batch returns ``[]`` without
        making a request.
    dict
        On a failed HTTP request, ``dict(err=<status code>, revision_ids=<input>)``
        so the batch can be retried later.
    """
    # Define the endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'

    # materialize the ids once: they are iterated twice below (query string + parsing)
    rev_ids = list(revision_ids)
    if not rev_ids:
        return []

    # Specify the parameters - joining the revision IDs together separated by | marks.
    params = {
        'project': 'enwiki',
        'model': 'wp10',
        'revids': '|'.join(str(x) for x in rev_ids)
    }

    # make the call and verify the response before proceeding
    response = requests.get(endpoint.format(**params))

    # if the request failed, return the status code and list of revision ids so we can retry later
    # (the HTTP code lives on `status_code`; a Response has no `status` attribute)
    if not response.ok:
        return dict(err=response.status_code, revision_ids=revision_ids)

    # start at the node holding the per-revision scores; reuse the request parameters
    # so the parsing can never disagree with what was asked for
    scores = response.json()[params['project']]['scores']

    # return the scores as a list of tuples, taking only the prediction
    results = []
    for rid in rev_ids:
        parent = scores[str(rid)][params['model']]

        # check for any errors, append either the error or prediction
        if 'error' in parent:
            results.append((rid, parent['error']['type']))
        else:
            results.append((rid, parent['score']['prediction']))

    return results

Process the revision ids in batches of 50 to preclude Wikimedia blocking the request. I also run these requests in parallel and cache the results. There are over 45K rows in the dataframe generated above. This turns into a lot of requests so we don't want to make any call more than once and I'd rather not do it iteratively.

In [7]:
%%cache api_results.pkl results

# define how many rev_ids to include in each API call
step = 50

# start a task pool
pool = mul.Pool(mul.cpu_count())

# process the calls in parallel
results = list(pool.map(
    get_ores_data,
    [df.revision_id[i:i+step] for i in range(0, len(df), step)]
))

# make sure to kill the children
pool.close(); pool.join()

[Skipped the cell's code and loaded variables results from file '/mnt/c/Users/lukew/OneDrive/School/DATA 512 [HCDS - Ethics]/DATA512_A2/api_results.pkl'.]


### Create the final dataset

I flatten the list of batched requests into a single list, then convert it to a dataframe and join it with the main dataframe. Then, I save the file to disk and upload it to a proper file store that tracks provenance as a directed graph.

In [None]:
# a batch whose request failed comes back from get_ores_data as a dict instead of a
# list of tuples; flattening one would silently yield its keys, so stop and report it
failed_batches = [batch for batch in results if isinstance(batch, dict)]
if failed_batches:
    raise RuntimeError(
        f'{len(failed_batches)} ORES request batches failed and must be retried before continuing'
    )

# flatten the batches into a single list
flat_results = list(itertools.chain.from_iterable(results))

# convert to a dataframe
predictions = pd.DataFrame(flat_results, columns=['revision_id', 'prediction'])

# join with the main
df_ = df.merge(predictions, on='revision_id', how='inner')

# verify that each revision id was processed before resetting the variables;
# `results` and `df` are only rebound once the check has passed
assert len(df) == len(df_), f'expected {len(df)} rows after the merge, found {len(df_)}'
results = predictions
df = df_
del df_, flat_results, predictions, failed_batches

# write out the dataframe (without the row index) and upload to file store with provenance
name = 'data-512-a2.csv'
df.to_csv(name, index=False)

# the return value is bound and deleted so the notebook does not display it
t = syn.setProvenance(
    syn.store(File(name=name, path=name, parent='syn17007608')),
    activity=Activity(
        name='Generate Article Quality Predictions',
        description=(
            'The dataset provides the ability to analyse the number and quality '
            'of politician Wikimedia articles across countries. In this activity, I pull '
            'two data sources one regarding population data and one with revision ids '
            'for those articles. I then use the ORES API to get quality predictions for '
            'article'
        ),
        used=[
            dict(name='Figshare - Wikimedia Articles', url=figshare),
            dict(name='DropBox - Population', url=dropbox),
            dict(name='ORES', url='https://www.mediawiki.org/wiki/ORES')
        ],
        executed=[
            dict(
                name='hcds-a2-bias.ipynb',
                url='https://github.com/lukeWaninger/hcds-a2-bias/blob/master/hcds-a2-bias.ipynb'
            )
        ]
    )
); del t

df.head()


##################################################
 Uploading file to Synapse storage 
##################################################



### Analysis

In [None]:
# index the population data by country to make the following computations more readable;
# the guard keeps the cell re-runnable (a second set_index would raise a KeyError)
if pop.index.name != 'country':
    pop.set_index('country', inplace=True)

# group the dataframe by country, count the number of articles in each
# additionally, reset the index in order to collapse the dataframe back
# and finally, rename the prediction column to what it now represents
table12 = (
    df.loc[:, ['country', 'prediction']]
    .groupby(['country'])
    .count()
    .reset_index()
    .rename(columns={'prediction': 'num_articles'})
)

# calculate the articles per million as a new column; population is already expressed
# in millions, so the ratio needs no scaling. map() looks each country up in the
# country-indexed population series
table12['articles_per_million'] = (
    table12.num_articles / table12.country.map(pop.population)
).round(1)

# sort the table by the number of articles per million
table12 = table12.sort_values(by='articles_per_million', ascending=False)

# extract the highest and lowest ten countries into separate dataframes
t1 = table12.drop(columns='num_articles').iloc[:10]
t2 = table12.drop(columns='num_articles').iloc[-10:]

t1; t2

In [None]:
# first, extract only the articles deemed high quality (GA or FA) with a boolean mask
# then, group by country and count the number of articles in each
# and rename the column to what it now represents
quality_counts = (
    df.loc[df.prediction.isin(['GA', 'FA']), ['country', 'prediction']]
    .groupby(['country'])
    .count()
    .reset_index()
    .rename(columns={'prediction': 'num_quality_articles'})
)

# merge onto table12 so we have the full count of articles in the same frame;
# the left join keeps countries that have no high quality articles at all - an
# inner join would drop them and misreport the lowest-ranked countries
table34 = table12.drop(columns='articles_per_million').merge(
    quality_counts,
    on='country',
    how='left'
)
table34['num_quality_articles'] = table34.num_quality_articles.fillna(0)

# calculate the share of articles deemed high quality as a new column
# (a fraction between 0 and 1, rounded to three decimals)
table34['percent_quality'] = (table34.num_quality_articles / table34.num_articles).round(3)

# drop the unnecessary columns and sort by the percentage of quality articles
table34 = table34\
    .drop(columns=['num_quality_articles', 'num_articles'])\
    .sort_values(by='percent_quality', ascending=False)

# extract the highest and lowest ten into new dataframes
# NOTE: countries tied at the same value (e.g. 0) appear in no particular order
t3 = table34.iloc[:10]
t4 = table34.iloc[-10:]

t3; t4

I visualize these tables as boxplots to show the extreme skew between countries; just printing a table of the top ten doesn't do the bias justice. The number of articles per million people is skewed so badly that you can only distinguish the outlying countries. The percentage of high quality articles is still quite skewed but not nearly as bad.

In [None]:
# create the first boxplot: articles per million people, one point per country
h1 = go.Box(
    name='.',
    x=table12.articles_per_million,
    text=table12.country,
    boxpoints='all',
    jitter=0.3,
    xaxis='x1',
    marker=dict(color='#2A3D54')
)

# create the second boxplot: share of high quality articles, one point per country
h2 = go.Box(
    name='.',
    x=table34.percent_quality,
    text=table34.country,
    boxpoints='all',
    jitter=0.3,
    xaxis='x2',
    marker=dict(color='#B26C10')
)

# generate a figure containing two subplots; the titles name what is actually plotted:
# the first metric is a rate (not a percentage) and the second is a 0-1 fraction
fig = tools.make_subplots(
    rows=2, cols=1,
    subplot_titles=[
        'Articles Per Million People',
        'Proportion of Articles Deemed High Quality'
    ],
    print_grid=False
)

# append the subplots
fig.append_trace(h1, 1, 1)
fig.append_trace(h2, 2, 1)

# update the layout to not include the legend, the titles speak for themselves
fig['layout'].update(showlegend=False)

# show the figure
iplot(fig)