In [176]:
import os, sys, requests, json
import numpy as np
import pandas as pd

from IPython.display import display, HTML

In [173]:
%%HTML
<style type="text/css">
    table.dataframe td, table.dataframe th {
        border-style: solid;
    }
</style>

1. Import and preprocess article and population data
---

In [110]:
"""
The data files are downloaded in orig/
Utilize the pd.read_csv methods to import the datasets into DataFrames
"""
page_df = pd.read_csv('orig/page_data.csv')
population_df = pd.read_csv('orig/WPDS_2018_data.csv')

In [112]:
"""
Preprocessing steps for page_df:
    - Lower-case country to match with population_df later
    - Remove filler words for page tittles 
"""
page_df.loc[:, 'country'] = page_df.country.str.lower()
page_df.loc[:, 'page'] = page_df.page.str.replace('Template:', '')
page_df.head()

Unnamed: 0,page,country,rev_id
0,ZambiaProvincialMinisters,zambia,235107991
1,Bir I of Kanem,chad,355319463
2,Zimbabwe-politician-stub,zimbabwe,391862046
3,Uganda-politician-stub,uganda,391862070
4,Namibia-politician-stub,namibia,391862409


In [113]:
"""
Preprocessing steps for population_df:
    - Lowercase 'Geography' column to match with 'country' column of page_df
    - Rename column 'Population...' to 'population'
    - Clean for '1,300' to '1300' and convert to float
"""
population_df.loc[:, 'Geography'] = population_df.Geography.str.lower()
population_df.rename(columns={'Population mid-2018 (millions)': 'population'}, inplace=True)
population_df.loc[:, 'population'] = population_df['population'].apply(lambda x: float(x.replace(',', '')))

In [114]:
population_df.head()

Unnamed: 0,Geography,population
0,africa,1284.0
1,algeria,42.7
2,egypt,97.0
3,libya,6.5
4,morocco,35.2


In [118]:
"""
Comments:
    - There are 47,197 rows for page_df about articles
    - There are 207 countries with population in population_df
"""
print(page_df.info())
print(population_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47197 entries, 0 to 47196
Data columns (total 3 columns):
page       47197 non-null object
country    47197 non-null object
rev_id     47197 non-null int64
dtypes: int64(1), object(2)
memory usage: 1.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 2 columns):
Geography     207 non-null object
population    207 non-null float64
dtypes: float64(1), object(1)
memory usage: 3.3+ KB
None


2. Retrieve article quality predictions
---
* The section goal is to retrieve the predicted quality scores for each Wikipedia article
* The next code snippet to query [ORES (Objective Revision Evaluation Service) API](https://www.mediawiki.org/wiki/ORES) is sourced from https://github.com/Ironholds/data-512-a2/blob/master/hcds-a2-bias_demo.ipynb under the umbrella license mentioned in `README.md`

In [56]:
headers = {
    'User-Agent' : 'https://github.com/lmtoan', 
    'From' : 'toanlm@uw.edu'
}

def get_ores_data(revision_ids, headers):
    """ 
    Originally sourced from https://github.com/Ironholds/data-512-a2/blob/master/hcds-a2-bias_demo.ipynb
    """
    # Define the endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'
    
    # Specify the parameters - smushing all the revision IDs together separated by | marks.
    # Yes, 'smush' is a technical term, trust me I'm a scientist.
    # What do you mean "but people trusting scientists regularly goes horribly wrong" who taught you tha- oh.  
    params = {'project' : 'enwiki',
              'model'   : 'wp10',
              'revids'  : '|'.join(str(x) for x in revision_ids)
              }
    api_call = requests.get(endpoint.format(**params))
    response = api_call.json()
    return response

In [79]:
"""
The retrieve_ratings method perform the following:
    - Spilt the page_df into sub_df of certain batch size (default at 50)
    - Retrieve the rev_id for each sub_df and call the ORES API for json result
    - Filter relevant result and skip rev_id with errors
    - Extrapolate the result to appropriate columns including 'prediction' and probability scores
    for each class
"""
from tqdm import tqdm_notebook
def retrieve_ratings(df, batch_size=50):
    df_collection = []
    for start_point in tqdm_notebook(range(0, len(df), batch_size)):
        end_point = start_point + batch_size
        if end_point > len(df):
            end_point = len(df)
        sub_df = df[start_point:end_point]
        rev_ids = sub_df.loc[:, 'rev_id'].values
        resp = get_ores_data(rev_ids, headers)
        for rid in rev_ids:
            metadata = resp['enwiki']['scores'][str(rid)]['wp10'].get('score', None)
            if metadata is not None:
                df_collection.append(pd.DataFrame({
                    'rev_id': rid,
                    'prediction': metadata['prediction'],
                    'probability_B': metadata['probability']['B'],
                    'probability_C': metadata['probability']['C'],
                    'probability_FA': metadata['probability']['FA'],
                    'probability_GA': metadata['probability']['GA'],
                    'probability_Start': metadata['probability']['Start'],
                    'probability_Stub': metadata['probability']['Stub']
                }, index=[0]))     
    return pd.concat(df_collection)

main_df = retrieve_ratings(page_df)




3. Combining the datasets
---
After merging the ORES result (`main_df`) with`page_df`, and mapping population information with `population_df`, 44,973 rows are with valid return entries from ORES API out of 47,197 rows of `page_df`.

Already filtered for `NaN` during querying ORES API.

In [139]:
merge_df = main_df.merge(page_df, on='rev_id').merge(population_df, left_on='country', right_on='Geography').drop(['Geography'], axis=1)
merge_df.rename(columns={
    'rev_id': 'revision_id',
    'page': 'article_name',
    'prediction': 'article_quality',
}, inplace=True)
print(merge_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44973 entries, 0 to 44972
Data columns (total 11 columns):
article_quality      44973 non-null object
probability_B        44973 non-null float64
probability_C        44973 non-null float64
probability_FA       44973 non-null float64
probability_GA       44973 non-null float64
probability_Start    44973 non-null float64
probability_Stub     44973 non-null float64
revision_id          44973 non-null int64
article_name         44973 non-null object
country              44973 non-null object
population           44973 non-null float64
dtypes: float64(7), int64(1), object(3)
memory usage: 4.1+ MB
None


In [140]:
merge_df.sample(3)

Unnamed: 0,article_quality,probability_B,probability_C,probability_FA,probability_GA,probability_Start,probability_Stub,revision_id,article_name,country,population
44188,Start,0.042147,0.113943,0.002605,0.0055,0.773462,0.062343,755761395,Fazıl Önder,cyprus,1.2
29585,Stub,0.005747,0.010121,0.00081,0.002486,0.032476,0.948361,763711224,James Shanks,new zealand,4.9
9668,Stub,0.012138,0.018977,0.001393,0.002762,0.317063,0.647667,721515580,José Eugenio Tello,argentina,44.5


In [141]:
merge_df.to_csv('final_data.csv') # Save data to main repo

4. Analysis
---

### Calculate the proportion of politician articles-per-population for each country

In [155]:
"""
Steps:
    - Group by country and population and count number of articles
    - Divide the num_article by population
"""
articles_per_pop_df = merge_df.groupby(['country', 'population'])['revision_id'].count().reset_index().rename(columns={'revision_id': 'num_article'})
articles_per_pop_df['articles_per_population_perc'] = articles_per_pop_df['num_article'] / (articles_per_pop_df['population'] * 10**6) * 100.0

### "10 highest-ranked countries in terms of number of politician articles as a proportion of country population"

In [199]:
HTML(articles_per_pop_df.sort_values(by='articles_per_population_perc', ascending=False).head(10).to_html(index=False))

country,population,num_article,articles_per_population_perc
tuvalu,0.01,55,0.55
nauru,0.01,53,0.53
san marino,0.03,82,0.273333
monaco,0.04,40,0.1
liechtenstein,0.04,29,0.0725
tonga,0.1,63,0.063
marshall islands,0.06,37,0.061667
iceland,0.4,206,0.0515
andorra,0.08,34,0.0425
federated states of micronesia,0.1,38,0.038


### "10 lowest-ranked countries in terms of number of politician articles as a proportion of country population"

In [198]:
HTML(articles_per_pop_df.sort_values(by='articles_per_population_perc', ascending=True).head(10).to_html(index=False))

country,population,num_article,articles_per_population_perc
india,1371.3,986,7.2e-05
indonesia,265.2,214,8.1e-05
china,1393.8,1135,8.1e-05
uzbekistan,32.9,29,8.8e-05
ethiopia,107.5,105,9.8e-05
zambia,17.7,25,0.000141
"korea, north",25.6,39,0.000152
thailand,66.2,112,0.000169
bangladesh,166.4,323,0.000194
mozambique,30.5,60,0.000197


### Calculate the proportion of high-quality articles for each country

In [177]:
"""
Steps:
    - Create high_quality_df only filter for article rows with FA and HA ratings
    - Group by country of quality_df to obtain num_high_quality_article
    - Group by country of the master merge_df to obtain num_article
    - Divide num_high_quality_article by num_article to get proportions
"""
high_quality_df = merge_df[(merge_df['article_quality'] == 'FA') | (merge_df['article_quality'] == 'GA')]

In [186]:
summary_df = high_quality_df.groupby(['country'])['revision_id'].count().reset_index().rename(columns={'revision_id': 'num_high_quality_article'}).merge(merge_df.groupby(['country'])['revision_id'].count().reset_index().rename(columns={'revision_id': 'num_article'}), on='country')
summary_df['high_quality_perc'] = summary_df['num_high_quality_article'] / summary_df['num_article'] * 100.0

### "10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country"

In [197]:
HTML(summary_df.sort_values(by='high_quality_perc', ascending = False).head(10).to_html(index=False))

country,num_high_quality_article,num_article,high_quality_perc
"korea, north",7,39,17.948718
saudi arabia,16,119,13.445378
central african republic,8,68,11.764706
romania,40,348,11.494253
mauritania,5,52,9.615385
bhutan,3,33,9.090909
tuvalu,5,55,9.090909
dominica,1,12,8.333333
united states,82,1092,7.509158
benin,7,94,7.446809


### "10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country"

In [196]:
HTML(summary_df.sort_values(by='high_quality_perc', ascending = True).head(10).to_html(index=False))

country,num_high_quality_article,num_article,high_quality_perc
tanzania,1,408,0.245098
peru,1,354,0.282486
lithuania,1,248,0.403226
nigeria,3,682,0.439883
morocco,1,208,0.480769
fiji,1,199,0.502513
bolivia,1,187,0.534759
brazil,3,551,0.544465
luxembourg,1,180,0.555556
sierra leone,1,166,0.60241


### !!! CONTINUE TO `README.md` FOR DETAILED WRITE-UP !!!