In [1]:
import pandas as pd
import requests
import json
import ast

----

## Cleaning the data

In [2]:
page_data = pd.read_csv('source-data/page_data.csv')
WPDS_2018_data = pd.read_csv('source-data/WPDS_2018_data.csv')

page_data.head()

In [4]:
WPDS_2018_data.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


In [5]:
# Dropping the wiki pages starting with 'Template:'

page_data = page_data[~page_data.page.str.startswith('Template:')].reset_index(drop=True)

In [126]:
WPDS_2018_data_region

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284
1,NORTHERN AMERICA,365
2,LATIN AMERICA AND THE CARIBBEAN,649
3,ASIA,4536
4,EUROPE,746
5,OCEANIA,41


In [141]:
# Separating Countries and region data from WPDS data

WPDS_2018_data_region = WPDS_2018_data[WPDS_2018_data.Geography.str.isupper()].reset_index(drop=True)
# Converting values to float
WPDS_2018_data_region['Population mid-2018 (millions)'] = \
WPDS_2018_data_region['Population mid-2018 (millions)'].str.replace(',', '').astype(float)

WPDS_2018_data_country = WPDS_2018_data[~WPDS_2018_data.Geography.str.isupper()].reset_index(drop=True)

# Since the values for population are in string, we need to convert them to float for later use
WPDS_2018_data_country['Population mid-2018 (millions)'] = \
WPDS_2018_data_country['Population mid-2018 (millions)'].str.replace(',', '').astype(float)

---

### Getting article quality predictions

In [48]:
HEADERS = {'User-Agent' : 'https://github.com/nmnshrma', 'From' : 'namans3@uw.edu'}

def fetch_ores_response(revision_ids, headers):
    """
    fetches ORES response for the ORES API, for a set of revision IDs
    
    :param revision_ids: list of ids to be fetched
    :param headers: HEADERS for the API call
    
    :returns: nested dict object with ORES API response
    """
    
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'
    
    params = {'project' : 'enwiki',
              'model'   : 'wp10',
              'revids'  : '|'.join(str(x) for x in revision_ids)
              }
    api_call = requests.get(endpoint.format(**params))
    response = api_call.json()
    
    return response

In [84]:
def store_and_read(store_loc, action, dict_={}):
    """
    helper function to read and write dict objects to a location
    
    NOTE: The following ONLY writes if there is non-empty dict
    
    :param store_loc: location for the file store/read
    :param action: list of ids to be fetched
    :param dict_: key-value dict to be stored 
    
    :returns: (for read) dict objects
    """
    
    
    if action=='write' and dict_:
        with open(store_loc, 'a+') as f:
            f.write(str(dict_))
        f.close()
        return None
    elif action=='read':
        with open(store_loc, 'r') as f:
            s = f.read()
            dict_ = ast.literal_eval(s)
        f.close()
        return dict_
    else:
        raise ValueError("action value must be read/write")

In [85]:
def fetch_ores_response_batchwise(revision_ids, headers, perc_split, store_loc):
    """
    splits a large set of revision IDs to gather and clean pertinent responses
    from the ORES API
    
    Calls fetch_ores_response method for a fraction of revision IDs. 
    Fraction of revision IDs to be sent are decided through perc_split
    
    :param revision_ids: list of ids to be fetched
    :param headers: HEADERS for the API call
    :param perc_split: fraction of revision_ids to be used
    :param store_loc: the store loc for the dict object to be written/read
    
    :returns: nested dict object with key-value pair
    """
    
    # Shorten the list that have already been read and stored 
    ignore_list = store_and_read(store_loc=store_loc, action='read')
    revision_ids = [i for i in revision_ids if i not in ignore_list] 
    
    # helper values for the API calls
    # Batch size decides the chunk size for the API call
    n = len(revision_ids)
    batch_size = n//perc_split
    
    data_dict = {}
    
    for i in range(0, n, batch_size):
        # sends a batch at once
        data = fetch_ores_response(revision_ids[i:i+batch_size], headers=headers) 
        for key, val in data['enwiki']['scores'].items():
            data_dict[key] = 'NA' if 'error' in val.get('wp10') else val.get('wp10', 'NA').get('score', 'NA').get('prediction', 'NA')
            store_and_read(dict_={key:data_dict[key]}, store_loc=store_loc, action='write')
    
    # Return dict object contains: {Rev_id: Prediction}
    return data_dict

In [None]:
score_map= fetch_ores_response_batchwise(revision_ids=page_data.rev_id,
                                        headers=HEADERS,
                                        perc_split=500,
                                        store_loc='results-data/quality-map.txt')

In [94]:
## Maps the quality score from 'score_map' 

page_data['quality_score'] = page_data.rev_id.map(lambda x: score_map.get(str(x), 'NA'))

----

### Combining the datasets

In [142]:
# STEPWISE Preparation for the data

# Only take articles who have a legitimate quality score
final_page_data = page_data[page_data.quality_score != 'NA']


# Inner join to merge file with country, so as to attach populations
final_page_data = final_page_data.merge(WPDS_2018_data_country, how='inner', 
                                        left_on='country', right_on='Geography')

# Remove redundant columns
final_page_data = final_page_data[['page', 'country', 'rev_id', 'quality_score','Population mid-2018 (millions)']]

# Column rename and reshuffle as per the instructions
final_page_data.rename(columns={"page": "article_name", 
                               "quality_score": "article_quality",
                               "rev_id": "revision_id",
                               "Population mid-2018 (millions)": "population"},
                      inplace = True)
final_page_data = final_page_data[['country', 'article_name', 'revision_id', 'article_quality', 'population']]

final_page_data.to_csv('results-data/wiki_page_merged.csv', index=False)

final_page_data.head()

Unnamed: 0,country,article_name,revision_id,article_quality,population
0,Chad,Bir I of Kanem,355319463,Stub,15.4
1,Chad,Abdullah II of Kanem,498683267,Stub,15.4
2,Chad,Salmama II of Kanem,565745353,Stub,15.4
3,Chad,Kuri I of Kanem,565745365,Stub,15.4
4,Chad,Mohammed I of Kanem,565745375,Stub,15.4


----

### Analysis
Your analysis will consist of calculating the proportion (as a percentage) of articles-per-population and high-quality articles for each country AND for each geographic region. By "high quality" articles, in this case we mean the number of articles about politicians in a given country that ORES predicted would be in either the "FA" (featured article) or "GA" (good article) classes.

#### Examples:
1. if a country has a population of 10,000 people, and you found 10 articles about politicians from that country, then the percentage of articles-per-population would be .1%.
2. if a country has 10 articles about politicians, and 2 of them are FA or GA class articles, then the percentage of high-quality articles would be 20%.


**Results format**
Your results from this analysis will be published in the form of data tables. You are being asked to produce six total tables, that show:

In [243]:
# Generating a helper column that has 1 for good quality articles and 0 for bad

final_page_data.loc[:,'high_quality'] = final_page_data.article_quality.map(lambda x: 
                                                                            1 if x in ['GA', 'FA'] else 0)

In [146]:
final_page_data.dtypes

country             object
article_name        object
revision_id          int64
article_quality     object
population         float64
high_quality         int64
dtype: object

In [210]:
country_group_data = final_page_data.groupby('country').agg({'revision_id':'count', 
                                                             'high_quality':'mean', 
                                                             'population': 'median'}).reset_index()

In [219]:
country_group_data.rename(columns = {'revision_id':'articles'},
                          inplace=True)
country_group_data.head()

Unnamed: 0,country,articles,high_quality,population
0,Afghanistan,320,0.0375,36.5
1,Albania,457,0.006565,2.9
2,Algeria,116,0.017241,42.7
3,Andorra,34,0.0,0.08
4,Angola,106,0.0,30.4


In [220]:
country_group_data.loc[:, 'coverage'] = \
country_group_data.articles/country_group_data.population

In [221]:
country_group_data.sort_values(by='coverage', ascending=False).head(n=10)

Unnamed: 0,country,articles,high_quality,population,coverage
166,Tuvalu,54,0.092593,0.01,5400.0
115,Nauru,52,0.0,0.01,5200.0
135,San Marino,81,0.0,0.03,2700.0
108,Monaco,40,0.0,0.04,1000.0
93,Liechtenstein,28,0.0,0.04,700.0
161,Tonga,63,0.0,0.1,630.0
103,Marshall Islands,37,0.0,0.06,616.666667
68,Iceland,201,0.00995,0.4,502.5
3,Andorra,34,0.0,0.08,425.0
61,Grenada,36,0.027778,0.1,360.0


In [222]:
country_group_data.sort_values(by='coverage', ascending=True).head(n=10)

Unnamed: 0,country,articles,high_quality,population,coverage
69,India,980,0.017347,1371.3,0.71465
70,Indonesia,210,0.047619,265.2,0.791855
34,China,1130,0.036283,1393.8,0.810733
173,Uzbekistan,28,0.071429,32.9,0.851064
51,Ethiopia,101,0.019802,107.5,0.939535
82,"Korea, North",36,0.194444,25.6,1.40625
178,Zambia,25,0.0,17.7,1.412429
159,Thailand,112,0.026786,66.2,1.691843
112,Mozambique,58,0.0,30.5,1.901639
13,Bangladesh,319,0.009404,166.4,1.917067


In [223]:
country_group_data.loc[:, 'relative_quality'] = \
(country_group_data.articles*country_group_data.high_quality)/country_group_data.articles

In [224]:
country_group_data.sort_values(by='relative_quality', ascending=False).head(n=10)

Unnamed: 0,country,articles,high_quality,population,coverage,relative_quality
82,"Korea, North",36,0.194444,25.6,1.40625,0.194444
137,Saudi Arabia,118,0.127119,33.4,3.532934,0.127119
104,Mauritania,48,0.125,4.5,10.666667,0.125
31,Central African Republic,66,0.121212,4.7,14.042553,0.121212
132,Romania,343,0.113703,19.5,17.589744,0.113703
166,Tuvalu,54,0.092593,0.01,5400.0,0.092593
19,Bhutan,33,0.090909,0.8,41.25,0.090909
44,Dominica,12,0.083333,0.07,171.428571,0.083333
155,Syria,128,0.078125,18.3,6.994536,0.078125
18,Benin,91,0.076923,11.5,7.913043,0.076923


In [225]:
country_group_data.sort_values(by='relative_quality', ascending=True).head(n=10)

Unnamed: 0,country,articles,high_quality,population,coverage,relative_quality
143,Slovakia,116,0.0,5.4,21.481481,0.0
114,Namibia,162,0.0,2.5,64.8,0.0
30,Cape Verde,37,0.0,0.6,61.666667,0.0
112,Mozambique,58,0.0,30.5,1.901639,0.0
38,Costa Rica,147,0.0,5.0,29.4,0.0
108,Monaco,40,0.0,0.04,1000.0,0.0
43,Djibouti,37,0.0,1.0,37.0,0.0
107,Moldova,423,0.0,3.5,120.857143,0.0
167,Uganda,185,0.0,44.1,4.195011,0.0
49,Eritrea,16,0.0,6.0,2.666667,0.0


---

In [198]:
WPDS_2018_data.loc[:,'geo_region'] = None

geo_regions = list(WPDS_2018_data_region.Geography)

for idx, geo_region in enumerate(geo_regions):
    i = int(WPDS_2018_data.index[WPDS_2018_data.Geography== geo_region][0])
    
    if geo_region != geo_regions[-1]:
        region_next = geo_regions[idx+1]
        i_next = int(WPDS_2018_data.index[WPDS_2018_data.Geography== region_next][0])
        WPDS_2018_data.loc[i:i_next, 'geo_region'] = geo_region
    else:
        WPDS_2018_data.loc[i:, 'geo_region'] = geo_region
    

In [205]:
country_region_map = dict(list(zip(WPDS_2018_data.Geography,
                                   WPDS_2018_data.geo_region)))

In [207]:
final_page_data.loc[:, 'region'] = final_page_data.country.map(lambda x: country_region_map.get(x,
                                                                                               'None'))

In [237]:
region_group_data = final_page_data.groupby('region').\
agg({'revision_id':'count',
     'high_quality':'mean', 
     'population': 'sum'}).reset_index()

In [238]:
region_group_data
region_group_data.head()

Unnamed: 0,region,revision_id,high_quality,population
0,AFRICA,6851,0.018246,312553.0
1,ASIA,11531,0.026884,3585935.9
2,EUROPE,15864,0.020298,553064.87
3,LATIN AMERICA AND THE CARIBBEAN,5169,0.013349,328578.64
4,NORTHERN AMERICA,1921,0.051536,384943.6


In [239]:
region_group_data.rename(columns = {'revision_id':'articles'}, inplace =True)

In [240]:
region_group_data.loc[:, 'coverage'] = \
region_group_data.articles/country_group_data.population

region_group_data.sort_values(by='coverage', ascending=False).head(n=10)

Unnamed: 0,region,articles,high_quality,population,coverage
3,LATIN AMERICA AND THE CARIBBEAN,5169,0.013349,328578.64,64612.5
5,OCEANIA,3128,0.0211,43095.58,31280.0
1,ASIA,11531,0.026884,3585935.9,3976.206897
2,EUROPE,15864,0.020298,553064.87,371.522248
0,AFRICA,6851,0.018246,312553.0,187.69863
4,NORTHERN AMERICA,1921,0.051536,384943.6,63.190789


In [241]:
region_group_data.loc[:, 'relative_quality'] = \
(region_group_data.articles*region_group_data.high_quality)/region_group_data.articles

region_group_data.sort_values(by='relative_quality', ascending=False).head(n=10)

Unnamed: 0,region,articles,high_quality,population,coverage,relative_quality
4,NORTHERN AMERICA,1921,0.051536,384943.6,63.190789,0.051536
1,ASIA,11531,0.026884,3585935.9,3976.206897,0.026884
5,OCEANIA,3128,0.0211,43095.58,31280.0,0.0211
2,EUROPE,15864,0.020298,553064.87,371.522248,0.020298
0,AFRICA,6851,0.018246,312553.0,187.69863,0.018246
3,LATIN AMERICA AND THE CARIBBEAN,5169,0.013349,328578.64,64612.5,0.013349


---

Writeup: reflections and implications
Write a few paragraphs, either in the README or at the end of the notebook, reflecting on what you have learned, what you found, what (if anything) surprised you about your findings, and/or what theories you have about why any biases might exist (if you find they exist). You can also include any questions this assignment raised for you about bias, Wikipedia, or machine learning.

In addition to any reflections you want to share about the process of the assignment, please respond (briefly) to at least three of the questions below:

**What biases did you expect to find in the data (before you started working with it), and why?**

**What (potential) sources of bias did you discover in the course of your data processing and analysis?**

**What might your results suggest about (English) Wikipedia as a data source?**