# DATA 512 Homework 2: Considering Bias in Data

### Step 1: Getting the article and population data

In [4]:
import pandas as pd
# These are standard python modules
import json, time, urllib.parse
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests


poli = pd.read_csv("../raw_data/poli_clean.csv") # article names, Wikipedia url, and country
pop = pd.read_csv("../raw_data/pop_by_country.csv") # Geography (i.e. countries, regions, and sub-regions), population (millions)

### Data Cleaning

In [5]:
poli[poli.duplicated(['name', 'url', 'country'], keep = 'first')]

Unnamed: 0,name,url,country
6295,Abdirahman Aw Ali Farrah,https://en.wikipedia.org/wiki/Abdirahman_Aw_Al...,Somalia
6309,Ibrahim Megag Samatar,https://en.wikipedia.org/wiki/Ibrahim_Megag_Sa...,Somalia


There are 2 unique instances that have the same name, url, and country:    
Abdirahman Aw Ali Farrah, Ibrahim Megag Samatar

Here we create a new politician dataframe containing no duplicates using the latest records:

In [6]:
poli_clean = poli[~poli.duplicated(['name', 'url', 'country'], keep = 'last')]

### Step 2: Getting Article Quality Predictions

Here we set up the templates for the information we want to retrieve from the API and load our cleaned politician dataset to use as the article titles.

In [7]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': 'mh808@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = poli_clean["name"]

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
#PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

This function helps us to retrieve the page information per article. We are specifically interested in using this function to obtain 'revid.'

In [8]:
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_title: return None
    
    request_template['titles'] = article_title
        
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response['query']['pages']

Here we are requesting article page information and storing the revid in a dictionary with the article name being the associated key (takes about 30 minutes). Prints out article titles that are not found as of 10/13/2022 3:30pm.

In [9]:
article_info = {}
for person in ARTICLE_TITLES:
    info = request_pageinfo_per_article(person)
    if 'lastrevid' in info[list(info.keys())[0]]:
        article_info[person] = info[list(info.keys())[0]]['lastrevid']
    else:
        print(person + " not found.")
article_info

Prince Ofosu Sefah not found.
Harjit Kaur Talwandi not found.
Abd al-Razzaq al-Hasani not found.
Abiodun Abimbola Orekoya not found.
Roman Konoplev not found.


{'Shahjahan Noori': 1099689043,
 'Abdul Ghafar Lakanwal': 943562276,
 'Majah Ha Adrif': 852404094,
 'Haroon al-Afghani': 1095102390,
 'Tayyab Agha': 1104998382,
 'Ahmadullah Wasiq': 1109361754,
 'Aziza Ahmadyar': 1087211008,
 'Muqadasa Ahmadzai': 1082489593,
 'Mohammad Sarwar Ahmedzai': 1038918070,
 'Amir Muhammad Akhundzada': 1069322182,
 'Nasrullah Baryalai Arsalai': 1095526840,
 'Mohammad Asim Asim': 1013838830,
 'Atiqullah Atifmal': 1112407669,
 'Abdul Rahim Ayoubi': 1108886061,
 'Alhaj Mutalib Baig': 1111494041,
 'Ismael Balkhi': 1112534409,
 'Abdul Baqi Turkistani': 889226470,
 'Mohammad Ghous Bashiri': 1102150221,
 'Abas Basir': 1098419766,
 'Jan Baz': 997027082,
 'Ahmad Behzad': 1103948295,
 'Bashir Ahmad Bezan': 1060707209,
 'Rafiullah Bidar': 977208323,
 'Mohammad Siddiq Chakari': 1105913099,
 'Cheragh Ali Cheragh': 1087211968,
 'Nasir Ahmad Durrani': 988838315,
 'Elay Ershad': 1102489654,
 'Muhammad Hashim Esmatullahi': 949986748,
 'Ezatullah (Nangarhar)': 947885788,
 'Aimal

Here we set up the templates to obtain the ORES score information from the API.
We also pass in article_info into ARTICLE_REVISIONS.

In [10]:
#########
#
#    CONSTANTS
#

# The current ORES API endpoint
API_ORES_SCORE_ENDPOINT = "https://ores.wikimedia.org/v3"
# A template for mapping to the URL
API_ORES_SCORE_PARAMS = "/scores/{context}/{revid}/{model}"

# Use some delays so that we do not hammer the API with our requests
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': 'mh808@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2022'
}

# A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
ARTICLE_REVISIONS = article_info

# This template lists the basic parameters for making an ORES request
ORES_PARAMS_TEMPLATE = {
    "context": "enwiki",        # which WMF project for the specified revid
    "revid" : "",               # the revision to be scored - this will probably change each call
    "model": "articlequality"   # the AI/ML scoring model to apply to the reviewion
}

This function helps us to retrieve the ORES score per article. We are specifically interested in finding the predicted quality score per article.

In [11]:
def request_ores_score_per_article(article_revid = None, 
                                   endpoint_url = API_ORES_SCORE_ENDPOINT, 
                                   endpoint_params = API_ORES_SCORE_PARAMS, 
                                   request_template = ORES_PARAMS_TEMPLATE,
                                   headers = REQUEST_HEADERS,
                                   features=False):
    # Make sure we have an article revision id
    if not article_revid: return None
    
    # set the revision id into the template
    request_template['revid'] = article_revid
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # the features used by the ML model can sometimes be returned as well as scores
    if features:
        request_url = request_url+"?features=true"
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response['enwiki']['scores']

Retrieving ORES score information for each article and converting the output to a dataframe (also takes about 30 minutes).

In [23]:
score_info = {}
for person, revid in ARTICLE_REVISIONS.items():
   result = request_ores_score_per_article(revid)
   for key in list(result.keys()):
      score_info[person] = result[key]["articlequality"]["score"]["prediction"]

score_df = pd.DataFrame.from_dict(score_info, orient = "index", columns = ["prediction"])
score_df.reset_index(inplace = True)
score_df.rename({"index": "name"}, axis=1)

Unnamed: 0,name,prediction
0,Shahjahan Noori,GA
1,Abdul Ghafar Lakanwal,Start
2,Majah Ha Adrif,Start
3,Haroon al-Afghani,B
4,Tayyab Agha,Start
...,...,...
7524,Rekayi Tangwena,Stub
7525,Josiah Tongogara,C
7526,Langton Towungana,Stub
7527,Herbert Ushewokunze,Stub


### Step 3: Combining the Datasets

Here we create a dictionary to map regions to countries then create a list to add as a column into the final dataframe. Additionally, we create a list for all countries in the population dataset.

In [24]:
art_revid = pd.DataFrame.from_dict(ARTICLE_REVISIONS, orient = "index", columns = ["revid"])
art_revid.reset_index(inplace = True)

In [26]:
region_dict = {}
region = ''
country_list = []
for i in pop["Geography"]:
    if i.isupper():
        region = i
        region_dict[i] = []
    else:
        region_dict[region] += [i]
        country_list.append(i)

region_list = []
for i in poli_clean["country"]:
    flag = False
    for key, val in region_dict.items():
        if i in val:
            region_list.append(key)
            flag = True
    if not flag:
        region_list.append('None') #append None as a placeholder

poli_clean["region"] = region_list #adding region as a column into the poli_clean dataframe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Here we are finding countries that don't have a match and add them to a list called "missing_countries."

In [27]:
match = pd.merge(poli_clean, pop, how = 'outer', left_on = "country", right_on = "Geography",indicator = True)

left_only = match[match["_merge"]=="left_only"]
right_only = match[match["_merge"]=="right_only"]

missing_countries = []
left_only_unique = left_only.groupby("country")['country'].unique()

for i in left_only_unique:
    missing_countries.append(i[0])

for i in right_only["Geography"]:
    if i in country_list:
        missing_countries.append(i)

Writing to a text file the missing countries.

In [450]:
with open('wp_countries-no_match.txt', 'w') as f:
    f.write("\n".join(missing_countries))

Setting up the final dataframe and outputting csv.

In [28]:
poli_score = poli_clean.merge(score_df, how = "inner", left_on = "name", right_on = "index")
poli_score_revid = pd.merge(poli_score, art_revid, how = 'inner', left_on = ['name'], right_on = ['index'])
poli_score_revid_region = pd.merge(poli_score_revid, pop, how = 'inner', left_on = ['country'], right_on ="Geography")
poli_score_revid_region = poli_score_revid_region.drop(["index_x", "index_y", "Geography", "url"], axis = 1)
df = poli_score_revid_region.dropna(axis = 0).rename({"name": "article_name", "prediction": "article_quality", "revid": "revision_id", "Population (millions)": "population"}, axis = 1)
df.to_csv('./outputs/wp_politicians_by_country.csv')


Unnamed: 0,article_name,country,region,article_quality,revision_id,population
0,Shahjahan Noori,Afghanistan,SOUTH ASIA,GA,1099689043,41.1
1,Abdul Ghafar Lakanwal,Afghanistan,SOUTH ASIA,Start,943562276,41.1
2,Majah Ha Adrif,Afghanistan,SOUTH ASIA,Start,852404094,41.1
3,Haroon al-Afghani,Afghanistan,SOUTH ASIA,B,1095102390,41.1
4,Tayyab Agha,Afghanistan,SOUTH ASIA,Start,1104998382,41.1
...,...,...,...,...,...,...
7502,Rekayi Tangwena,Zimbabwe,EASTERN AFRICA,Stub,1073818982,16.3
7503,Josiah Tongogara,Zimbabwe,EASTERN AFRICA,C,1106932400,16.3
7504,Langton Towungana,Zimbabwe,EASTERN AFRICA,Stub,904246837,16.3
7505,Herbert Ushewokunze,Zimbabwe,EASTERN AFRICA,Stub,959111842,16.3


### Step 4 and 5: Analysis and Results

For the purposes of our analysis, we will exclude countries that have a population of 0. We calculate the articles per capita by obtaining the country counts and dividing it by each country's mean population.

1. Top 10 countries by coverage: The 10 countries with the highest total articles per capita (in descending order)

In [35]:
df_clean = df[df["population"]>0]

cov_countries = df_clean.groupby(['country'])['country'].count().div(df_clean.groupby(['country'])['population'].mean() * 1000000)
cov_countries.sort_values(ascending = False)[0:10].to_frame(name = ("total_articles_per_capita"))

Unnamed: 0_level_0,total_articles_per_capita
country,Unnamed: 1_level_1
Antigua and Barbuda,0.00017
Federated States of Micronesia,0.00013
Andorra,0.0001
Barbados,9.3e-05
Marshall Islands,9e-05
Seychelles,6e-05
Montenegro,6e-05
Luxembourg,5.3e-05
Bhutan,5.1e-05
Grenada,5e-05


2. Bottom 10 countries by coverage: The 10 countries with the lowest total articles per capita (in ascending order

In [37]:
bot_10_countries = cov_countries.sort_values(ascending = True)[0:10]
bot_10_countries.to_frame(name = "total_articles_per_capita")

Unnamed: 0_level_0,total_articles_per_capita
country,Unnamed: 1_level_1
China,1.392176e-09
Mexico,7.843137e-09
Saudi Arabia,8.174387e-08
Romania,1.052632e-07
India,1.255998e-07
Sri Lanka,1.339286e-07
Egypt,1.352657e-07
Ethiopia,2.025932e-07
Taiwan,2.155172e-07
Vietnam,2.716298e-07


3. Top 10 countries by high quality: The 10 countries with the highest high quality articles per capita (in descending order). We filter on 'FA' and 'GA' ratings since we only want to evaluate 'high' ratings.

In [39]:
df_hq = df_clean[df_clean['article_quality'].isin(['FA', 'GA'])]

hq_countries = df_hq.groupby(['country'])['country'].count().div(df_hq.groupby(['country'])['population'].mean() * 1000000)
top_10_hq = hq_countries.sort_values(ascending = False)[0:10]
top_10_hq.to_frame(name = "high_quality_articles_per_capita")


Unnamed: 0_level_0,high_quality_articles_per_capita
country,Unnamed: 1_level_1
Andorra,2e-05
Montenegro,5e-06
Albania,2.142857e-06
Suriname,1.666667e-06
Bosnia-Herzegovina,1.470588e-06
Lithuania,1.071429e-06
Croatia,1.052632e-06
Slovenia,9.52381e-07
Palestinian Territory,9.259259e-07
Gabon,8.333333e-07


4. Bottom 10 countries by high quality: The 10 countries with the lowest high quality articles per capita (in ascending order)

In [40]:
bot_10_hq = hq_countries.sort_values(ascending = True)[0:10]
bot_10_hq.to_frame(name = "high_quality_articles_per_capita")

Unnamed: 0_level_0,high_quality_articles_per_capita
country,Unnamed: 1_level_1
India,4.2337e-09
Thailand,1.497006e-08
Japan,1.601281e-08
Nigeria,1.830664e-08
Vietnam,2.012072e-08
Colombia,2.03666e-08
Uganda,2.118644e-08
Pakistan,2.120441e-08
Sudan,2.132196e-08
Iran,2.257336e-08


5. Geographic regions by total coverage: A rank ordered list of geographic regions (in descending order) by total articles per capita. We add a column for region population for the next two tasks.

In [44]:
df_reg = df[df["region"]!="None"]

df_reg = df_reg.merge(pop, how = "inner", left_on = "region", right_on = "Geography")

df_reg.rename(columns = {"Population (millions)": "region_population", "population": "country_population"}, inplace = True)

cov_region = df_reg.groupby(['region'])['region'].count().div(df_reg.groupby(['region'])['region_population'].mean() * 1000000)
cov_region.sort_values(ascending = False).to_frame(name = "total_articles_per_capita")


Unnamed: 0_level_0,total_articles_per_capita
region,Unnamed: 1_level_1
SOUTHERN EUROPE,5.89404e-06
CARIBBEAN,4.568182e-06
WESTERN EUROPE,3.548223e-06
EASTERN EUROPE,2.560976e-06
NORTHERN EUROPE,2.448598e-06
WESTERN ASIA,2.333333e-06
OCEANIA,1.954545e-06
SOUTHERN AFRICA,1.710145e-06
EASTERN AFRICA,1.369979e-06
CENTRAL ASIA,1.358974e-06


6. Geographic regions by high quality coverage: Rank ordered list of geographic regions (in descending order) by high quality articles per capita

In [47]:
df_reg_hq = df_reg[df_reg['article_quality'].isin(['FA', 'GA'])]
hq_region = df_reg_hq.groupby(['region'])['region'].count().div(df_reg_hq.groupby(['region'])['region_population'].mean() * 1000000)
hq_region.sort_values(ascending = False).to_frame(name = "high_quality_articles_per_capita")

Unnamed: 0_level_0,high_quality_articles_per_capita
region,Unnamed: 1_level_1
SOUTHERN EUROPE,3.046358e-07
CARIBBEAN,1.818182e-07
EASTERN EUROPE,1.324042e-07
WESTERN EUROPE,1.116751e-07
WESTERN ASIA,9.52381e-08
NORTHERN EUROPE,7.476636e-08
SOUTHERN AFRICA,5.797101e-08
CENTRAL AMERICA,5.617978e-08
OCEANIA,4.545455e-08
CENTRAL ASIA,3.846154e-08
