In [2]:
import json, time, urllib.parse
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import config
%matplotlib inline

Step 1: Getting the Article and Population Data
- Politician Data : politicians_by_country_AUG.2024.csv gathered by crawling Wikipedia Category:Politicians by nationality to generate a list of Wikipedia article pages about politicians from a wide range of countries
- Population Data : population_by_country_AUG.2024.csv downloaded from the world population data sheet published by the Population Reference Bureau.

In [3]:
# Reading csvs into dataframes 

pol_df = pd.read_csv("politicians_by_country_AUG.2024.csv")
pop_df = pd.read_csv("population_by_country_AUG.2024.csv")

In [4]:
# Examining dataframes
pol_df.head(3)

Unnamed: 0,name,url,country
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan


In [5]:
pol_df.describe() # Insights: There are only 7111 unique titles and hence 7111 unique urls, everything else is a repeat of some sort.

Unnamed: 0,name,url,country
count,7155,7155,7155
unique,7111,7111,169
top,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Nigeria
freq,4,4,246


In [6]:
pop_df.head(3)

Unnamed: 0,Geography,Population
0,WORLD,8009.0
1,AFRICA,1453.0
2,NORTHERN AFRICA,256.0


In [7]:
pop_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Population,233.0,135.672532,651.513712,0.0,1.8,9.8,44.0,8009.0


In [7]:
pol_df[pol_df['name'] == 'Torokul Dzhanuzakov'] # Exploring entry duplicates based on describe() results

Unnamed: 0,name,url,country
3451,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Kazakhstan
3704,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Kyrgyzstan
6504,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Tajikistan
6937,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Uzbekistan


In [8]:
# Checking for duplicate entries

duplicate_rows_1 = pol_df[pol_df.duplicated(subset=['name','url'], keep=False)]
duplicate_rows_2 = pol_df[pol_df.duplicated(subset=['name','url','country'], keep=False)]
if not duplicate_rows_1.empty:
    print("Duplicate entries found with same names and urls but different countries")
else:
    print("No duplicate entries found.")
if not duplicate_rows_2.empty:
    print("Completely identical entries found")
else:
    print("Completely identical entries do not exist.")

Duplicate entries found with same names and urls but different countries
Completely identical entries do not exist.


Step 2: Getting Article Quality Predictions

2.1 - Get revision IDs using MediaWiki Action API to get the latest version of the politician's Wikipedia page 
2.2 - Use the title and revision ID to query the ORES API and return the page quality 

Note: Since there are repeated urls in the politicians datframe, we will use a downsized version of the dataframe so as to not query the APIs for the same urls and save time and resources.

In [9]:
small_pol_df = pol_df.drop_duplicates(subset=['url'],keep='first')
print(small_pol_df.shape)

(7111, 3)


In [15]:
#########
#
#    CONSTANTS
#

# The basic English Wikipedia API endpoint
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
API_HEADER_AGENT = 'User-Agent'

# We'll assume that there needs to be some throttling for these requests - we should always be nice to a free data resource
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making automated requests we should include something that is unique to the person making the request
# This should include an email - your UW email would be good to put in there
REQUEST_HEADERS = {
    'User-Agent': '<manyac@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024'
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = list(small_pol_df['name'])

# This is a string of additional page properties that can be returned see the Info documentation for
# what can be included. If you don't want any this can simply be the empty string
# PAGEINFO_EXTENDED_PROPERTIES = "talkid|url|watched|watchers"
PAGEINFO_EXTENDED_PROPERTIES = ""

# This template lists the basic parameters for making this
PAGEINFO_PARAMS_TEMPLATE = {
    "action": "query",
    "format": "json",
    "titles": "",           # to simplify this should be a single page title at a time
    "prop": "info",
    "inprop": PAGEINFO_EXTENDED_PROPERTIES
}

In [16]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None, 
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT, 
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    
    # article title can be as a parameter to the call or in the request_template
    if article_title:
        request_template['titles'] = article_title

    if not request_template['titles']:
        raise Exception("Must supply an article title to make a pageinfo request.")

    if API_HEADER_AGENT not in headers:
        raise Exception(f"The header data should include a '{API_HEADER_AGENT}' field that contains your UW email address.")

    if 'uwnetid@uw' in headers[API_HEADER_AGENT]:
        raise Exception(f"Use your UW email address in the '{API_HEADER_AGENT}' field.")

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or any other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=request_template)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [18]:
# Gather revision IDs for all titles

ores_dict = {}
unable_to_collect = {}
for politician in ARTICLE_TITLES:
    # print(f"Getting page info data for: {politician}")
    info = request_pageinfo_per_article(politician)
    for k,v in info['query']['pages'].items():
        # print(v['title'] == politician)
        try:
            ores_dict[v['title']] = v['lastrevid']
        except Exception as e:
            unable_to_collect[v['title']] = e
            continue

In [19]:
len(ores_dict), len(unable_to_collect)

(7103, 8)

In [38]:
# List of politicians without appropriate "lastrevid" field
list(unable_to_collect.keys())

['Barbara Eibinger-Miedl',
 'Mehrali Gasimov',
 'Kyaw Myint',
 'André Ngongang Ouandji',
 'Tomás Pimentel',
 'Richard Sumah',
 "Segun ''Aeroland'' Adewale",
 'Bashir Bililiqo']

2.2 - Use latest revision ID and article title to get article quality estimated through the ORES ML tool API

In [45]:
#########
#
#    CONSTANTS
#

#    The current LiftWing ORES API endpoint and prediction model
#
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    The throttling rate is a function of the Access token that you are granted when you request the token. The constants
#    come from dissecting the token and getting the rate limits from the granted token. An example of that is below.
#
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = ((60.0*60.0)/5000.0)-API_LATENCY_ASSUMED  # The key authorizes 5000 requests per hour

#    When making automated requests we should include something that is unique to the person making the request
#    This should include an email - your UW email would be good to put in there
#    
#    Because all LiftWing API requests require some form of authentication, you need to provide your access token
#    as part of the header too
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent': "<manyac@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024",
    'Content-Type': 'application/json',
    'Authorization': "Bearer {access_token}"
}
#
#    This is a template for the parameters that we need to supply in the headers of an API request
#
REQUEST_HEADER_PARAMS_TEMPLATE = {
    'email_address' : "",         # your email address should go here
    'access_token'  : ""          # the access token you create will need to go here
}

#
#    A dictionary of English Wikipedia article titles (keys) and sample revision IDs that can be used for this ORES scoring example
#
ARTICLE_REVISIONS = ores_dict

#
#    This is a template of the data required as a payload when making a scoring request of the ORES model
#
ORES_REQUEST_DATA_TEMPLATE = {
    "lang":        "en",     # required that its english - we're scoring English Wikipedia revisions
    "rev_id":      "",       # this request requires a revision id
    "features":    True
}

#
#    These are used later - defined here so they, at least, have empty values
#
USERNAME = "Manyac28"
ACCESS_TOKEN = config.Access_token
#

In [46]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT, 
                                   model_name = API_ORES_EN_QUALITY_MODEL, 
                                   request_data = ORES_REQUEST_DATA_TEMPLATE, 
                                   header_format = REQUEST_HEADER_TEMPLATE, 
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE):
    
    #    Make sure we have an article revision id, email and token
    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        request_data['rev_id'] = article_revid
    if email_address:
        header_params['email_address'] = email_address
    if access_token:
        header_params['access_token'] = access_token
    
    #   Making a request requires a revision id - an email address - and the access token
    if not request_data['rev_id']:
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not header_params['email_address']:
        raise Exception("Must provide an 'email_address' value")
    if not header_params['access_token']:
        raise Exception("Must provide an 'access_token' value")
    
    # Create the request URL with the specified model parameter - default is a article quality score request
    request_url = endpoint_url.format(model_name=model_name)
    
    # Create a compliant request header from the template and the supplied parameters
    headers = dict()
    for key in header_format.keys():
        headers[str(key)] = header_format[key].format(**header_params)
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free data
        # source like ORES - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        #response = requests.get(request_url, headers=headers)
        response = requests.post(request_url, headers=headers, data=json.dumps(request_data))
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response


In [56]:
# Requesting ORES label for each article and logging data unavailable errors

error_log = {}
quality_labels = {}
for article_title in ores_dict.keys():
#
# print(f"Getting LiftWing ORES scores for '{article_title}' with revid: {ARTICLE_REVISIONS[article_title]:d}")
#
#    Make the call, just pass in the article revision ID, email address, and access token
    score = request_ores_score_per_article(article_revid=ARTICLE_REVISIONS[article_title],
                                       email_address="manyac@uw.edu",
                                       access_token=ACCESS_TOKEN)
#
#    Output the result
    try:
        for k,v in score['enwiki']['scores'].items():
            try:
                quality_labels[article_title] = v['articlequality']['score']['prediction']
            except Exception as e:
                print(f"No label found for {article_title} due to error {e}")
                error_log[article_title] = e
    except:
        time.sleep(120)
        continue
#

In [64]:
# Checking if all articles have receieved a rating 
# len(error_log)
len(quality_labels) == len(ores_dict)

True

In [63]:
# Saving for the sake of posterity 

with open("quality_labels.json",'w') as f:
    json.dump(quality_labels,f)

In [71]:
# Printing the error rate 
print((len(error_log) + len(unable_to_collect))/len(small_pol_df)) # unable_to_collect contains articles that we didn't get a "lastrevid" for

0.0011250175783996624


Step 3: Combining the Datasets\
Desired output here is a csv with columns country, region, population, article_title, revision_id, article_quality

3.1 - Start by combining dictionaries ores_dict and quality_labels to get revision_id, article_quality and article_title in one dataframe \
3.2 - Since we used small_pol_df, use 3.1 to fill in the gaps in original pol_df \
3.3 - Merge with pop_df

In [81]:
combined_df_1 = pd.DataFrame({'article_quality': quality_labels, 'revision_id': ores_dict})
combined_df_1 = combined_df_1.reset_index().rename({'index':'article_title'},axis=1)
combined_df_1.head(3)

Unnamed: 0,article_title,article_quality,revision_id
0,Majah Ha Adrif,Start,1233202991
1,Haroon al-Afghani,B,1230459615
2,Tayyab Agha,Start,1225661708


In [106]:
# Merging article quality information with original pol_df

pol_df = pol_df.rename({'name':'article_title'},axis=1) # Renaming columns to facilitate merge

# Using 'left' join to retain all rows from pol_df and match ores data where possible, else NaN returned
article_data = pd.merge(pol_df,combined_df_1, on='article_title',how='left') 
article_data.head(3)

Unnamed: 0,article_title,url,country,article_quality,revision_id
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,Start,1233203000.0
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,B,1230460000.0
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,Start,1225662000.0


In [107]:
article_data.describe().T # Shows 7147 non-null values which is in line with observed 8 unable_to_collect values that will have NaN in article_quality and rveision_id

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
revision_id,7147.0,1209723000.0,54835040.0,395521877.0,1193148000.0,1230313000.0,1244075000.0,1251081000.0


In [151]:
# Get the unique country names from both datasets
wikipedia_countries = set(article_data['country'].unique())
population_countries = set(pop_df['Geography'].unique())

# Identify countries that are not present in the other dataset
countries_not_in_population = wikipedia_countries - population_countries
countries_not_in_wikipedia = population_countries - wikipedia_countries

# Combine the unmatched countries and write to a file
all_unmatched_countries = countries_not_in_population.union(countries_not_in_wikipedia)
all_unmatched_countries = {country for country in all_unmatched_countries if not country.isupper()} # Removing the regions

with open('wp_countries-no_match.txt', 'w') as f:
    for country in sorted(all_unmatched_countries):
        f.write(country + '\n')

In [152]:
# Create a mapping function that maps each country to it's region based on region closest to it / lowest in hierarchy

pop_dict = pop_df.to_dict(orient='records')

# Initialize variables
country_region_map = {}
current_region = None

# Iterate over the population data to create the mapping
for entry in pop_dict:
    geography = entry['Geography']
    if geography.isupper():  # It's a region if the name is in ALL CAPS
        current_region = geography
    else:
        country_region_map[geography] = current_region

# Verify mapping
print(country_region_map)

{'Algeria': 'NORTHERN AFRICA', 'Egypt': 'NORTHERN AFRICA', 'Libya': 'NORTHERN AFRICA', 'Morocco': 'NORTHERN AFRICA', 'Sudan': 'NORTHERN AFRICA', 'Tunisia': 'NORTHERN AFRICA', 'Western Sahara': 'NORTHERN AFRICA', 'Benin': 'WESTERN AFRICA', 'Burkina Faso': 'WESTERN AFRICA', 'Cape Verde': 'WESTERN AFRICA', "Cote d'Ivoire": 'WESTERN AFRICA', 'Gambia': 'WESTERN AFRICA', 'Ghana': 'WESTERN AFRICA', 'Guinea': 'WESTERN AFRICA', 'GuineaBissau': 'WESTERN AFRICA', 'Liberia': 'WESTERN AFRICA', 'Mali': 'WESTERN AFRICA', 'Mauritania': 'WESTERN AFRICA', 'Niger': 'WESTERN AFRICA', 'Nigeria': 'WESTERN AFRICA', 'Senegal': 'WESTERN AFRICA', 'Sierra Leone': 'WESTERN AFRICA', 'Togo': 'WESTERN AFRICA', 'Burundi': 'EASTERN AFRICA', 'Comoros': 'EASTERN AFRICA', 'Djibouti': 'EASTERN AFRICA', 'Eritrea': 'EASTERN AFRICA', 'Ethiopia': 'EASTERN AFRICA', 'Kenya': 'EASTERN AFRICA', 'Madagascar': 'EASTERN AFRICA', 'Malawi': 'EASTERN AFRICA', 'Mauritius': 'EASTERN AFRICA', 'Mayotte': 'EASTERN AFRICA', 'Mozambique': 'EA

In [166]:
# Merging article_data and population info from pop_df on column 'country'

pop_df_for_merge = pop_df.rename({'Geography':'country'},axis=1)

merged_df = pd.merge(article_data,pop_df_for_merge,on='country',how='inner')
merged_df.head()

Unnamed: 0,article_title,url,country,article_quality,revision_id,Population
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,Start,1233203000.0,42.4
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,B,1230460000.0,42.4
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,Start,1225662000.0,42.4
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,Stub,1234742000.0,42.4
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,Start,1195651000.0,42.4


In [167]:
# Add the 'region' column using the mapping
merged_df['region'] = merged_df['country'].map(country_region_map)
merged_df.head(3)

Unnamed: 0,article_title,url,country,article_quality,revision_id,Population,region
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,Start,1233203000.0,42.4,SOUTH ASIA
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,B,1230460000.0,42.4,SOUTH ASIA
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,Start,1225662000.0,42.4,SOUTH ASIA


In [199]:
merged_df.shape

(7013, 7)

In [170]:
# Save the csvs
merged_df.drop('url',axis=1).to_csv("wp_politicians_by_country.csv")

Step 4: Analysis \
4.1 - Calculate total articles per capita for (a) country and (b) region \
4.2 - Calculate high quality articles per capita for (a) country and (b) region 

Analysis 4.1 (a) : Country-wise total articles per capita

In [20]:
merged_df = pd.read_csv("wp_politicians_by_country.csv").drop("Unnamed: 0", axis=1)
merged_df.head(3)

Unnamed: 0,article_title,country,article_quality,revision_id,Population,region
0,Majah Ha Adrif,Afghanistan,Start,1233203000.0,42.4,SOUTH ASIA
1,Haroon al-Afghani,Afghanistan,B,1230460000.0,42.4,SOUTH ASIA
2,Tayyab Agha,Afghanistan,Start,1225662000.0,42.4,SOUTH ASIA


In [21]:
country_wise_article_coverage = merged_df.groupby('country').agg(article_count = ('country', 'size'),
                                                                       Population = ('Population', 'mean'),
                                                                       region = ('region', 'first')).reset_index()
# Per capita metrics 
country_wise_article_coverage["Article Count Per Million People"] = country_wise_article_coverage['article_count'] / country_wise_article_coverage['Population']

Result 1 : The top 10 countries with the highest total articles per capita

In [23]:
# Top 10 countries by coverage: The 10 countries with the highest total articles per capita

country_wise_article_coverage.sort_values(by='Article Count Per Million People',ascending=False).reset_index().drop(['article_count',
                                                                                                       'Population',
                                                                                                       'region','index'],axis=1).head(10)

Unnamed: 0,country,Article Count Per Million People
0,Monaco,inf
1,Tuvalu,inf
2,Antigua and Barbuda,330.0
3,Federated States of Micronesia,140.0
4,Marshall Islands,130.0
5,Tonga,100.0
6,Barbados,83.333333
7,Montenegro,60.0
8,Seychelles,60.0
9,Bhutan,55.0


Result 2 : The bottom 10 countries with the lowest total articles per capita

In [24]:
# Bottom 10 countries by coverage: The 10 countries with the lowest total articles per capita

country_wise_article_coverage.sort_values(by='Article Count Per Million People',ascending=True).reset_index().drop(['article_count',
                                                                                                       'Population',
                                                                                                       'region','index'],axis=1).head(10)

Unnamed: 0,country,Article Count Per Million People
0,China,0.011337
1,India,0.105698
2,Ghana,0.117302
3,Saudi Arabia,0.135501
4,Zambia,0.148515
5,Norway,0.181818
6,Israel,0.204082
7,Egypt,0.304183
8,Cote d'Ivoire,0.323625
9,Ethiopia,0.347826


Analysis 4.2 (a) : Country-wise high quality articles per capita

In [25]:
# Only considering 'FA' and 'GA' as high quality article indicators

high_quality_indicators = ['FA','GA']
filtered_df = merged_df[merged_df['article_quality'].isin(high_quality_indicators)]
filtered_df.shape

(304, 6)

In [26]:
country_wise_hq_articles = filtered_df.groupby('country').agg(article_count = ('country', 'size'),
                                                                       Population = ('Population', 'mean'),
                                                                       region = ('region', 'first')).reset_index()

# Per capita metrics 
country_wise_hq_articles["Article Count Per Million People"] = country_wise_hq_articles['article_count'] / country_wise_hq_articles['Population']

Result 3 : The 10 countries with the highest high quality articles per capita

In [27]:
country_wise_hq_articles.sort_values(by='Article Count Per Million People',ascending=False).reset_index().drop(['article_count',
                                                                                                       'Population',
                                                                                                       'region','index'],axis=1).head(10)

Unnamed: 0,country,Article Count Per Million People
0,Montenegro,5.0
1,Luxembourg,2.857143
2,Albania,2.592593
3,Kosovo,2.352941
4,Maldives,1.666667
5,Lithuania,1.37931
6,Croatia,1.315789
7,Guyana,1.25
8,Palestinian Territory,1.090909
9,Slovenia,0.952381


Result 4: The 10 countries with the lowest high quality articles per capita

In [28]:
# Note: This table lists countries that have ATLEAST one high quality article available for them.

country_wise_hq_articles.sort_values(by='Article Count Per Million People').reset_index().drop(['article_count',
                                                                                                       'Population',
                                                                                                       'region','index'],axis=1).head(10)

Unnamed: 0,country,Article Count Per Million People
0,Bangladesh,0.005764
1,Egypt,0.009506
2,Ethiopia,0.01581
3,Japan,0.016064
4,Pakistan,0.016632
5,Colombia,0.019157
6,Congo DR,0.01955
7,Vietnam,0.020222
8,Uganda,0.020576
9,Algeria,0.021368


In [29]:
# This table lists countries with no high quality article available for them.

countries_in_df1 = set(merged_df['country'])
countries_in_df2 = set(filtered_df['country'])

# Get the countries that are in df1 but not in df2
countries_not_in_df2 = countries_in_df1 - countries_in_df2
display(countries_not_in_df2)

print(f"There are {len(countries_not_in_df2)} countries with no high quality articles associated with them.")

{'Antigua and Barbuda',
 'Bahamas',
 'Barbados',
 'Belize',
 'Benin',
 'Bhutan',
 'Botswana',
 'Cape Verde',
 'Chad',
 'China',
 'Comoros',
 'Congo',
 "Cote d'Ivoire",
 'Cyprus',
 'Djibouti',
 'Ecuador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Federated States of Micronesia',
 'Gambia',
 'Grenada',
 'Haiti',
 'Honduras',
 'India',
 'Israel',
 'Kuwait',
 'Laos',
 'Lesotho',
 'Liberia',
 'Malawi',
 'Malaysia',
 'Malta',
 'Marshall Islands',
 'Monaco',
 'Mozambique',
 'Namibia',
 'Nicaragua',
 'Niger',
 'Norway',
 'Oman',
 'Paraguay',
 'Qatar',
 'Samoa',
 'Senegal',
 'Seychelles',
 'Sierra Leone',
 'Singapore',
 'Solomon Islands',
 'St. Kitts and Nevis',
 'St. Lucia',
 'St. Vincent and the Grenadines',
 'Taiwan',
 'Tanzania',
 'Timor Leste',
 'Togo',
 'Tonga',
 'Turkey',
 'Turkmenistan',
 'Tuvalu',
 'Uzbekistan',
 'Vanuatu',
 'Yemen',
 'Zambia',
 'Zimbabwe'}

There are 65 countries with no high quality articles associated with them.


Analysis 4.1 (b) : Region-wise total articles per capita

In [30]:
region_wise_article_coverage = merged_df.groupby('region').agg(article_count = ('region', 'size')).reset_index()

# merge to find accurate population counts
pop_df.columns = ["region", "Population"]
region_wise_article_coverage = pd.merge(region_wise_article_coverage,
                                      pop_df,
                                      how = "left",
                                      on = "region")

# Per capita metrics
region_wise_article_coverage['Article Count Per Million People'] = region_wise_article_coverage['article_count'] / region_wise_article_coverage['Population']
region_wise_article_coverage.head(3)

Unnamed: 0,region,article_count,Population,Article Count Per Million People
0,CARIBBEAN,219,44.0,4.977273
1,CENTRAL AMERICA,188,182.0,1.032967
2,CENTRAL ASIA,106,80.0,1.325


Result 5: A rank ordered list of geographic regions (in descending order) by total articles per capita.

In [31]:
# Geographic regions by total coverage: A rank ordered list of geographic regions (in descending order) by total articles per capita.

region_wise_article_coverage.sort_values(by='Article Count Per Million People',ascending=False).reset_index().drop(['article_count',
                                                                                                       'Population',
                                                                                                       'index'],axis=1)

Unnamed: 0,region,Article Count Per Million People
0,SOUTHERN EUROPE,5.243421
1,CARIBBEAN,4.977273
2,WESTERN EUROPE,2.502513
3,EASTERN EUROPE,2.487719
4,WESTERN ASIA,2.040134
5,NORTHERN EUROPE,1.768519
6,SOUTHERN AFRICA,1.757143
7,OCEANIA,1.6
8,EASTERN AFRICA,1.376812
9,SOUTH AMERICA,1.335681


Analysis 4.2 (b): Region-wise high quality articles per capita

In [32]:
region_wise_hq_articles = filtered_df.groupby('region').agg(article_count = ('region', 'size')).reset_index()

# merge to find accurate population counts
pop_df.columns = ["region", "Population"]
region_wise_hq_articles = pd.merge(region_wise_hq_articles,
                                      pop_df,
                                      how = "left",
                                      on = "region")

# Per capita metrics
region_wise_hq_articles['Article Count Per Million People'] = region_wise_hq_articles['article_count'] / region_wise_hq_articles['Population']
region_wise_hq_articles.head(3)

Unnamed: 0,region,article_count,Population,Article Count Per Million People
0,CARIBBEAN,9,44.0,0.204545
1,CENTRAL AMERICA,10,182.0,0.054945
2,CENTRAL ASIA,5,80.0,0.0625


In [33]:
# Geographic regions by total coverage: A rank ordered list of geographic regions (in descending order) by total articles per capita.

region_wise_hq_articles.sort_values(by='Article Count Per Million People',ascending=False).reset_index().drop(['article_count',
                                                                                                       'Population',
                                                                                                       'index'],axis=1)

Unnamed: 0,region,Article Count Per Million People
0,SOUTHERN EUROPE,0.348684
1,CARIBBEAN,0.204545
2,EASTERN EUROPE,0.133333
3,SOUTHERN AFRICA,0.114286
4,WESTERN EUROPE,0.105528
5,WESTERN ASIA,0.090301
6,NORTHERN EUROPE,0.083333
7,NORTHERN AFRICA,0.066406
8,CENTRAL ASIA,0.0625
9,CENTRAL AMERICA,0.054945
