In [254]:
import pandas as pd
import numpy as np
import os
import json
import requests
import pickle

First we will read in the page data and the wpds data from csv files

In [255]:
filename_pagedata = "./page_data.csv"
filename_wpds = "./WPDS_2020_data.csv"
df_pagedata = pd.read_csv(filename_pagedata)
df_wpds =pd.read_csv(filename_wpds)


Next we will filter the data so that the page data does not contain and "template" articles, and the wdps only data contains data from a country graunulartiy

In [256]:
df_pagedata_filtered = df_pagedata[~df_pagedata['page'].str.startswith('Template')]
df_wpds_cumulative = df_wpds[df_wpds['Name'].str.isupper()]
df_wpds_filtered = df_wpds[~df_wpds['Name'].str.isupper()]

Here we are defining the constants for the api call

In [257]:
pagedata_endpoint = "https://ores.wikimedia.org/v3/scores/{context}/{revid}/{model}"
headers = {
    'User-Agent': 'https://github.com/mayapatward',
    'From': 'mp97@uw.edu'
}

pagedata_params = {"context" : "enwiki",
                    "revid" : "1234",
                    "model" : "articlequality",
                        }

Next we will define the api call. Since we are individually calling the api, this program runs very slow. So in order to avoid running this call more than once, I saved the results into a pickle file, and then loaded them into a pandas frame called df_pagedata_filtered

In [258]:
def api_call(endpoint,revid_num, revid_missing_list, page):
    call = requests.get(endpoint.format(context = "enwiki", revid = revid_num, model = "articlequality"), headers=headers)
    try:
        response = call.json()
        return response['enwiki']['scores'][str(revid_num)]['articlequality']['score']['prediction'] 
    except: 
        revid_missing_list.append((revid_num, page))
        return None
    
    

In [259]:
# revid_missing_list = []
# df_pagedata_filtered["prediction"] = df_pagedata_filtered.apply(lambda row: api_call(pagedata_endpoint,row['rev_id'], revid_missing_list, row['page']), axis =1)
# file='revid_missing.txt' 
# with open(file, 'w') as filetowrite:
#     for item in revid_missing_list:  
#         filetowrite.write(f"{str(item[0])}, {item[1]}\n")
# df_pagedata_filtered.to_pickle("./df_pagedata_filtered")


In [234]:
df_pagedata_filtered =pd.read_pickle('../df_pagedata_filtered')

Here we we are combining the rows and removing any data that do not have matching rows. This data is stored in "wp_wpds_countries-no_match.csv"

In [235]:
df_merged = pd.merge(df_pagedata_filtered, df_wpds_filtered, how='outer',
                left_on='country', right_on='Name',
                indicator=True)

df_merged_nomatch = df_merged.query('_merge != "both"')
df_merged_match = df_merged.query('_merge == "both"')

In [236]:
file='wp_wpds_countries-no_match.csv' 
df_merged_nomatch.to_csv(file)


Next we are processing the merged dataframe to have the correct column names and saving this file to wp_wpds_politicians_by_country

In [237]:
df_merged_match = df_merged_match[['country', 'page', 'rev_id', 'prediction', 'Population']]
df_merged_match.rename(columns={"page": "article_name", "rev_id": "revision_id", "prediction":"article_quality_est",\
                                "Population":"population"}, inplace = True)
file='wp_wpds_politicians_by_country.csv' 
df_merged_match.to_csv(file)

Finally, we will begin to aggregate the data to find the coverage estimates

In [241]:
df_coverage = df_merged_match.groupby('country').agg({'country':'size', 'population':'mean'})

In [242]:
df_coverage['percent_coverage'] = df_coverage['country']/df_coverage['population']

Top 10 countries by coverage: 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [243]:
df_coverage.sort_values('percent_coverage', ascending = False)[:10]

Unnamed: 0_level_0,country,population,percent_coverage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tuvalu,54,10000.0,0.0054
Nauru,52,11000.0,0.004727
San Marino,81,34000.0,0.002382
Monaco,40,38000.0,0.001053
Liechtenstein,28,39000.0,0.000718
Marshall Islands,37,57000.0,0.000649
Tonga,63,99000.0,0.000636
Iceland,202,368000.0,0.000549
Andorra,34,82000.0,0.000415
Federated States of Micronesia,36,106000.0,0.00034


Bottom 10 countries by coverage: 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [244]:
df_coverage.sort_values('percent_coverage', ascending = True)[:10]

Unnamed: 0_level_0,country,population,percent_coverage
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
India,985,1400100000.0,7.035212e-07
Indonesia,211,271739000.0,7.764804e-07
China,1133,1402385000.0,8.079094e-07
Uzbekistan,28,34174000.0,8.193363e-07
Ethiopia,101,114916000.0,8.789029e-07
Zambia,25,18384000.0,1.359878e-06
"Korea, North",36,25779000.0,1.396486e-06
Thailand,112,66534000.0,1.68335e-06
Mozambique,58,31166000.0,1.861002e-06
Bangladesh,321,169809000.0,1.890359e-06


Next, we will begin to aggregate the data to find the quality estimates

In [245]:
df_quality = df_merged_match.copy()

In [246]:
df_quality['article_quality_est_count'] = df_quality['article_quality_est'].apply(lambda x: 1 if (x=='GA' or x=='FA') \
                                                                                        else 0)

In [247]:
df_quality = df_quality.groupby('country').agg({'country':'size', 'article_quality_est_count':'sum'})

In [248]:
df_quality['percent_quality'] = df_quality['article_quality_est_count']/df_quality['country']

Top 10 countries by relative quality: 10 highest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality

In [249]:
df_quality.sort_values('percent_quality', ascending = False)[:10]

Unnamed: 0_level_0,country,article_quality_est_count,percent_quality
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Korea, North",36,8,0.222222
Saudi Arabia,118,15,0.127119
Romania,343,42,0.122449
Central African Republic,66,8,0.121212
Uzbekistan,28,3,0.107143
Mauritania,48,5,0.104167
Guatemala,83,7,0.084337
Dominica,12,1,0.083333
Syria,129,10,0.077519
Benin,91,7,0.076923


Bottom 10 countries by relative quality: 10 lowest-ranked countries in terms of the relative proportion of politician articles that are of GA and FA-quality


In [250]:
df_quality.sort_values('percent_quality', ascending = True)[:10]

Unnamed: 0_level_0,country,article_quality_est_count,percent_quality
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Guadeloupe,49,0,0.0
Tonga,63,0,0.0
Solomon Islands,97,0,0.0
San Marino,81,0,0.0
Guyana,20,0,0.0
Tunisia,138,0,0.0
Sao Tome and Principe,21,0,0.0
Grenada,36,0,0.0
Kazakhstan,78,0,0.0
Turkmenistan,32,0,0.0


Here, we begin to process the data based on the geographical region. In order to do that, we need to seperate out the continent level data with the sub-region level data. Next we will use their original index to slice the WPDS data and assign the continent/subregion name to the row as a new column

In [251]:
continents = ['AFRICA', 'NORTH AMERICA', 'LATIN AMERICA AND THE CARIBBEAN', 'ASIA','EUROPE', 'OCEANIA']
curr_cont = 'AFRICA'
curr_region = ''
df_wpds["Continent"] = ""
df_wpds["Region"] = ""

df_wpds_new = pd.DataFrame(columns=df_wpds.columns)
for index, row in df_wpds.iterrows():
    
    if index < 2:
        df_wpds_new = df_wpds_new.append(row)
        continue
    elif row['Type'] == 'Sub-Region' and row['Name'] not in continents:
        curr_region = row['Name']        
    elif row['Type'] == 'Sub-Region' and row['Name'] in continents:
        curr_cont = row['Name']
        curr_region = ''
    row['Continent'] = curr_cont
    row['Region'] = curr_region
    df_wpds_new = df_wpds_new.append(row)
        

Here, we are processing the data for the coverage analysis

In [252]:
df_agg = pd.merge(df_pagedata_filtered, df_wpds_new, left_on = 'country', right_on = 'Name', how = 'inner')
cont_agg_5 = pd.merge(df_agg, df_wpds_cumulative[['Name', 'Population']], left_on = 'Continent', right_on ='Name', how = 'left', suffixes=('', '_continent'))
cont_agg_5 = cont_agg_5.groupby('Continent').agg({'country': 'size', 'Population_continent':'mean'})
reg_agg_5 = pd.merge(df_agg, df_wpds_cumulative[['Name', 'Population']], left_on = 'Region', right_on ='Name', how = 'left', suffixes=('', '_region'))
reg_agg_5 = reg_agg_5.groupby('Region').agg({'country': 'size', 'Population_region':'mean'})
cont_agg_5['percent_coverage'] = cont_agg_5['country']/cont_agg_5['Population_continent']
reg_agg_5['percent_coverage'] = reg_agg_5['country']/reg_agg_5['Population_region']
cont_agg_5 = cont_agg_5.reset_index()
reg_agg_5 = reg_agg_5.reset_index()
cont_agg_5 = cont_agg_5.rename(columns = {'Continent': 'Region', 'Population_continent': 'Population_region'})
df_coverage = pd.concat([reg_agg_5, cont_agg_5], ignore_index=True, sort=False)[1:]
df_coverage.sort_values('percent_coverage', ascending = False)[:10]


Unnamed: 0,Region,country,Population_region,percent_coverage
23,OCEANIA,3132,43155000.0,7.3e-05
15,SOUTHERN EUROPE,3729,153251000.0,2.4e-05
18,WESTERN EUROPE,4577,195479000.0,2.3e-05
21,EUROPE,15858,746622000.0,2.1e-05
1,CARIBBEAN,697,43233000.0,1.6e-05
7,EASTERN EUROPE,3771,291902000.0,1.3e-05
14,SOUTHERN AFRICA,635,67732000.0,9e-06
17,WESTERN ASIA,2580,280927000.0,9e-06
2,CENTRAL AMERICA,1545,178611000.0,9e-06
22,LATIN AMERICA AND THE CARIBBEAN,5284,651036000.0,8e-06


Finally we are processing the data for the quality analysis

In [253]:
df_agg_6 =df_agg.copy()
df_agg_6['article_quality_est_count'] = df_agg_6['prediction'].apply(lambda x: 1 if (x=='GA' or x=='FA') \
                                                                                        else 0)

df_agg_6_cont = df_agg_6.groupby('Continent').agg({'Continent':'size', 'article_quality_est_count':'sum'})
df_agg_6_reg = df_agg_6.groupby('Region').agg({'Region':'size', 'article_quality_est_count':'sum'})

df_agg_6_cont = df_agg_6_cont.rename(columns = {'Continent': 'count'}).reset_index()
df_agg_6_cont = df_agg_6_cont.rename(columns = {'Continent': 'Region'})
df_agg_6_reg = df_agg_6_reg.rename(columns = {'Region': 'count'}).reset_index()[1:]
df_quality = pd.concat([df_agg_6_reg, df_agg_6_cont], ignore_index=True, sort=False)
df_quality['percent_quality'] = df_quality['article_quality_est_count']/df_quality['count']
df_quality.sort_values('percent_quality', ascending = False)[:10]

Unnamed: 0,Region,count,article_quality_est_count,percent_quality
9,NORTHERN AMERICA,1940,104,0.053608
12,SOUTHEAST ASIA,2034,73,0.03589
16,WESTERN ASIA,2580,89,0.034496
6,EASTERN EUROPE,3771,118,0.031291
4,EAST ASIA,2477,76,0.030682
2,CENTRAL ASIA,247,7,0.02834
3,Channel Islands,3781,102,0.026977
19,ASIA,11767,316,0.026855
18,AFRICA,8801,223,0.025338
7,MIDDLE AFRICA,669,16,0.023916
