## Data 512 A Assignment 2 - Bias in Data

### Import necessary libraries

In [66]:
%matplotlib inline
import requests
import json
import pandas as pd
import numpy as np

### Define a utility function to pre-process the data and get prediction scores from the endpoint

In [48]:
def get_ores_data(revision_ids):
    """ Function to split revision IDs into smaller chunks before getting the quality scores using ORES API endpoint
    """
    # Define the endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/enwiki/?models=wp10&revids={}'
    
    output_revision_ids = []
    predictions = []
    
    chunk_size = 100
    
    L = len(revision_ids)
    for  i in range(0, L, chunk_size):
        try:
            chunk_rev_id = revision_ids[i:i + chunk_size]
        except:
            chunk_rev_id = revision_ids[i:]
            
        formatted_url = endpoint.format("|".join(map(str, chunk_rev_id)))
        
        api_call = requests.get(formatted_url)
        response = api_call.json()
        prob_dict =  response['enwiki']['scores']
    
    
        for rev_id, result in prob_dict.items():
            output_revision_ids.append(rev_id)
            predictions.append(result['wp10'].get('score', {}).get('prediction', 'N/A'))
            
        print("processed rev IDs", i + chunk_size)
        
    return_df = pd.DataFrame({'revision_id' : output_revision_ids, 'predictions' : predictions})
    
    return return_df

## Read the Wikipedia Dataset and the Population Data set into Dataframes

In [4]:
population = pd.read_csv('WPDS_2018_data.csv') 
wiki_data = pd.read_csv('page_data.csv')

In [5]:
# inspect population data
population.head()

Unnamed: 0,Geography,Population mid-2018 (millions)
0,AFRICA,1284.0
1,Algeria,42.7
2,Egypt,97.0
3,Libya,6.5
4,Morocco,35.2


In [7]:
# inspect wiki data
wiki_data.head()

Unnamed: 0,page,country,rev_id
0,Template:ZambiaProvincialMinisters,Zambia,235107991
1,Bir I of Kanem,Chad,355319463
2,Template:Zimbabwe-politician-stub,Zimbabwe,391862046
3,Template:Uganda-politician-stub,Uganda,391862070
4,Template:Namibia-politician-stub,Namibia,391862409


## Join the population Dataframe with the wiki data

In [9]:
combined_df = pd.merge(population, wiki_data, how='inner', right_on='country', left_on='Geography')
combined_df.head()

Unnamed: 0,Geography,Population mid-2018 (millions),page,country,rev_id
0,Algeria,42.7,Template:Algeria-politician-stub,Algeria,544347736
1,Algeria,42.7,Template:Algeria-diplomat-stub,Algeria,567620838
2,Algeria,42.7,Template:AlgerianPres,Algeria,665948270
3,Algeria,42.7,Ali Fawzi Rebaine,Algeria,686269631
4,Algeria,42.7,Ahmed Attaf,Algeria,705910185


## Use the ORES API to get the quality Scores for the Wikipedia Articles

In [47]:
quality_scores = get_ores_data(combined_df['rev_id'])

processed rev IDs 100
processed rev IDs 200
processed rev IDs 300
processed rev IDs 400
processed rev IDs 500
processed rev IDs 600
processed rev IDs 700
processed rev IDs 800
processed rev IDs 900
processed rev IDs 1000
processed rev IDs 1100
processed rev IDs 1200
processed rev IDs 1300
processed rev IDs 1400
processed rev IDs 1500
processed rev IDs 1600
processed rev IDs 1700
processed rev IDs 1800
processed rev IDs 1900
processed rev IDs 2000
processed rev IDs 2100
processed rev IDs 2200
processed rev IDs 2300
processed rev IDs 2400
processed rev IDs 2500
processed rev IDs 2600
processed rev IDs 2700
processed rev IDs 2800
processed rev IDs 2900
processed rev IDs 3000
processed rev IDs 3100
processed rev IDs 3200
processed rev IDs 3300
processed rev IDs 3400
processed rev IDs 3500
processed rev IDs 3600
processed rev IDs 3700
processed rev IDs 3800
processed rev IDs 3900
processed rev IDs 4000
processed rev IDs 4100
processed rev IDs 4200
processed rev IDs 4300
processed rev IDs 44

processed rev IDs 34700
processed rev IDs 34800
processed rev IDs 34900
processed rev IDs 35000
processed rev IDs 35100
processed rev IDs 35200
processed rev IDs 35300
processed rev IDs 35400
processed rev IDs 35500
processed rev IDs 35600
processed rev IDs 35700
processed rev IDs 35800
processed rev IDs 35900
processed rev IDs 36000
processed rev IDs 36100
processed rev IDs 36200
processed rev IDs 36300
processed rev IDs 36400
processed rev IDs 36500
processed rev IDs 36600
processed rev IDs 36700
processed rev IDs 36800
processed rev IDs 36900
processed rev IDs 37000
processed rev IDs 37100
processed rev IDs 37200
processed rev IDs 37300
processed rev IDs 37400
processed rev IDs 37500
processed rev IDs 37600
processed rev IDs 37700
processed rev IDs 37800
processed rev IDs 37900
processed rev IDs 38000
processed rev IDs 38100
processed rev IDs 38200
processed rev IDs 38300
processed rev IDs 38400
processed rev IDs 38500
processed rev IDs 38600
processed rev IDs 38700
processed rev ID

In [50]:
# Inspect the quality score Data Frame
quality_scores.head()

Unnamed: 0,revision_id,predictions
0,544347736,Stub
1,567620838,Stub
2,665948270,Stub
3,686269631,Stub
4,705910185,Stub


In [53]:
# Convert rev_id column in combined_df to type int64
combined_df['rev_id'] = combined_df['rev_id'].astype(int)

In [58]:
# Convert revision_id column in the quality scores df to type int64
quality_scores['revision_id'] = quality_scores['revision_id'].astype(int)

## Join the quality scores Data Frame with the combined Data Frame

In [59]:
combined_quality_df = pd.merge(combined_df, quality_scores, how='inner', left_on='rev_id', right_on='revision_id')

In [60]:
## Inspect combined_quality_df
combined_quality_df.head()

Unnamed: 0,Geography,Population mid-2018 (millions),page,country,rev_id,revision_id,predictions
0,Algeria,42.7,Template:Algeria-politician-stub,Algeria,544347736,544347736,Stub
1,Algeria,42.7,Template:Algeria-diplomat-stub,Algeria,567620838,567620838,Stub
2,Algeria,42.7,Template:AlgerianPres,Algeria,665948270,665948270,Stub
3,Algeria,42.7,Ali Fawzi Rebaine,Algeria,686269631,686269631,Stub
4,Algeria,42.7,Ahmed Attaf,Algeria,705910185,705910185,Stub


## Persist the results in a CSV file

In [62]:
persist_df = pd.DataFrame()

persist_df['country'] = combined_quality_df['country']
persist_df['article_name'] = combined_quality_df['page']
persist_df['revision_id'] = combined_quality_df['rev_id']
persist_df['article_quality'] = combined_quality_df['predictions']
persist_df['population'] = combined_quality_df['Population mid-2018 (millions)']

# Inspect persist_df before writing to disk
persist_df.head()

Unnamed: 0,country,article_name,revision_id,article_quality,population
0,Algeria,Template:Algeria-politician-stub,544347736,Stub,42.7
1,Algeria,Template:Algeria-diplomat-stub,567620838,Stub,42.7
2,Algeria,Template:AlgerianPres,665948270,Stub,42.7
3,Algeria,Ali Fawzi Rebaine,686269631,Stub,42.7
4,Algeria,Ahmed Attaf,705910185,Stub,42.7


In [63]:
# Write the persist_df to a csv file
persist_df.to_csv('final_data.csv')

## Read the Persisted CSV file for Analysis

In [169]:
persist_df = pd.read_csv('final_data.csv')

In [170]:
# Convert population to a floating point value
persist_df['population'] = persist_df['population'].apply(lambda x : float(x.replace(',', '')))

In [171]:
# Compute the Aggregated columns
table_1 = persist_df.groupby('country').agg({'revision_id' : lambda x : len(x), 'population' : np.mean})

In [172]:
# Compute the proportion of articles wrt the population of the country
table_1['proportion_%'] = table_1.apply(lambda x : x['revision_id'] / x['population'] / (10 ** 6) * 100, axis='columns')
table_1.rename(columns={'revision_id' : 'Article Counts', 'population' : 'Population in Millions'}, inplace=True)

## 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

In [173]:
table_1_top_10 = table_1.sort_values('proportion_%', ascending=False)[:10]
table_1_top_10.head(10)

Unnamed: 0_level_0,Article Counts,Population in Millions,proportion_%
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tuvalu,55,0.01,0.55
Nauru,53,0.01,0.53
San Marino,82,0.03,0.273333
Monaco,40,0.04,0.1
Liechtenstein,29,0.04,0.0725
Tonga,63,0.1,0.063
Marshall Islands,37,0.06,0.061667
Iceland,206,0.4,0.0515
Andorra,34,0.08,0.0425
Federated States of Micronesia,38,0.1,0.038


### Reflection
* Most of the countries in this list tend to have relatively low population and as a result tend to have higher proportion of articles wrt the population.
* This analysis can be biased, because of the fact that certain countries with low population may have a high proportion of people who are illiterate and as a result have very few articles written in general. 
* It is also possible that the proportion is abnormally high for a country with low population because of a select few people writing a lot of political articles. 

## 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

In [174]:
table_1_bottom_10 = table_1.sort_values('proportion_%', ascending=True)[:10]
table_1_bottom_10.head(10)

Unnamed: 0_level_0,Article Counts,Population in Millions,proportion_%
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
India,990,1371.3,7.2e-05
Indonesia,215,265.2,8.1e-05
China,1138,1393.8,8.2e-05
Uzbekistan,29,32.9,8.8e-05
Ethiopia,105,107.5,9.8e-05
Zambia,26,17.7,0.000147
"Korea, North",39,25.6,0.000152
Thailand,112,66.2,0.000169
Bangladesh,324,166.4,0.000195
Mozambique,60,30.5,0.000197


### Reflection
* This analysis can also be biased because of the fact that some countries like India and China have a relatively high population and as a result have lower proportion value
* Another source of bias could be the variablity of literacy rate among countries

## Consider High quality articles only 

In [175]:
persist_df['high_quality'] = persist_df['article_quality'].apply(lambda x : 1 if x in ('GA', 'FA') else 0)
persist_df.head()

Unnamed: 0.1,Unnamed: 0,country,article_name,revision_id,article_quality,population,high_quality
0,0,Algeria,Template:Algeria-politician-stub,544347736,Stub,42.7,0
1,1,Algeria,Template:Algeria-diplomat-stub,567620838,Stub,42.7,0
2,2,Algeria,Template:AlgerianPres,665948270,Stub,42.7,0
3,3,Algeria,Ali Fawzi Rebaine,686269631,Stub,42.7,0
4,4,Algeria,Ahmed Attaf,705910185,Stub,42.7,0


In [176]:
# Compute aggregated columns ie. article counts and population

table_2 = persist_df.groupby('country').agg({'revision_id' : lambda x : len(x), 'high_quality' : np.sum})
table_2['proportion_%'] = table_2.apply(lambda x : x['high_quality'] / x['revision_id'] / 10000, 
                                        axis='columns')
table_2.rename(columns={'revision_id' : 'Article Counts', 'population' : 'Population in Millions'}, inplace=True)

### 10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [177]:
table_2_top_10 = table_2.sort_values('proportion_%', ascending=False)[:10]
table_2_top_10.head(10)

Unnamed: 0_level_0,Article Counts,high_quality,proportion_%
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Korea, North",39,7,1.8e-05
Saudi Arabia,119,16,1.3e-05
Central African Republic,68,8,1.2e-05
Romania,348,40,1.1e-05
Mauritania,52,5,1e-05
Bhutan,33,3,9e-06
Tuvalu,55,5,9e-06
Dominica,12,1,8e-06
United States,1098,82,7e-06
Benin,94,7,7e-06


### Reflection
* This analysis could be biased because of the fact that certain countries have higher degree of freedom of speech than others
* If the freedom of speech is restricted in certain countries, fewer people are likely to contribute articles and those that do contribute articles tend to be professional writers that produce higher quality articles

### 10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

In [179]:
# List all the countries with zero high quality articles and sort them in the descending order of the number of articles
table_2_bottom_10 = table_2.sort_values('proportion_%', ascending=True)
table_2_bottom_10 = table_2_bottom_10[table_2_bottom_10['high_quality'] < 0.000001].sort_values('Article Counts', ascending=False)[:10]
table_2_bottom_10.head(10)

Unnamed: 0_level_0,Article Counts,high_quality,proportion_%
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Finland,572,0,0.0
Belgium,523,0,0.0
Moldova,426,0,0.0
Switzerland,407,0,0.0
Nepal,363,0,0.0
Uganda,188,0,0.0
Costa Rica,150,0,0.0
Tunisia,140,0,0.0
Slovakia,119,0,0.0
Angola,110,0,0.0


### Reflection
* Here we see a mix of developed and underdeveloped countries in the list with no high quality articles
* It is possible that people in certain developed and underdeveloped countries don't contribute a lot of political articles
* It's also possible that the data in not comprehensive
* Any conclusions we draw from this analysis will definitely be full of Bias.
