<a href="https://colab.research.google.com/github/niafthomas/niafthomas.github.io/blob/main/gini_smAPIscraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# data manipulation
import numpy as np
import pandas as pd

In [2]:
# World bank data package
!pip install world-bank-data
import world_bank_data as wb

Collecting world-bank-data
  Downloading world_bank_data-0.1.3.tar.gz (12 kB)
Building wheels for collected packages: world-bank-data
  Building wheel for world-bank-data (setup.py) ... [?25l[?25hdone
  Created wheel for world-bank-data: filename=world_bank_data-0.1.3-py3-none-any.whl size=11112 sha256=dec87b2bad999994b10bb7a4dcfea61bdd51a831c8538d012a6d4ba1640f9baa
  Stored in directory: /root/.cache/pip/wheels/95/74/5e/c32dde16dc1ef8d8e9cf134ac93ae723ffec4f60be9c4873f5
Successfully built world-bank-data
Installing collected packages: world-bank-data
Successfully installed world-bank-data-0.1.3


Fetching Gini coefficient data from the World Bank API

In [3]:
# Download the World Bank series 'SI.POV.GINI' and hold it as a one-column DataFrame.
gini = wb.get_series('SI.POV.GINI').to_frame()

In [4]:
# Preview the raw frame: indexed by (Country, Series, Year) with a single
# SI.POV.GINI column, which is NaN for years without an estimate.
gini

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SI.POV.GINI
Country,Series,Year,Unnamed: 3_level_1
Africa Eastern and Southern,Gini index (World Bank estimate),1960,
Africa Eastern and Southern,Gini index (World Bank estimate),1961,
Africa Eastern and Southern,Gini index (World Bank estimate),1962,
Africa Eastern and Southern,Gini index (World Bank estimate),1963,
Africa Eastern and Southern,Gini index (World Bank estimate),1964,
...,...,...,...
Zimbabwe,Gini index (World Bank estimate),2016,
Zimbabwe,Gini index (World Bank estimate),2017,44.3
Zimbabwe,Gini index (World Bank estimate),2018,
Zimbabwe,Gini index (World Bank estimate),2019,50.3


In [5]:
# Write the raw series to disk. The next cell reads this file back with
# pd.read_csv, which is how the index levels end up as regular columns.
gini.to_csv('gini.csv')

In [6]:
# Reload the series from the CSV written earlier; on re-read the former index
# levels (Country, Series, Year) come back as ordinary columns. The constant
# 'Series' label column is not needed and is dropped.
gini = pd.read_csv('gini.csv').drop(columns='Series')
gini

Unnamed: 0,Country,Year,SI.POV.GINI
0,Africa Eastern and Southern,1960,
1,Africa Eastern and Southern,1961,
2,Africa Eastern and Southern,1962,
3,Africa Eastern and Southern,1963,
4,Africa Eastern and Southern,1964,
...,...,...,...
16221,Zimbabwe,2016,
16222,Zimbabwe,2017,44.3
16223,Zimbabwe,2018,
16224,Zimbabwe,2019,50.3


In [7]:
# Keep only observations from 2010 onwards (2010 itself included), then drop
# every row that still contains a missing value.
# NOTE: the filter previously read `> 2010`, which excluded 2010 and so
# contradicted the stated "from 2010 onwards" intent.
gini = gini[gini.Year >= 2010]

gini = gini.dropna()

In [8]:
# Inspect the filtered frame: recent years only, rows with missing values removed.
gini

Unnamed: 0,Country,Year,SI.POV.GINI
3102,Albania,2012,29.0
3104,Albania,2014,34.6
3105,Albania,2015,32.9
3106,Albania,2016,33.7
3107,Albania,2017,33.2
...,...,...,...
16097,"Yemen, Rep.",2014,36.7
16159,Zambia,2015,57.1
16216,Zimbabwe,2011,43.2
16222,Zimbabwe,2017,44.3


In [9]:
# Keep a single row per country: order the rows by country and year, then
# retain the last (i.e. most recent) row for each country.
gini = (
    gini
    .sort_values(by=['Country', 'Year'])
    .drop_duplicates(subset=['Country'], keep='last')
)

In [10]:
# Use the country name as the row index.
gini = gini.set_index(keys='Country')

In [11]:
# Inspect: one row per country, indexed by Country, with Year and SI.POV.GINI columns.
gini

Unnamed: 0_level_0,Year,SI.POV.GINI
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Albania,2017,33.2
Algeria,2011,27.6
Angola,2018,51.3
Argentina,2019,42.9
Armenia,2019,29.9
...,...,...
Vietnam,2018,35.7
West Bank and Gaza,2016,33.7
"Yemen, Rep.",2014,36.7
Zambia,2015,57.1


In [12]:
# Rename columns by name rather than by position, so labels cannot be silently
# mis-assigned if the column order ever changes.
# 'Year' becomes the placeholder 'a', which the following cell drops.
gini = gini.rename(columns={"Year": "a", "SI.POV.GINI": "Gini Coefficient"})

In [13]:
# Discard the placeholder column 'a'; the Gini coefficient column is kept.
gini = gini.drop(columns="a")

In [14]:
# Inspect the final Gini table: Country index with a single 'Gini Coefficient' column.
gini

Unnamed: 0_level_0,Gini Coefficient
Country,Unnamed: 1_level_1
Albania,33.2
Algeria,27.6
Angola,51.3
Argentina,42.9
Armenia,29.9
...,...
Vietnam,35.7
West Bank and Gaza,33.7
"Yemen, Rep.",36.7
Zambia,57.1


In [15]:
# Colab-only helper module; its download() call is used below.
from google.colab import files

# Save the cleaned Gini table to CSV, then hand the file to files.download.
gini.to_csv('gini_new.csv')
files.download("gini_new.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Scraping the Global Social Mobility Index (World Economic Forum) from Wikipedia

In [16]:
# Parse the HTML tables found on the Wikipedia page for the Global Social
# Mobility Index. The result is indexed positionally below (tables[0]).
tables = pd.read_html(io='https://en.wikipedia.org/wiki/Global_Social_Mobility_Index')

In [17]:
# First parsed table: the ranking with Rank, Country and Index Score columns.
tables[0]

Unnamed: 0,Rank,Country,Index Score
0,1,Denmark,85.2
1,2,Norway,83.6
2,3,Finland,83.6
3,4,Sweden,83.5
4,5,Iceland,82.7
...,...,...,...
77,78,Bangladesh,40.2
78,79,Pakistan,36.7
79,80,Cameroon,36.0
80,81,Senegal,36.0


In [18]:
# Bind the first parsed table (the ranking) to `sm` and display it.
sm=tables[0]
sm 

Unnamed: 0,Rank,Country,Index Score
0,1,Denmark,85.2
1,2,Norway,83.6
2,3,Finland,83.6
3,4,Sweden,83.5
4,5,Iceland,82.7
...,...,...,...
77,78,Bangladesh,40.2
78,79,Pakistan,36.7
79,80,Cameroon,36.0
80,81,Senegal,36.0


In [19]:
# Build `sm` from the parsed table rather than from `sm` itself, so the cell
# can be re-run without a KeyError on the already-removed 'Rank' column.
# Result: indexed by Country, with the rank column removed.
sm = tables[0].drop(columns='Rank').set_index('Country')

In [20]:
import os  # retained from the original cell; not used by the code visible here
from google.colab import files

# Save the social-mobility table to CSV, then hand the file to files.download.
# (A stray `os.getcwd()` call whose result was discarded has been removed.)
sm.to_csv('Social_Mobility.csv')
files.download('Social_Mobility.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Merging the datasets

In [21]:
# Outer-join the two country-indexed tables on 'Country', so countries present
# in only one source are kept, with NaN on the missing side.
sm_gini = gini.merge(sm, how='outer', on='Country')

In [22]:
# Inspect the outer-merged table. Rows with NaN are countries found in only one
# of the two sources (e.g. Algeria has no Index Score, Russia has no Gini value).
# NOTE(review): some gaps may be caused by differing country names between the
# World Bank and Wikipedia tables rather than missing data — verify.
sm_gini

Unnamed: 0_level_0,Gini Coefficient,Index Score
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Albania,33.2,55.6
Algeria,27.6,
Angola,51.3,
Argentina,42.9,57.3
Armenia,29.9,53.9
...,...,...
Russia,,64.7
Saudi Arabia,,57.1
Egypt,,44.8
Laos,,43.8


In [23]:
# Keep only countries that have both a Gini coefficient and an Index Score:
# any row containing a missing value is removed.
sm_gini = sm_gini.dropna(axis='index', how='any')
sm_gini

Unnamed: 0_level_0,Gini Coefficient,Index Score
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Albania,33.2,55.6
Argentina,42.9,57.3
Armenia,29.9,53.9
Australia,34.4,75.1
Austria,30.8,80.1
...,...,...
Ukraine,26.6,61.2
United Kingdom,35.1,74.4
United States,41.4,70.4
Uruguay,39.7,67.1


In [24]:
# Save the merged table (countries with both measures) to CSV.
sm_gini.to_csv("sm_gini.csv")

In [25]:
from google.colab import files

In [26]:
# Hand the merged CSV to Colab's files.download (imported in the previous cell).
files.download("sm_gini.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>