In [1]:
## Dependencies

import pandas as pd
import pathlib as path

import requests
import json
from pprint import pprint

import numpy as np

from io import StringIO

# to read the ZIP file
import requests
import zipfile
import io



# (1) create DF for countries

#### information will be used to merge or generate vizualisations
source : https://api.worldbank.org/v2/country?format=json

In [2]:
## retrieve all the countries from world bank API

### generate DF Countries

page = 1   ## there are 6 pages in the json file
data = []
url_countries = 'https://api.worldbank.org/v2/country?format=json'

for page in range(1,7):
    response = requests.get(f"{url_countries}&page={page}")
    json_data = response.json()
    
    # Add the data from the current page
    data.extend(json_data[1])  
    
## create DF and select columns

countries_wb_df = pd.DataFrame(data)
countries_wb_df['region'] = countries_wb_df['region'].apply(lambda x: x['value'])
countries_wb_df = countries_wb_df[['id', 'iso2Code', 'name', 'region','capitalCity', 'longitude', 'latitude']]
countries_wb_df = countries_wb_df.rename(columns={
                                                    'name':'country',
                                                    'id':'iso3Code'
                                                    })


regions = (countries_wb_df['region'] != "Aggregates")
countries_wb_df = countries_wb_df[regions]


countries_wb_df.head()

Unnamed: 0,iso3Code,iso2Code,country,region,capitalCity,longitude,latitude
0,ABW,AW,Aruba,Latin America & Caribbean,Oranjestad,-70.0167,12.5167
2,AFG,AF,Afghanistan,South Asia,Kabul,69.1761,34.5228
5,AGO,AO,Angola,Sub-Saharan Africa,Luanda,13.242,-8.81155
6,ALB,AL,Albania,Europe & Central Asia,Tirane,19.8172,41.3317
7,AND,AD,Andorra,Europe & Central Asia,Andorra la Vella,1.5218,42.5075


In [10]:
# countries_wb_df = countries_wb_df.set_index("index")

In [7]:
### Export to csv file in output

countries_wb_df.to_csv('../Output/countries_UN_referential.csv', index=False)


# (2) collect immigration data 

##### Data source from IRCC web site
this file is updated on a monthly basis by IRCC

https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-PR-Citz.csv
from
https://ouvert.canada.ca/data/fr/dataset/f7e5498e-0ad8-4417-85c9-9b8aff9b9eda/resource/5f5fa9ca-b621-4dac-91d1-25654a25735c

In [None]:
### import and read csv file from web page

path_imm = 'https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-PR-Citz.csv'
imm_df = pd.read_csv(path_imm, delimiter='\t')

# imm_df.to_csv('../Output/immigration_raw_data.csv')
imm_df.head(5)

Unnamed: 0,EN_YEAR,EN_QUARTER,EN_MONTH,FR_ANNEÉ,FR_TRIMESTRE,FR_MOIS,EN_COUNTRY_OF_CITIZENSHIP,FR_PAYS_DE_CITOYENNETÉ,TOTAL
0,2015,Q1,Feb,2015,T1,fév.,Afghanistan,Afghanistan,125
1,2015,Q1,Feb,2015,T1,fév.,Albania,Albanie,25
2,2015,Q1,Feb,2015,T1,fév.,Algeria,Algérie,125
3,2015,Q1,Feb,2015,T1,fév.,Antigua and Barbuda,Antigua-et-Barbuda,5
4,2015,Q1,Feb,2015,T1,fév.,Argentina,Argentine,15


In [23]:
# drop useless columns

imm_df = imm_df.drop(columns=['FR_ANNEÉ','FR_TRIMESTRE','FR_MOIS','FR_PAYS_DE_CITOYENNETÉ'])
imm_df.head(5)

Unnamed: 0,EN_YEAR,EN_QUARTER,EN_MONTH,EN_COUNTRY_OF_CITIZENSHIP,TOTAL
0,2015,Q1,Feb,Afghanistan,125
1,2015,Q1,Feb,Albania,25
2,2015,Q1,Feb,Algeria,125
3,2015,Q1,Feb,Antigua and Barbuda,5
4,2015,Q1,Feb,Argentina,15


In [24]:
# check empty values with 1 example raw 42

print(imm_df.loc[42,])

EN_YEAR                          2015
EN_QUARTER                         Q1
EN_MONTH                          Feb
EN_COUNTRY_OF_CITIZENSHIP    Dominica
TOTAL                              --
Name: 42, dtype: object


In [25]:
imm_clean_df = imm_df.copy()

In [26]:
# convert Total to numeric

imm_clean_df['TOTAL'] = pd.to_numeric(imm_clean_df['TOTAL'], errors='coerce')

# replace NAN with 0 and change data type

imm_clean_df = imm_clean_df.fillna({'TOTAL': 0})
imm_clean_df = imm_clean_df.astype({'TOTAL': 'int64'})
print(imm_clean_df.loc[42,])

EN_YEAR                          2015
EN_QUARTER                         Q1
EN_MONTH                          Feb
EN_COUNTRY_OF_CITIZENSHIP    Dominica
TOTAL                               0
Name: 42, dtype: object


In [27]:
# check that total is numeric _ integer

imm_clean_df.dtypes

EN_YEAR                       int64
EN_QUARTER                   object
EN_MONTH                     object
EN_COUNTRY_OF_CITIZENSHIP    object
TOTAL                         int64
dtype: object

In [28]:
# rename columns 

imm_clean_df = imm_clean_df.rename(columns={
                                            'EN_YEAR'            :           'year' ,
                                            'EN_QUARTER'         :          'quarter' , 
                                            'EN_MONTH'           :          'month_str' ,
                                            'EN_COUNTRY_OF_CITIZENSHIP' :   'country' ,  
                                            'TOTAL'              :           'immigration_flow'
                                            })
imm_clean_df.dtypes

year                 int64
quarter             object
month_str           object
country             object
immigration_flow     int64
dtype: object

#### 2.1 the steps below are needed to match the name of the countries between this DF and countries_df

example : IRCC name is 'Bahama Islands, The'  
for world bank :   'Bahamas, The' ,
because we will use country as column to merge, we need to harmonize the countries names

1st we identify the countries names that mismatch
then we change them
the preparation of the mapping is done manually

In [29]:
# to identify the countries names that mismatch between IRCC and World bank

countries_wb = pd.DataFrame(countries_wb_df['country'].unique() , columns=['country'])
countries_ircc = pd.DataFrame(imm_clean_df['country'].unique(),  columns=['country'])

# Add a source column to indicate the origin
countries_wb['source'] = 'countries_wb'
countries_ircc['source'] = 'countries_ircc'

# Concatenate the two DataFrames
all_countries = pd.concat([countries_wb, countries_ircc], ignore_index=True)

# Find non-corresponding values
non_corresponding_values = all_countries.groupby('country').filter(lambda x: len(x) == 1)

# Display the non-corresponding values with their origin
print(non_corresponding_values)
# non_corresponding_values.to_csv('Output/non_corresponding_countries.csv')

            country          source
0             Aruba    countries_wb
8    American Samoa    countries_wb
15            Benin    countries_wb
16     Burkina Faso    countries_wb
20     Bahamas, The    countries_wb
..              ...             ...
423        Holy See  countries_ircc
424          Azores  countries_ircc
427      Guadeloupe  countries_ircc
431         Reunion  countries_ircc
432    Sint-Maarten  countries_ircc

[158 rows x 2 columns]


In [30]:
# Change countries name to match to th United Nations standard

imm_clean_df['country'] = imm_clean_df['country'].replace({
                            
                                    'Azores'  :  'Canada' ,
                                    'Bahama Islands, The'  :  'Bahamas, The' ,
                                    'Benin, Republic of'  :  'Benin' ,
                                    'Bosnia-Herzegovina'  :  'Bosnia and Herzegovina' ,
                                    'Botswana, Republic of'  :  'Botswana' ,
                                    'Brunei'  :  'Brunei Darussalam' ,
                                    'Burkina-Faso'  :  'Burkina Faso' ,
                                    'Cameroon, Federal Republic of'  :  'Cameroon' ,
                                    'Cape Verde Islands'  :  'Cabo Verde' ,
                                    'Chad, Republic of'  :  'Chad' ,
                                    "China, People's Republic of"  :  'China' ,
                                    'Congo, Democratic Republic of the'  :  'Congo, Dem. Rep.' ,
                                    "Congo, People's Republic of the"  :  'Congo, Rep.' ,
                                    'Country not stated'  :  'Canada' ,
                                    'Czech Republic'  :  'Czechia' ,
                                    'Djibouti, Republic of'  :  'Djibouti' ,
                                    'East Timor, Democratic Republic of'  :  'Timor-Leste' ,
                                    'Egypt'  :  'Egypt, Arab Rep.' ,
                                    'Equatorial Guinea, Republic of'  :  'Equatorial Guinea' ,
                                    'Gabon Republic'  :  'Gabon' ,
                                    'Gambia'  :  'Gambia, The' ,
                                    'Guadeloupe'  :  'France' ,
                                    'Guinea, Republic of'  :  'Guinea' ,
                                    'Holy See'  :  'Canada' ,
                                    'Hong Kong SAR'  :  'Hong Kong SAR, China' ,
                                    'Indonesia, Republic of'  :  'Indonesia' ,
                                    'Iran'  :  'Iran, Islamic Rep.' ,
                                    'Ireland, Republic of'  :  'Ireland' ,
                                    'Ivory Coast, Republic of'  :  "Cote d'Ivoire" ,
                                    "Korea, People's Democratic Republic of"  :  "Korea, Dem. People's Rep." ,
                                    'Korea, Republic of'  :  'Korea, Rep.' ,
                                    'Kosovo, Republic of'  :  'Kosovo' ,
                                    'Kyrgyzstan'  :  'Kyrgyz Republic' ,
                                    'Laos'  :  'Lao PDR' ,
                                    'Macau SAR'  :  'Macao SAR, China' ,
                                    'Macedonia'  :  'North Macedonia' ,
                                    'Maldives, Republic of'  :  'Maldives' ,
                                    'Mali, Republic of'  :  'Mali' ,
                                    'Marshall Islands, Republic of the'  :  'Marshall Islands' ,
                                    'Micronesia, Federated States of'  :  'Canada' ,
                                    "Mongolia, People's Republic of"  :  'Mongolia' ,
                                    'Montenegro, Republic of'  :  'Montenegro' ,
                                    'Myanmar (Burma)'  :  'Myanmar' ,
                                    'Netherlands Antilles, The'  :  'Canada' ,
                                    'Netherlands, The'  :  'Netherlands' ,
                                    'Nevis'  :  'Canada' ,
                                    'Niger, Republic of the'  :  'Niger' ,
                                    'Northern Mariana Islands, Commonwealth of the'  :  'Northern Mariana Islands' ,
                                    'Other'  :  'Canada' ,
                                    'Palau, Republic of'  :  'Palau' ,
                                    'Palestinian Authority (Gaza/West Bank)'  :  'West Bank and Gaza' ,
                                    'Panama, Republic of'  :  'Panama' ,
                                    'Reunion'  :  'France' ,
                                    'Russia'  :  'Russian Federation' ,
                                    'Samoa, American'  :  'Samoa' ,
                                    'Samoa, Independent State of'  :  'American Samoa' ,
                                    'Serbia, Republic of'  :  'Serbia' ,
                                    'Sint-Maarten'  :  'Sint Maarten (Dutch part)' ,
                                    'Somalia, Democratic Republic of'  :  'Somalia' ,
                                    'South Africa, Republic of'  :  'South Africa' ,
                                    'South Sudan, Republic of'  :  'South Sudan' ,
                                    'St. Kitts-Nevis'  :  'Canada' ,
                                    'Stateless'  :  'Canada' ,
                                    'Sudan, Democratic Republic of'  :  'Sudan' ,
                                    'Surinam'  :  'Suriname' ,
                                    'Swaziland'  :  'Somalia' ,
                                    'Syria'  :  'Syrian Arab Republic' ,
                                    'Taiwan'  :  'China' ,
                                    'Tanzania, United Republic of'  :  'Tanzania' ,
                                    'Togo, Republic of'  :  'Togo' ,
                                    'Trinidad and Tobago, Republic of'  :  'Trinidad and Tobago' ,
                                    'Turkey'  :  'Turkiye' ,
                                    'United Kingdom and Overseas Territories'  :  'United Kingdom' ,
                                    'United States of America'  :  'United States' ,
                                    'Venezuela'  :  'Venezuela, RB' ,
                                    'Vietnam'  :  'Viet Nam' ,
                                    'Virgin Islands, British'  :  'British Virgin Islands' ,
                                    'Western Sahara'  :  'Canada' ,
                                    'Yemen'  :  'Yemen, Rep.' ,


                                     })



In [31]:
# chech if the modification is effiscinet : we should have only countries_wb source

countries_wb = pd.DataFrame(countries_wb_df['country'].unique() , columns=['country'])
countries_ircc = pd.DataFrame(imm_clean_df['country'].unique(),  columns=['country'])

# Add a source column to indicate the origin
countries_wb['source'] = 'countries_wb'
countries_ircc['source'] = 'countries_ircc'

# Concatenate the two DataFrames
all_countries = pd.concat([countries_wb, countries_ircc], ignore_index=True)

# Find non-corresponding values
non_corresponding_values = all_countries.groupby('country').filter(lambda x: len(x) == 1)

# Display the non-corresponding values with their origin
print(non_corresponding_values)  ## you must have only countries_wb source


                      country        source
0                       Aruba  countries_wb
34            Channel Islands  countries_wb
47             Cayman Islands  countries_wb
65              Faroe Islands  countries_wb
66      Micronesia, Fed. Sts.  countries_wb
71                  Gibraltar  countries_wb
78                  Greenland  countries_wb
80                       Guam  countries_wb
88                Isle of Man  countries_wb
104       St. Kitts and Nevis  countries_wb
119  St. Martin (French part)  countries_wb
183                  Eswatini  countries_wb
208     Virgin Islands (U.S.)  countries_wb


#### 2.2 other cleaning steps

In [32]:
# add months with numerical values
 ## check that there is no strange value 

months_str = imm_clean_df['month_str'].unique()
months_str

array(['Feb', 'Jan', 'Mar', 'Apr', 'Jun', 'May', 'Aug', 'Jul', 'Sep',
       'Dec', 'Nov', 'Oct'], dtype=object)

In [33]:
# use of Mapping to add integers to month abbreviations

month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Map month abbreviations to integers using the dictionary
imm_clean_df['month_int'] = imm_clean_df['month_str'].map(month_mapping)

imm_clean_df.head(5)

Unnamed: 0,year,quarter,month_str,country,immigration_flow,month_int
0,2015,Q1,Feb,Afghanistan,125,2
1,2015,Q1,Feb,Albania,25,2
2,2015,Q1,Feb,Algeria,125,2
3,2015,Q1,Feb,Antigua and Barbuda,5,2
4,2015,Q1,Feb,Argentina,15,2


In [34]:
# organize columns

imm_clean_df = imm_clean_df[['country', 'year', 'month_str', 
                            'month_int','quarter', 'immigration_flow']]
imm_clean_df.head(5)

Unnamed: 0,country,year,month_str,month_int,quarter,immigration_flow
0,Afghanistan,2015,Feb,2,Q1,125
1,Albania,2015,Feb,2,Q1,25
2,Algeria,2015,Feb,2,Q1,125
3,Antigua and Barbuda,2015,Feb,2,Q1,5
4,Argentina,2015,Feb,2,Q1,15


In [35]:
# this step is usefull to set a key during the DB building

imm_new_df = imm_clean_df.reset_index()
imm_new_df.head(5)

Unnamed: 0,index,country,year,month_str,month_int,quarter,immigration_flow
0,0,Afghanistan,2015,Feb,2,Q1,125
1,1,Albania,2015,Feb,2,Q1,25
2,2,Algeria,2015,Feb,2,Q1,125
3,3,Antigua and Barbuda,2015,Feb,2,Q1,5
4,4,Argentina,2015,Feb,2,Q1,15


In [36]:
# export to CSV

imm_new_df.to_csv('../Output/immigrants_by_country_monthly.csv', index=False)

# (3) Extract macro economic data
#### from world bank API
source : https://api.worldbank.org/v2/country/DZA/indicator/NY.GDP.PCAP.KD?date=2015:2024&format=json

to request the API, we need to customize
- the country using the iso3code : fo example Afghanistan = AFG
- define the indicator : there are more than 1480 indicators, a selection is done (manually or using an additional csv file filled manually to be filtred)
- 1st year and last year

because we need all the countries, the number of end points will be : number of countries x number of indicators x number of years

In [37]:
# the API uses the iso3Code code of the country (example PAK for pakistan, AFG for Afghanistan)
# we need to identify the needed countries from from our immigration_df and retrieve these iso3Code from countries_df

countries_request = pd.merge(imm_new_df, countries_wb_df, left_on='country', right_on='country')
countries_request

Unnamed: 0,index,country,year,month_str,month_int,quarter,immigration_flow,iso3Code,iso2Code,region,capitalCity,longitude,latitude
0,0,Afghanistan,2015,Feb,2,Q1,125,AFG,AF,South Asia,Kabul,69.1761,34.5228
1,1,Albania,2015,Feb,2,Q1,25,ALB,AL,Europe & Central Asia,Tirane,19.8172,41.3317
2,2,Algeria,2015,Feb,2,Q1,125,DZA,DZ,Middle East & North Africa,Algiers,3.05097,36.7397
3,3,Antigua and Barbuda,2015,Feb,2,Q1,5,ATG,AG,Latin America & Caribbean,Saint John's,-61.8456,17.1175
4,4,Argentina,2015,Feb,2,Q1,15,ARG,AR,Latin America & Caribbean,Buenos Aires,-58.4173,-34.6118
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20036,20036,"Venezuela, RB",2024,Oct,10,Q4,200,VEN,VE,Latin America & Caribbean,Caracas,-69.8371,9.08165
20037,20037,Viet Nam,2024,Oct,10,Q4,365,VNM,VN,East Asia & Pacific,Hanoi,105.825,21.0069
20038,20038,"Yemen, Rep.",2024,Oct,10,Q4,35,YEM,YE,Middle East & North Africa,Sana'a,44.2075,15.352
20039,20039,Zambia,2024,Oct,10,Q4,0,ZMB,ZM,Sub-Saharan Africa,Lusaka,28.2937,-15.3982


In [125]:
# extrac the needed iso3code in a list to be used to iterate

country_list = countries_request['iso3Code'].unique().tolist()
country_list[0:5]

['AFG', 'ALB', 'DZA', 'ATG', 'ARG']

In [None]:
# this line is just to check that we are connected to the API : O TEST before loop

link_pop = f'https://api.worldbank.org/v2/country/DZA/indicator/NY.GDP.PCAP.KD?date=2015:2024&format=json'
response_pop = requests.get(link_pop).json()

pprint(response_pop)

[{'lastupdated': '2025-01-28',
  'page': 1,
  'pages': 1,
  'per_page': 50,
  'sourceid': '2',
  'total': 9},
 [{'country': {'id': 'DZ', 'value': 'Algeria'},
   'countryiso3code': 'DZA',
   'date': '2023',
   'decimal': 1,
   'indicator': {'id': 'NY.GDP.PCAP.KD',
                 'value': 'GDP per capita (constant 2015 US$)'},
   'obs_status': '',
   'unit': '',
   'value': 4660.40545686886},
  {'country': {'id': 'DZ', 'value': 'Algeria'},
   'countryiso3code': 'DZA',
   'date': '2022',
   'decimal': 1,
   'indicator': {'id': 'NY.GDP.PCAP.KD',
                 'value': 'GDP per capita (constant 2015 US$)'},
   'obs_status': '',
   'unit': '',
   'value': 4544.46688059647},
  {'country': {'id': 'DZ', 'value': 'Algeria'},
   'countryiso3code': 'DZA',
   'date': '2021',
   'decimal': 1,
   'indicator': {'id': 'NY.GDP.PCAP.KD',
                 'value': 'GDP per capita (constant 2015 US$)'},
   'obs_status': '',
   'unit': '',
   'value': 4456.74687641147},
  {'country': {'id': 'DZ', 'valu

In [40]:
# this cell is used to check our code before to launch the loop 
# TO TEST before loop


link_pop = f'https://api.worldbank.org/v2/country/DZA/indicator/NY.GDP.PCAP.KD?date=2015:2024&format=json'
response_pop = requests.get(link_pop).json()

# pprint(response_pop)

pop_list = []
for item in response_pop[1]:  # Assuming response_pop[1] is a list of dictionaries
    pop_list.append({
        'iso3Code': item['countryiso3code'],
        'iso2Code': item['country']['id'],
        'Country': item['country']['value'],              
        'year': item['date'],
        'indicator': item['indicator']['value'],
        'value': item['value']
    })

pop_list

[{'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2023',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4660.40545686886},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2022',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4544.46688059647},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2021',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4456.74687641147},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2020',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4363.68533766912},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2019',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4672.66408682198},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2018',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4717.00358853556},
 {'i

In [None]:
# DEFINE THE VARIABLES : INDICATORS, TIME PERIOD

# define 1st and last year for the loop

year_min = 2015
year_max = 2024
# years_list = [x for x in range(year_min, year_max +1)]   # COULD BE USED BUT NOT IN THE CURRENT CODE


In [None]:
# DEFINE THE VARIABLES : INDICATORS, TIME PERIOD
# create the list of indicators

indicators = [
                'AG.LND.AGRI.ZS'  ,
                'AG.LND.PRCP.MM'  ,
                'ER.H2O.FWST.ZS'  ,
                'CC.EST'  ,
                'IC.REG.COST.PC.ZS'  ,
                'IC.BUS.DFRN.XQ'  ,
                'IC.FRM.BNKS.ZS'  ,
                'IC.REG.DURS'  ,
                'SL.UEM.ADVN.ZS'  ,
                'SL.UEM.BASC.ZS'  ,
                'SL.UEM.INTM.ZS'  ,
                'SL.UEM.TOTL.ZS'  ,
                'EG.CFT.ACCS.ZS'  ,
                'EG.ELC.ACCS.ZS'  ,
                'SP.DYN.CBRT.IN'  ,
                'SP.DYN.CDRT.IN'  ,
                'SP.DYN.TFRT.IN'  ,
                'HD.HCI.OVRL'  ,
                'SP.DYN.LE00.IN'  ,
                'EN.POP.DNST'  ,
                'SP.POP.GROW'  ,
                'EN.POP.EL5M.ZS'  ,
                'EN.POP.SLUM.UR.ZS'  ,
                'SP.POP.TOTL'  ,
                'SI.POV.DDAY'  ,
                'SI.POV.LMIC'  ,
                'SI.POV.UMIC'  ,
                'SI.POV.NAHC'  ,
                'SH.ALC.PCAP.LI'  ,
                'MS.MIL.TOTL.TF.ZS'  ,
                'GC.DOD.TOTL.GD.ZS'  ,
                'FP.CPI.TOTL'  ,
                'BN.CAB.XOKA.GD.ZS'  ,
                'SH.XPD.CHEX.GD.ZS'  ,
                'DT.TDS.DPPF.XP.ZS'  ,
                'NY.GDP.MKTP.KD.ZG'  ,
                'NY.GDP.PCAP.KD'  ,
                'NY.GDP.PCAP.PP.KD'  ,
                'NE.CON.GOVT.ZS'  ,
                'NY.GNP.PCAP.KD'  ,
                'SE.XPD.TOTL.GD.ZS'  ,
                'NV.IND.TOTL.ZS'  ,
                'FP.CPI.TOTL.ZG'  ,
                'MS.MIL.XPND.GD.ZS'  ,
                'DT.TDS.DECT.EX.ZS'  ,
                'DT.TDS.DECT.GN.ZS'  ,
                
                ]

print(f'Number of indicators is : {len(indicators)}')

Number of indicators is : 46


In [None]:
# lOOP to extract data from API : 20 indicators (between 15 and 18 minutes), 46 indicators (between 50 and 60 minutes)

# this code will loop through all the listed countries and the needed indicators to generate a list

print(f"start code")
print('---------------------------')

macro_eco_data_list = []

# the end point is defined by (i) country (II) indicator (III) year start and year end
        # => we need to create 1 end point of each country / indicator

for i in country_list :

    for j in indicators :

        print(f"--------------------------")
        print(f"country {i} indicator {j}")

        try:

            # define the end point by country (from list I) and indicator (from list J)

            link_pop = f'https://api.worldbank.org/v2/country/{i}/indicator/{j}?date={year_min}:{year_max}&format=json'
            response_pop = requests.get(link_pop).json()

            # loop through the response_pop (list) to extract the value for each year in the serie 

            for item in response_pop[1]:  
                macro_eco_data_list.append({
                    'iso3Code': item['countryiso3code'],
                    'iso2Code': item['country']['id'],
                    'country': item['country']['value'],              
                    'year': item['date'],
                    'indicator': item['indicator']['value'],
                    'value': item['value']
                })
        except:   
            print(f"country {i} indicator {j}....... not found")

start code
---------------------------
--------------------------
country AFG indicator AG.LND.AGRI.ZS
--------------------------
country AFG indicator AG.LND.PRCP.MM
--------------------------
country AFG indicator ER.H2O.FWST.ZS
--------------------------
country AFG indicator CC.EST
--------------------------
country AFG indicator IC.REG.COST.PC.ZS
--------------------------
country AFG indicator IC.BUS.DFRN.XQ
--------------------------
country AFG indicator IC.FRM.BNKS.ZS
--------------------------
country AFG indicator IC.REG.DURS
--------------------------
country AFG indicator SL.UEM.ADVN.ZS
--------------------------
country AFG indicator SL.UEM.BASC.ZS
--------------------------
country AFG indicator SL.UEM.INTM.ZS
--------------------------
country AFG indicator SL.UEM.TOTL.ZS
--------------------------
country AFG indicator EG.CFT.ACCS.ZS
--------------------------
country AFG indicator EG.ELC.ACCS.ZS
--------------------------
country AFG indicator SP.DYN.CBRT.IN
---------

In [None]:
## Create DF with the previous output

raw_macrodata_df = pd.DataFrame(macro_eco_data_list)

raw_macrodata_df.head()

Unnamed: 0,iso3Code,iso2Code,country,year,indicator,value
0,AFG,AF,Afghanistan,2023,Agricultural land (% of land area),
1,AFG,AF,Afghanistan,2022,Agricultural land (% of land area),58.741548
2,AFG,AF,Afghanistan,2021,Agricultural land (% of land area),58.741548
3,AFG,AF,Afghanistan,2020,Agricultural land (% of land area),58.741548
4,AFG,AF,Afghanistan,2019,Agricultural land (% of land area),58.276988


### 3.1 the next steps will replace each nan value with the previous value available
to do : 
- we pivot the DF with years as column
- we replace 2015 nan values with 0
- we replace all other nan values with the previous value (using fillna(method='ffill', axis=1))

In [88]:
raw_macrodata_df.shape

(83835, 6)

In [79]:
temp_df = raw_macrodata_df.pivot_table(index=['indicator', 'country','iso3Code','iso2Code'], columns='year', values='value')
temp_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,year,2015,2016,2017,2018,2019,2020,2021,2022,2023
indicator,country,iso3Code,iso2Code,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Access to clean fuels and technologies for cooking (% of population),Afghanistan,AFG,AF,27.6,28.8,30.3,31.4,32.6,33.8,34.9,36.1,
Access to clean fuels and technologies for cooking (% of population),Albania,ALB,AL,76.6,78.3,79.6,80.7,82.0,83.1,83.6,84.6,
Access to clean fuels and technologies for cooking (% of population),Algeria,DZA,DZ,99.6,99.6,99.7,99.7,99.7,99.7,99.7,99.7,
Access to clean fuels and technologies for cooking (% of population),Andorra,AND,AD,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
Access to clean fuels and technologies for cooking (% of population),Angola,AGO,AO,47.3,47.8,48.3,48.6,49.0,49.7,49.8,50.0,


In [89]:
temp_pivot = temp_df.fillna({'2015' : 0})
temp_pivot.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,year,2015,2016,2017,2018,2019,2020,2021,2022,2023
indicator,country,iso3Code,iso2Code,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Access to clean fuels and technologies for cooking (% of population),Afghanistan,AFG,AF,27.6,28.8,30.3,31.4,32.6,33.8,34.9,36.1,
Access to clean fuels and technologies for cooking (% of population),Albania,ALB,AL,76.6,78.3,79.6,80.7,82.0,83.1,83.6,84.6,
Access to clean fuels and technologies for cooking (% of population),Algeria,DZA,DZ,99.6,99.6,99.7,99.7,99.7,99.7,99.7,99.7,
Access to clean fuels and technologies for cooking (% of population),Andorra,AND,AD,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,
Access to clean fuels and technologies for cooking (% of population),Angola,AGO,AO,47.3,47.8,48.3,48.6,49.0,49.7,49.8,50.0,


In [90]:
temp_pivot = temp_pivot.fillna(method='ffill', axis=1)
temp_pivot.head()

  temp_pivot = temp_pivot.fillna(method='ffill', axis=1)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,year,2015,2016,2017,2018,2019,2020,2021,2022,2023
indicator,country,iso3Code,iso2Code,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Access to clean fuels and technologies for cooking (% of population),Afghanistan,AFG,AF,27.6,28.8,30.3,31.4,32.6,33.8,34.9,36.1,36.1
Access to clean fuels and technologies for cooking (% of population),Albania,ALB,AL,76.6,78.3,79.6,80.7,82.0,83.1,83.6,84.6,84.6
Access to clean fuels and technologies for cooking (% of population),Algeria,DZA,DZ,99.6,99.6,99.7,99.7,99.7,99.7,99.7,99.7,99.7
Access to clean fuels and technologies for cooking (% of population),Andorra,AND,AD,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
Access to clean fuels and technologies for cooking (% of population),Angola,AGO,AO,47.3,47.8,48.3,48.6,49.0,49.7,49.8,50.0,50.0


In [92]:
temp_pivot = temp_pivot.reset_index()
temp_pivot.columns

Index(['indicator', 'country', 'iso3Code', 'iso2Code', '2015', '2016', '2017',
       '2018', '2019', '2020', '2021', '2022', '2023'],
      dtype='object', name='year')

In [93]:
temp_pivot.head()

year,indicator,country,iso3Code,iso2Code,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Access to clean fuels and technologies for coo...,Afghanistan,AFG,AF,27.6,28.8,30.3,31.4,32.6,33.8,34.9,36.1,36.1
1,Access to clean fuels and technologies for coo...,Albania,ALB,AL,76.6,78.3,79.6,80.7,82.0,83.1,83.6,84.6,84.6
2,Access to clean fuels and technologies for coo...,Algeria,DZA,DZ,99.6,99.6,99.7,99.7,99.7,99.7,99.7,99.7,99.7
3,Access to clean fuels and technologies for coo...,Andorra,AND,AD,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
4,Access to clean fuels and technologies for coo...,Angola,AGO,AO,47.3,47.8,48.3,48.6,49.0,49.7,49.8,50.0,50.0


In [94]:
macro_eco_data_df = pd.melt(temp_pivot, id_vars=['indicator', 'country','iso3Code','iso2Code'], var_name='year', value_name='value' )
macro_eco_data_df

Unnamed: 0,indicator,country,iso3Code,iso2Code,year,value
0,Access to clean fuels and technologies for coo...,Afghanistan,AFG,AF,2015,27.600
1,Access to clean fuels and technologies for coo...,Albania,ALB,AL,2015,76.600
2,Access to clean fuels and technologies for coo...,Algeria,DZA,DZ,2015,99.600
3,Access to clean fuels and technologies for coo...,Andorra,AND,AD,2015,100.000
4,Access to clean fuels and technologies for coo...,Angola,AGO,AO,2015,47.300
...,...,...,...,...,...,...
71626,"Unemployment, total (% of total labor force) (...",Viet Nam,VNM,VN,2023,1.645
71627,"Unemployment, total (% of total labor force) (...",West Bank and Gaza,PSE,PS,2023,24.420
71628,"Unemployment, total (% of total labor force) (...","Yemen, Rep.",YEM,YE,2023,17.091
71629,"Unemployment, total (% of total labor force) (...",Zambia,ZMB,ZM,2023,5.905


In [95]:
print(f'size of the DF : {macro_eco_data_df.shape}')
print('................')
print(macro_eco_data_df.dtypes)

size of the DF : (71631, 6)
................
indicator     object
country       object
iso3Code      object
iso2Code      object
year          object
value        float64
dtype: object


In [96]:
# change datatype 

macro_eco_data_df = macro_eco_data_df.astype({'year':'int64'})
macro_eco_data_df.dtypes

indicator     object
country       object
iso3Code      object
iso2Code      object
year           int64
value        float64
dtype: object

In [100]:
macro_eco_data_df = macro_eco_data_df.reset_index()
macro_eco_data_df.head()

Unnamed: 0,index,indicator,country,iso3Code,iso2Code,year,value
0,0,Access to clean fuels and technologies for coo...,Afghanistan,AFG,AF,2015,27.6
1,1,Access to clean fuels and technologies for coo...,Albania,ALB,AL,2015,76.6
2,2,Access to clean fuels and technologies for coo...,Algeria,DZA,DZ,2015,99.6
3,3,Access to clean fuels and technologies for coo...,Andorra,AND,AD,2015,100.0
4,4,Access to clean fuels and technologies for coo...,Angola,AGO,AO,2015,47.3


In [None]:
# export to csv

macro_eco_data_df.to_csv('../Output/macro_economic_data.csv', index=False)

# (4) Extract Corruption perception indicator


source : https://images.transparencycdn.org/images/CPI2023_FullDataSet.zip

from webpage : https://www.transparency.org/en/cpi/2023/media-kit

extract sheet : CPI 2023

In [14]:
url = "https://images.transparencycdn.org/images/CPI2023_FullDataSet.zip"

# Download the ZIP file (code from Chat GPT)

response = requests.get(url)
if response.status_code == 200:

    # Step 2: Open the ZIP file from memory
    with zipfile.ZipFile(io.BytesIO(response.content), "r") as zip_ref:
        # Extract all files to a folder
        zip_ref.extractall("CPI2023_Data")
        print("Files extracted successfully!")

        # Step 3: Identify the Excel file (assuming there's only one)
        excel_file = [f for f in zip_ref.namelist() if f.endswith(".xlsx")][0]
        excel_path = f"CPI2023_Data/{excel_file}"

        # Step 4: Read the "CPI 2023" sheet into a DataFrame and starting from row 4
        cpi_df = pd.read_excel(excel_path, sheet_name="CPI 2023", skiprows=3)
        print("Sheet 'CPI 2023' loaded into a DataFrame successfully!")

        # Display the first few rows
        cpi_df.head()
else:
    print("Failed to download the file. Check the URL.")


Files extracted successfully!
Sheet 'CPI 2023' loaded into a DataFrame successfully!


In [15]:
cpi_df.head()

Unnamed: 0,Country / Territory,ISO3,Region,CPI score 2023,Rank,Standard error,Number of sources,Lower CI,Upper CI,African Development Bank CPIA,...,Economist Intelligence Unit Country Ratings,Freedom House Nations in Transit,Global Insights Country Risk Ratings,IMD World Competitiveness Yearbook,PERC Asia Risk Guide,PRS International Country Risk Guide,Varieties of Democracy Project,World Bank CPIA,World Economic Forum EOS,World Justice Project Rule of Law Index
0,Denmark,DNK,WE/EU,90,1,1.819265,8,87.0164,92.9836,,...,89.67447,,83.25637,97.89119,,100.0,78.89152,,82.20667,87.68097
1,Finland,FIN,WE/EU,87,2,1.057962,8,85.26495,88.73505,,...,89.67447,,83.25637,89.84466,,93.38018,78.16811,,86.76612,85.88441
2,New Zealand,NZL,AP,85,3,1.916107,8,81.85758,88.14242,,...,89.67447,,83.25637,87.10542,,93.38018,78.16811,,70.98344,82.2913
3,Norway,NOR,WE/EU,84,4,1.380803,7,81.73548,86.26452,,...,89.67447,,83.25637,75.72045,,84.67139,78.16811,,,86.78268
4,Singapore,SGP,AP,83,5,1.145734,9,81.12099,84.87901,,...,89.67447,,83.25637,76.74767,89.29896,83.27798,78.16811,,88.51975,84.08785


In [16]:
cpi_df.columns

Index(['Country / Territory', 'ISO3', 'Region', 'CPI score 2023', 'Rank',
       'Standard error', 'Number of sources', 'Lower CI', 'Upper CI',
       'African Development Bank CPIA',
       'Bertelsmann Foundation Sustainable Governance Index',
       'Bertelsmann Foundation Transformation Index',
       'Economist Intelligence Unit Country Ratings',
       'Freedom House Nations in Transit',
       'Global Insights Country Risk Ratings',
       'IMD World Competitiveness Yearbook', 'PERC Asia Risk Guide',
       'PRS International Country Risk Guide',
       'Varieties of Democracy Project', 'World Bank CPIA',
       'World Economic Forum EOS', 'World Justice Project Rule of Law Index'],
      dtype='object')

In [17]:
cpi_df = cpi_df.rename(columns={
                                'Country / Territory' : 'country' , 
                                'ISO3' : 'iso3Code' , 
                                'CPI score 2023' : 'CPI_score_2023'
                                
                                })

cpi_df = cpi_df[['country' , 'iso3Code', 'CPI_score_2023']]

cpi_df.columns

Index(['country', 'iso3Code', 'CPI_score_2023'], dtype='object')

In [18]:
cpi_df.shape

(180, 3)

In [19]:
cpi_df.head(5)

Unnamed: 0,country,iso3Code,CPI_score_2023
0,Denmark,DNK,90
1,Finland,FIN,87
2,New Zealand,NZL,85
3,Norway,NOR,84
4,Singapore,SGP,83


In [20]:
# Save to CSV 
cpi_df.to_csv("../Output/corruption_perception_index.csv", index=False)

# (5) Extract Global_Peace_Index data
source Wikidepia : https://en.wikipedia.org/wiki/Global_Peace_Index

the full data and the report is available as PDF from the Economics And Peace organization
link : https://www.economicsandpeace.org/wp-content/uploads/2023/09/GPI-2023-Web.pdf

In [8]:
# extract the data from Wikipedia page

## define URL
url = "https://en.wikipedia.org/wiki/Global_Peace_Index"

# Read all tables from the page
tables = pd.read_html(url)

# Find the correct table
gpi_df = tables[1]  

# Display the extracted table
gpi_df = gpi_df.sort_values('Country', ascending=True)
gpi_df.head()


Unnamed: 0,Rank,Country,Score,Change
159,160,Afghanistan,3.294,
41,42,Albania,1.809,3.0
89,90,Algeria,2.11,2.0
71,72,Angola,2.043,19.0
46,47,Argentina,1.855,2.0


In [9]:
# rename column and drop other column
gpi_df = gpi_df.rename(columns={
                                'Country' : 'country',
                                'Score' : 'Global_Peace_Index_2023'
                                
                                })
gpi_df = gpi_df[['country', 'Global_Peace_Index_2023']]


gpi_df.head()

Unnamed: 0,country,Global_Peace_Index_2023
159,Afghanistan,3.294
41,Albania,1.809
89,Algeria,2.11
71,Angola,2.043
46,Argentina,1.855


#### for the steps below, refer to 2.1 above

In [10]:
# to identify the countries names that mismatch between IRCC and World bank

countries_wb = pd.DataFrame(countries_wb_df['country'].unique() , columns=['country'])
countries_gpi = pd.DataFrame(gpi_df['country'].unique(),  columns=['country'])

# Add a source column to indicate the origin
countries_wb['source'] = 'countries_wb'
countries_gpi['source'] = 'countries_gpi'

# Concatenate the two DataFrames
all_countries = pd.concat([countries_wb, countries_gpi], ignore_index=True)

# Find non-corresponding values
non_corresponding_values = all_countries.groupby('country').filter(lambda x: len(x) == 1)

# Display the non-corresponding values with their origin
print(non_corresponding_values)

# to change the countries name, printexport this DF below and work with the resul
# non_corresponding_values.to_csv('Output/non_corresponding_countries.csv')

                 country         source
0                  Aruba   countries_wb
4                Andorra   countries_wb
8         American Samoa   countries_wb
9    Antigua and Barbuda   countries_wb
20          Bahamas, The   countries_wb
..                   ...            ...
362           The Gambia  countries_gpi
366               Turkey  countries_gpi
375            Venezuela  countries_gpi
376              Vietnam  countries_gpi
377                Yemen  countries_gpi

[96 rows x 2 columns]


In [11]:
# Change countries name to match to th United Nations standard

gpi_df['country'] = gpi_df['country'].replace({
                            
                                                'Czech Republic' : 'Czechia' ,
                                                'Democratic Republic of the Congo' : 'Congo, Dem. Rep.' ,
                                                'East Timor' : 'Timor-Leste' ,
                                                'Egypt' : 'Egypt, Arab Rep.' ,
                                                'Iran' : 'Iran, Islamic Rep.' ,
                                                'Ivory Coast' : "Cote d'Ivoire" ,
                                                'Kyrgyzstan' : 'Kyrgyz Republic' ,
                                                'Laos' : 'Lao PDR' ,
                                                'North Korea' : "Korea, Dem. People's Rep." ,
                                                'Palestine' : 'West Bank and Gaza' ,
                                                'Republic of the Congo' : 'Congo, Rep.' ,
                                                'Russia' : 'Russian Federation' ,
                                                'Slovakia' : 'Slovak Republic' ,
                                                'South Korea' : 'Korea, Rep.' ,
                                                'Syria' : 'Syrian Arab Republic' ,
                                                'Taiwan' : 'Taiwan' ,
                                                'The Gambia' : 'Gambia, The' ,
                                                'Turkey' : 'Turkiye' ,
                                                'Venezuela' : 'Venezuela, RB' ,
                                                'Vietnam' : 'Viet Nam' ,
                                                'Yemen' : 'Yemen, Rep.' 

                                     })



In [12]:
# to identify the countries names that mismatch between IRCC and World bank

countries_wb = pd.DataFrame(countries_wb_df['country'].unique() , columns=['country'])
countries_gpi = pd.DataFrame(gpi_df['country'].unique(),  columns=['country'])

# Add a source column to indicate the origin
countries_wb['source'] = 'countries_wb'
countries_gpi['source'] = 'countries_gpi'

# Concatenate the two DataFrames
all_countries = pd.concat([countries_wb, countries_gpi], ignore_index=True)

# Find non-corresponding values
non_corresponding_values = all_countries.groupby('country').filter(lambda x: len(x) == 1)

# Display the non-corresponding values with their origin
print(non_corresponding_values)


                            country         source
0                             Aruba   countries_wb
4                           Andorra   countries_wb
8                    American Samoa   countries_wb
9               Antigua and Barbuda   countries_wb
20                     Bahamas, The   countries_wb
23                           Belize   countries_wb
24                          Bermuda   countries_wb
27                         Barbados   countries_wb
28                Brunei Darussalam   countries_wb
34                  Channel Islands   countries_wb
42                          Comoros   countries_wb
43                       Cabo Verde   countries_wb
46                          Curacao   countries_wb
47                   Cayman Islands   countries_wb
52                         Dominica   countries_wb
63                             Fiji   countries_wb
65                    Faroe Islands   countries_wb
66            Micronesia, Fed. Sts.   countries_wb
71                        Gibra

In [43]:
gpi_df.shape

(163, 2)

In [13]:
# Save to CSV 
gpi_df.to_csv("../Output/Global_Peace_Index.csv", index=False)