In [1]:
## Dependencies

import pandas as pd
import pathlib as path



import requests
import json
from pprint import pprint

import numpy as np
from scipy.stats import linregress

from io import StringIO

# import matplotlib.pyplot as plt
# import hvplot.pandas
# import geopandas as gpd

# (1) create DF for countries

#### information will be used to merge or generate vizualisations
source : https://api.worldbank.org/v2/country?format=json

In [2]:
## retrieve all the countries from world bank API

### generate DF Countries

page = 1   ## there are 6 pages in the json file
data = []
url_countries = 'https://api.worldbank.org/v2/country?format=json'

for page in range(1,7):
    response = requests.get(f"{url_countries}&page={page}")
    json_data = response.json()
    
    # Add the data from the current page
    data.extend(json_data[1])  
    
## create DF and select columns

countries_wb_df = pd.DataFrame(data)
countries_wb_df['region'] = countries_wb_df['region'].apply(lambda x: x['value'])
countries_wb_df = countries_wb_df[['id', 'iso2Code', 'name', 'region','capitalCity', 'longitude', 'latitude']]
countries_wb_df = countries_wb_df.rename(columns={
                                                    'name':'country',
                                                    'id':'iso3Code'
                                                    })


regions = (countries_wb_df['region'] != "Aggregates")
countries_wb_df = countries_wb_df[regions]


countries_wb_df.head()

Unnamed: 0,iso3Code,iso2Code,country,region,capitalCity,longitude,latitude
0,ABW,AW,Aruba,Latin America & Caribbean,Oranjestad,-70.0167,12.5167
2,AFG,AF,Afghanistan,South Asia,Kabul,69.1761,34.5228
5,AGO,AO,Angola,Sub-Saharan Africa,Luanda,13.242,-8.81155
6,ALB,AL,Albania,Europe & Central Asia,Tirane,19.8172,41.3317
7,AND,AD,Andorra,Europe & Central Asia,Andorra la Vella,1.5218,42.5075


In [3]:
countries_wb_df = countries_wb_df.set_index('iso3Code')
countries_wb_df

Unnamed: 0_level_0,iso2Code,country,region,capitalCity,longitude,latitude
iso3Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABW,AW,Aruba,Latin America & Caribbean,Oranjestad,-70.0167,12.5167
AFG,AF,Afghanistan,South Asia,Kabul,69.1761,34.5228
AGO,AO,Angola,Sub-Saharan Africa,Luanda,13.242,-8.81155
ALB,AL,Albania,Europe & Central Asia,Tirane,19.8172,41.3317
AND,AD,Andorra,Europe & Central Asia,Andorra la Vella,1.5218,42.5075
...,...,...,...,...,...,...
XKX,XK,Kosovo,Europe & Central Asia,Pristina,20.926,42.565
YEM,YE,"Yemen, Rep.",Middle East & North Africa,Sana'a,44.2075,15.352
ZAF,ZA,South Africa,Sub-Saharan Africa,Pretoria,28.1871,-25.746
ZMB,ZM,Zambia,Sub-Saharan Africa,Lusaka,28.2937,-15.3982


In [4]:
### Export to csv file in output

countries_wb_df.to_csv('Output/countries_list_UN_referential.csv')


# (2) collect immigration data 

##### Data source from IRCC web site
https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-PR-Citz.csv
from
https://ouvert.canada.ca/data/fr/dataset/f7e5498e-0ad8-4417-85c9-9b8aff9b9eda/resource/5f5fa9ca-b621-4dac-91d1-25654a25735c

In [2]:
# # Define the path to your CSV file
# path_imm = 'https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-PR-Citz.csv'

# # Fetch the CSV file content
# response = requests.get(path_imm)
# response.raise_for_status()  # Check if the request was successful

# # Decode the content from binary to text
# content = response.content.decode('utf-8')

# # Use StringIO to read the content as if it were a file
# content_io = StringIO(content)

# # Read the CSV file into a DataFrame using pandas
# imm_df = pd.read_csv(content_io, delimiter='\t', dtype=str)


In [None]:
# imm_df.head(5)

In [5]:
### import and read csv file

path_imm = 'https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-PR-Citz.csv'
imm_df = pd.read_csv(path_imm, delimiter='\t')

imm_df.to_csv('Resources/immigration_raw_data.csv')
imm_df.head(5)

Unnamed: 0,EN_YEAR,EN_QUARTER,EN_MONTH,FR_ANNEÉ,FR_TRIMESTRE,FR_MOIS,EN_COUNTRY_OF_CITIZENSHIP,FR_PAYS_DE_CITOYENNETÉ,TOTAL
0,2015,Q1,Feb,2015,T1,fév.,Afghanistan,Afghanistan,125
1,2015,Q1,Feb,2015,T1,fév.,Albania,Albanie,25
2,2015,Q1,Feb,2015,T1,fév.,Algeria,Algérie,125
3,2015,Q1,Feb,2015,T1,fév.,Antigua and Barbuda,Antigua-et-Barbuda,5
4,2015,Q1,Feb,2015,T1,fév.,Argentina,Argentine,15


In [6]:
# drop useless columns

imm_df = imm_df.drop(columns=['FR_ANNEÉ','FR_TRIMESTRE','FR_MOIS','FR_PAYS_DE_CITOYENNETÉ'])
imm_df.head(5)

Unnamed: 0,EN_YEAR,EN_QUARTER,EN_MONTH,EN_COUNTRY_OF_CITIZENSHIP,TOTAL
0,2015,Q1,Feb,Afghanistan,125
1,2015,Q1,Feb,Albania,25
2,2015,Q1,Feb,Algeria,125
3,2015,Q1,Feb,Antigua and Barbuda,5
4,2015,Q1,Feb,Argentina,15


In [7]:
# check empty values with 1 example raw 42

print(imm_df.loc[42,])

EN_YEAR                          2015
EN_QUARTER                         Q1
EN_MONTH                          Feb
EN_COUNTRY_OF_CITIZENSHIP    Dominica
TOTAL                              --
Name: 42, dtype: object


In [8]:
imm_clean_df = imm_df.copy()

In [9]:
# convert Total to numeric

imm_clean_df['TOTAL'] = pd.to_numeric(imm_clean_df['TOTAL'], errors='coerce')

# replace NAN with 0 and change data type

imm_clean_df = imm_clean_df.fillna({'TOTAL': 0})
imm_clean_df = imm_clean_df.astype({'TOTAL': 'int64'})
print(imm_clean_df.loc[42,])

EN_YEAR                          2015
EN_QUARTER                         Q1
EN_MONTH                          Feb
EN_COUNTRY_OF_CITIZENSHIP    Dominica
TOTAL                               0
Name: 42, dtype: object


In [10]:
# check that total is numeric _ integer

imm_clean_df.dtypes

EN_YEAR                       int64
EN_QUARTER                   object
EN_MONTH                     object
EN_COUNTRY_OF_CITIZENSHIP    object
TOTAL                         int64
dtype: object

In [11]:
# rename columns 

imm_clean_df = imm_clean_df.rename(columns={
                                            'EN_YEAR'            :           'year' ,
                                            'EN_QUARTER'         :          'quarter' , 
                                            'EN_MONTH'           :          'month_str' ,
                                            'EN_COUNTRY_OF_CITIZENSHIP' :   'country' ,  # will be splitted and replaced
                                            'TOTAL'              :           'immigration_flow'
                                            })
imm_clean_df.dtypes

year                 int64
quarter             object
month_str           object
country             object
immigration_flow     int64
dtype: object

In [12]:
countries_wb = pd.DataFrame(countries_wb_df['country'].unique() , columns=['country'])
countries_ircc = pd.DataFrame(imm_clean_df['country'].unique(),  columns=['country'])

# Add a source column to indicate the origin
countries_wb['source'] = 'countries_wb'
countries_ircc['source'] = 'countries_ircc'

# Concatenate the two DataFrames
all_countries = pd.concat([countries_wb, countries_ircc], ignore_index=True)

# Find non-corresponding values
non_corresponding_values = all_countries.groupby('country').filter(lambda x: len(x) == 1)

# Display the non-corresponding values with their origin
print(non_corresponding_values)
non_corresponding_values.to_csv('Output/non_corresponding_countries.csv')

            country          source
0             Aruba    countries_wb
8    American Samoa    countries_wb
15            Benin    countries_wb
16     Burkina Faso    countries_wb
20     Bahamas, The    countries_wb
..              ...             ...
423        Holy See  countries_ircc
424          Azores  countries_ircc
427      Guadeloupe  countries_ircc
431         Reunion  countries_ircc
432    Sint-Maarten  countries_ircc

[159 rows x 2 columns]


In [13]:
# Change countries name to match to th United Nations standard

imm_clean_df['country'] = imm_clean_df['country'].replace({
                            
                                    'Azores'  :  'Canada' ,
                                    'Bahama Islands, The'  :  'Bahamas, The' ,
                                    'Benin, Republic of'  :  'Benin' ,
                                    'Bosnia-Herzegovina'  :  'Bosnia and Herzegovina' ,
                                    'Botswana, Republic of'  :  'Botswana' ,
                                    'Brunei'  :  'Brunei Darussalam' ,
                                    'Burkina-Faso'  :  'Burkina Faso' ,
                                    'Cameroon, Federal Republic of'  :  'Cameroon' ,
                                    'Cape Verde Islands'  :  'Cabo Verde' ,
                                    'Chad, Republic of'  :  'Chad' ,
                                    "China, People's Republic of"  :  'China' ,
                                    'Congo, Democratic Republic of the'  :  'Congo, Dem. Rep.' ,
                                    "Congo, People's Republic of the"  :  'Congo, Rep.' ,
                                    'Country not stated'  :  'Canada' ,
                                    'Czech Republic'  :  'Czechia' ,
                                    'Djibouti, Republic of'  :  'Djibouti' ,
                                    'East Timor, Democratic Republic of'  :  'Timor-Leste' ,
                                    'Egypt'  :  'Egypt, Arab Rep.' ,
                                    'Equatorial Guinea, Republic of'  :  'Equatorial Guinea' ,
                                    'Gabon Republic'  :  'Gabon' ,
                                    'Gambia'  :  'Gambia, The' ,
                                    'Guadeloupe'  :  'France' ,
                                    'Guinea, Republic of'  :  'Guinea' ,
                                    'Holy See'  :  'Canada' ,
                                    'Hong Kong SAR'  :  'Hong Kong SAR, China' ,
                                    'Indonesia, Republic of'  :  'Indonesia' ,
                                    'Iran'  :  'Iran, Islamic Rep.' ,
                                    'Ireland, Republic of'  :  'Ireland' ,
                                    'Ivory Coast, Republic of'  :  "Cote d'Ivoire" ,
                                    "Korea, People's Democratic Republic of"  :  "Korea, Dem. People's Rep." ,
                                    'Korea, Republic of'  :  'Korea, Rep.' ,
                                    'Kosovo, Republic of'  :  'Kosovo' ,
                                    'Kyrgyzstan'  :  'Kyrgyz Republic' ,
                                    'Laos'  :  'Lao PDR' ,
                                    'Macau SAR'  :  'Macao SAR, China' ,
                                    'Macedonia'  :  'North Macedonia' ,
                                    'Maldives, Republic of'  :  'Maldives' ,
                                    'Mali, Republic of'  :  'Mali' ,
                                    'Marshall Islands, Republic of the'  :  'Marshall Islands' ,
                                    'Micronesia, Federated States of'  :  'Canada' ,
                                    "Mongolia, People's Republic of"  :  'Mongolia' ,
                                    'Montenegro, Republic of'  :  'Montenegro' ,
                                    'Myanmar (Burma)'  :  'Myanmar' ,
                                    'Netherlands Antilles, The'  :  'Canada' ,
                                    'Netherlands, The'  :  'Netherlands' ,
                                    'Nevis'  :  'Canada' ,
                                    'Niger, Republic of the'  :  'Niger' ,
                                    'Northern Mariana Islands, Commonwealth of the'  :  'Northern Mariana Islands' ,
                                    'Other'  :  'Canada' ,
                                    'Palau, Republic of'  :  'Palau' ,
                                    'Palestinian Authority (Gaza/West Bank)'  :  'West Bank and Gaza' ,
                                    'Panama, Republic of'  :  'Panama' ,
                                    'Reunion'  :  'France' ,
                                    'Russia'  :  'Russian Federation' ,
                                    'Samoa, American'  :  'Samoa' ,
                                    'Samoa, Independent State of'  :  'American Samoa' ,
                                    'Serbia, Republic of'  :  'Serbia' ,
                                    'Sint-Maarten'  :  'Sint Maarten (Dutch part)' ,
                                    'Somalia, Democratic Republic of'  :  'Somalia' ,
                                    'South Africa, Republic of'  :  'South Africa' ,
                                    'South Sudan, Republic of'  :  'South Sudan' ,
                                    'St. Kitts-Nevis'  :  'Canada' ,
                                    'Stateless'  :  'Canada' ,
                                    'Sudan, Democratic Republic of'  :  'Sudan' ,
                                    'Surinam'  :  'Suriname' ,
                                    'Swaziland'  :  'Somalia' ,
                                    'Syria'  :  'Syrian Arab Republic' ,
                                    'Taiwan'  :  'China' ,
                                    'Tanzania, United Republic of'  :  'Tanzania' ,
                                    'Togo, Republic of'  :  'Togo' ,
                                    'Trinidad and Tobago, Republic of'  :  'Trinidad and Tobago' ,
                                    'Turkey'  :  'Turkiye' ,
                                    'United Kingdom and Overseas Territories'  :  'United Kingdom' ,
                                    'United States of America'  :  'United States' ,
                                    'Venezuela'  :  'Venezuela, RB' ,
                                    'Vietnam'  :  'Viet Nam' ,
                                    'Virgin Islands, British'  :  'British Virgin Islands' ,
                                    'Western Sahara'  :  'Canada' ,
                                    'Yemen'  :  'Yemen, Rep.' ,


                                     })



In [14]:
countries_wb = pd.DataFrame(countries_wb_df['country'].unique() , columns=['country'])
countries_ircc = pd.DataFrame(imm_clean_df['country'].unique(),  columns=['country'])

# Add a source column to indicate the origin
countries_wb['source'] = 'countries_wb'
countries_ircc['source'] = 'countries_ircc'

# Concatenate the two DataFrames
all_countries = pd.concat([countries_wb, countries_ircc], ignore_index=True)

# Find non-corresponding values
non_corresponding_values = all_countries.groupby('country').filter(lambda x: len(x) == 1)

# Display the non-corresponding values with their origin
print(non_corresponding_values)  ## you must have only countries_wb source


                      country        source
0                       Aruba  countries_wb
34            Channel Islands  countries_wb
47             Cayman Islands  countries_wb
65              Faroe Islands  countries_wb
66      Micronesia, Fed. Sts.  countries_wb
71                  Gibraltar  countries_wb
78                  Greenland  countries_wb
80                       Guam  countries_wb
88                Isle of Man  countries_wb
104       St. Kitts and Nevis  countries_wb
119  St. Martin (French part)  countries_wb
183                  Eswatini  countries_wb
187  Turks and Caicos Islands  countries_wb
208     Virgin Islands (U.S.)  countries_wb


In [98]:
# add months with numerical values
 ## check that there is no strange value 

months_str = imm_clean_df['month_str'].unique()
months_str

array(['Feb', 'Jan', 'Mar', 'Apr', 'Jun', 'May', 'Aug', 'Jul', 'Sep',
       'Dec', 'Nov', 'Oct'], dtype=object)

In [15]:
# use of Mapping to add integers to month abbreviations

month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Map month abbreviations to integers using the dictionary
imm_clean_df['month_int'] = imm_clean_df['month_str'].map(month_mapping)

imm_clean_df.head(5)

Unnamed: 0,year,quarter,month_str,country,immigration_flow,month_int
0,2015,Q1,Feb,Afghanistan,125,2
1,2015,Q1,Feb,Albania,25,2
2,2015,Q1,Feb,Algeria,125,2
3,2015,Q1,Feb,Antigua and Barbuda,5,2
4,2015,Q1,Feb,Argentina,15,2


In [16]:
# organize columns

imm_clean_df = imm_clean_df[['country', 'year', 'month_str', 
                            'month_int','quarter', 'immigration_flow']]
imm_clean_df.head(5)

Unnamed: 0,country,year,month_str,month_int,quarter,immigration_flow
0,Afghanistan,2015,Feb,2,Q1,125
1,Albania,2015,Feb,2,Q1,25
2,Algeria,2015,Feb,2,Q1,125
3,Antigua and Barbuda,2015,Feb,2,Q1,5
4,Argentina,2015,Feb,2,Q1,15


In [17]:
imm_new_df = imm_clean_df.reset_index()
imm_new_df.head(5)

Unnamed: 0,index,country,year,month_str,month_int,quarter,immigration_flow
0,0,Afghanistan,2015,Feb,2,Q1,125
1,1,Albania,2015,Feb,2,Q1,25
2,2,Algeria,2015,Feb,2,Q1,125
3,3,Antigua and Barbuda,2015,Feb,2,Q1,5
4,4,Argentina,2015,Feb,2,Q1,15


In [18]:
# export to CSV

imm_new_df.to_csv('Output/immigrants_by_country_monthly.csv', index=False)

In [19]:
# groupby country / year to get the total by country

imm_yearly = imm_new_df.groupby(['country','year'])['immigration_flow'].sum()
imm_yearly.to_csv('Output/immigrants_by_country_year.csv')
imm_yearly

country      year
Afghanistan  2015    2625
             2016    2655
             2017    3460
             2018    3560
             2019    3890
                     ... 
Zimbabwe     2020     335
             2021     505
             2022     615
             2023     555
             2024     505
Name: immigration_flow, Length: 1855, dtype: int64

In [20]:
# groupby to get the total by country

imm_bycountry = imm_new_df.groupby(['country'])['immigration_flow'].sum()
imm_bycountry.to_csv('Output/immigrants_by_country.csv')
imm_bycountry

country
Afghanistan           81305
Albania                5540
Algeria               39655
American Samoa            0
Andorra                   0
                      ...  
Viet Nam              41200
West Bank and Gaza     5920
Yemen, Rep.            5875
Zambia                  605
Zimbabwe               4610
Name: immigration_flow, Length: 203, dtype: int64

# (3) Extract macro economic data
#### from world bank API
source : https://api.worldbank.org/v2/country/DZA/indicator/NY.GDP.PCAP.KD?date=2015:2024&format=json

In [None]:
# the API uses the iso3Code code of the country (example PAK for pakistan, AFG for Afghanistan)
# we need to identify the needed countries from from our immigration_df and retrieve these iso3Code from countries_df

countries_request = pd.merge(imm_new_df, countries_wb_df, left_on='country', right_on='country')
countries_request

In [None]:
# extrac the needed iso3code in a list to be used to iterate

country_list = countries_request['iso3Code'].unique().tolist()
country_list

In [59]:
import requests

In [None]:
# this line is just to check that we are connected to the APIT : O TEST before loop

link_pop = f'https://api.worldbank.org/v2/country/DZA/indicator/NY.GDP.PCAP.KD?date=2015:2024&format=json'
response_pop = requests.get(link_pop).json()

pprint(response_pop)

In [None]:
# this cell is used to check our code before to launch the loop 
# TO TEST before loop


link_pop = f'https://api.worldbank.org/v2/country/DZA/indicator/NY.GDP.PCAP.KD?date=2015:2024&format=json'
response_pop = requests.get(link_pop).json()

# pprint(response_pop)

pop_list = []
for item in response_pop[1]:  # Assuming response_pop[1] is a list of dictionaries
    pop_list.append({
        'iso3Code': item['countryiso3code'],
        'iso2Code': item['country']['id'],
        'Country': item['country']['value'],              
        'year': item['date'],
        'indicator': item['indicator']['value'],
        'value': item['value']
    })

pop_list

In [62]:
# DEFINE THE VARIABLES : INDICATORS, TIME PERIOD

# create the list of indicators
            #  'NY.GDP.PCAP.KD' gdp per capita
            # 'SP.POP.TOTL' total population
            #  SL.UEM.ADVN.ZS : Unemployment with advanced education (% of total labor force with advanced education)
            # SL.UEM.TOTL.NE.ZS : Unemployment, total (% of total labor force) (national estimate)
            # SI.POV.DDAY  Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population)

            # Population density (people per sq. km of land area)	EN.POP.DNST
            # Level of water stress: freshwater withdrawal as a proportion of available freshwater resources	ER.H2O.FWST.ZS
            # Market capitalization of listed domestic companies (% of GDP) CM.MKT.LCAP.GD.ZS
            # Rural population (% of total population) SP.RUR.TOTL.ZS
            # S&P Global Equity Indices (annual % change)	CM.MKT.INDX.ZG
            # Voice and Accountability: Number of Sources	VA.NO.SRC



indicators = ['NY.GDP.PCAP.KD','SP.POP.TOTL' , 'SL.UEM.ADVN.ZS', 'SL.UEM.TOTL.NE.ZS', 'SI.POV.DDAY',
              'EN.POP.DNST', 'ER.H2O.FWST.ZS', 'CM.MKT.LCAP.GD.ZS', 'SP.RUR.TOTL.ZS', 'CM.MKT.INDX.ZG', 'VA.NO.SRC' ]

# define 1st and last year f the loop

year_min = 2015
year_max = 2024
years_list = [x for x in range(year_min, year_max +1)]   # COULD BE USED BUT NOT IN THE CURRENT CODE


In [None]:
# lOOP to extrac tdata from API : between 7 and 9 minutes

# this code will loop through all the listed countries and the needed indicators to generate a list

print(f"start code")
print('---------------------------')

macro_eco_data_list = []

# the end point is defined by (i) country (II) indicator (III) year start and year end
        # => we need to create 1 end point of each country / indicator

for i in country_list :

    for j in indicators :

        print(f"--------------------------")
        print(f"country {i} indicator {j}")

        try:

            # define the end point by country (from list I) and indicator (from list J)

            link_pop = f'https://api.worldbank.org/v2/country/{i}/indicator/{j}?date={year_min}:{year_max}&format=json'
            response_pop = requests.get(link_pop).json()

            # loop through the response_pop (list) to extract the value for each year in the serie 

            for item in response_pop[1]:  
                macro_eco_data_list.append({
                    'iso3Code': item['countryiso3code'],
                    'iso2Code': item['country']['id'],
                    'country': item['country']['value'],              
                    'year': item['date'],
                    'indicator': item['indicator']['value'],
                    'value': item['value']
                })
        except:   
            print(f"country {i} indicator {j}....... not found")

In [None]:
## Create DF with the previous output

macro_eco_data_df = pd.DataFrame(macro_eco_data_list)
macro_eco_data_df = macro_eco_data_df.dropna(how='all')

macro_eco_data_df.head()

In [None]:
print(f'size of the DF : {macro_eco_data_df.shape}')
print('................')
print(macro_eco_data_df.dtypes)

In [None]:
# change datatype 

macro_eco_data_df = macro_eco_data_df.astype({'year':'int64'})
macro_eco_data_df.dtypes

In [30]:
# export to csv

macro_eco_data_df.to_csv('Output/macro_economic_data.csv')