In [3]:
## Dependencies

import pandas as pd
import pathlib as path



import requests
import json
from pprint import pprint

# import numpy as np
# from scipy.stats import linregress

# import matplotlib.pyplot as plt
# import hvplot.pandas
# import geopandas as gpd

# (1) collect immigration data 

##### Data source from IRCC web site
https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-PR-Citz.csv
from
https://ouvert.canada.ca/data/fr/dataset/f7e5498e-0ad8-4417-85c9-9b8aff9b9eda/resource/5f5fa9ca-b621-4dac-91d1-25654a25735c

In [4]:
### import and read csv file

path_imm = 'https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-PR-Citz.csv'
imm_df = pd.read_csv(path_imm, delimiter='\t')

imm_df.to_csv('Output/immigration_raw_data.csv')
imm_df.head(5)

Unnamed: 0,EN_YEAR,EN_QUARTER,EN_MONTH,FR_ANNEÉ,FR_TRIMESTRE,FR_MOIS,EN_COUNTRY_OF_CITIZENSHIP,FR_PAYS_DE_CITOYENNETÉ,TOTAL
0,2015,Q1,Feb,2015,T1,fév.,Afghanistan,Afghanistan,125
1,2015,Q1,Feb,2015,T1,fév.,Albania,Albanie,25
2,2015,Q1,Feb,2015,T1,fév.,Algeria,Algérie,125
3,2015,Q1,Feb,2015,T1,fév.,Antigua and Barbuda,Antigua-et-Barbuda,5
4,2015,Q1,Feb,2015,T1,fév.,Argentina,Argentine,15


In [5]:
#check datatype 

imm_df.dtypes

EN_YEAR                       int64
EN_QUARTER                   object
EN_MONTH                     object
FR_ANNEÉ                      int64
FR_TRIMESTRE                 object
FR_MOIS                      object
EN_COUNTRY_OF_CITIZENSHIP    object
FR_PAYS_DE_CITOYENNETÉ       object
TOTAL                        object
dtype: object

In [6]:
# drop useless columns

imm_clean_df = imm_df.drop(columns=['FR_ANNEÉ','FR_TRIMESTRE','FR_MOIS','FR_PAYS_DE_CITOYENNETÉ'])
imm_clean_df.columns

Index(['EN_YEAR', 'EN_QUARTER', 'EN_MONTH', 'EN_COUNTRY_OF_CITIZENSHIP',
       'TOTAL'],
      dtype='object')

In [7]:
# check empty values with 1 example raw 42

print(imm_clean_df.loc[42,])

EN_YEAR                          2015
EN_QUARTER                         Q1
EN_MONTH                          Feb
EN_COUNTRY_OF_CITIZENSHIP    Dominica
TOTAL                              --
Name: 42, dtype: object


In [8]:
# convert Total to numeric

imm_clean_df['TOTAL'] = pd.to_numeric(imm_clean_df['TOTAL'], errors='coerce')

# replace empty with 0 

imm_clean_df = imm_clean_df.fillna({'TOTAL': 0})
print(imm_clean_df.loc[42,])

EN_YEAR                          2015
EN_QUARTER                         Q1
EN_MONTH                          Feb
EN_COUNTRY_OF_CITIZENSHIP    Dominica
TOTAL                             0.0
Name: 42, dtype: object


In [9]:
# check that total is numeric _ integer

imm_clean_df.dtypes

EN_YEAR                        int64
EN_QUARTER                    object
EN_MONTH                      object
EN_COUNTRY_OF_CITIZENSHIP     object
TOTAL                        float64
dtype: object

In [None]:
# To DELETE ???

# imm_clean_df = imm_clean_df.drop(columns=['FR_ANNEÉ','FR_TRIMESTRE','FR_MOIS','FR_PAYS_DE_CITOYENNETÉ'])
# imm_clean_df.columns

In [10]:
# rename columns 

imm_clean_df = imm_clean_df.rename(columns={
                                            'EN_YEAR'            :           'year' ,
                                            'EN_QUARTER'         :          'quarter' , 
                                            'EN_MONTH'           :          'month_str' ,
                                            'EN_COUNTRY_OF_CITIZENSHIP' :   'Country' ,
                                            'TOTAL'              :           'total'
                                            })
imm_clean_df.dtypes

year           int64
quarter       object
month_str     object
Country       object
total        float64
dtype: object

In [11]:
# add months with numerical values
 ## check that there is no strange value 

months_str = imm_clean_df['month_str'].unique()
months_str

array(['Feb', 'Jan', 'Mar', 'Apr', 'Jun', 'May', 'Aug', 'Jul', 'Sep',
       'Dec', 'Nov', 'Oct'], dtype=object)

In [12]:
# use of Mapping to add integers to month abbreviations

month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Map month abbreviations to integers using the dictionary
imm_clean_df['month_int'] = imm_clean_df['month_str'].map(month_mapping)

print(imm_clean_df.head())

   year quarter month_str              Country  total  month_int
0  2015      Q1       Feb          Afghanistan  125.0          2
1  2015      Q1       Feb              Albania   25.0          2
2  2015      Q1       Feb              Algeria  125.0          2
3  2015      Q1       Feb  Antigua and Barbuda    5.0          2
4  2015      Q1       Feb            Argentina   15.0          2


In [13]:
# extract useless values from country name (after the comma)
## example Congo

print(imm_clean_df.loc[34,])

year                                    2015
quarter                                   Q1
month_str                                Feb
Country      Congo, People's Republic of the
total                                    0.0
month_int                                  2
Name: 34, dtype: object


In [14]:
# split the country name column using delimiter ','

imm_clean_df[['country', 'country_add']] = imm_clean_df['Country']\
                                                    .str.split(',', expand=True)

print(imm_clean_df.loc[34,])

year                                      2015
quarter                                     Q1
month_str                                  Feb
Country        Congo, People's Republic of the
total                                      0.0
month_int                                    2
country                                  Congo
country_add           People's Republic of the
Name: 34, dtype: object


In [15]:
# keep only country name cleaned 

imm_clean_df = imm_clean_df.drop(columns=['Country','country_add'])
imm_clean_df.columns

Index(['year', 'quarter', 'month_str', 'total', 'month_int', 'country'], dtype='object')

In [16]:
# check with congo

print(imm_clean_df.loc[34,])

year          2015
quarter         Q1
month_str      Feb
total          0.0
month_int        2
country      Congo
Name: 34, dtype: object


In [17]:
# groupby country / year to get the total by country

imm_yearly = imm_clean_df.groupby(['country','year'])['total'].sum()
imm_yearly.to_csv('Output/immigrants_by_country_year.csv')
imm_yearly

country      year
Afghanistan  2015    2625.0
             2016    2655.0
             2017    3460.0
             2018    3560.0
             2019    3890.0
                      ...  
Zimbabwe     2020     335.0
             2021     505.0
             2022     615.0
             2023     555.0
             2024     505.0
Name: total, Length: 1887, dtype: float64

In [18]:
# groupby to get the total by country

imm_bycountry = imm_clean_df.groupby(['country'])['total'].sum()
imm_bycountry.to_csv('Output/immigrants_by_country.csv')
imm_bycountry

country
Afghanistan       81305.0
Albania            5540.0
Algeria           39655.0
Andorra               0.0
Angola              925.0
                   ...   
Virgin Islands        0.0
Western Sahara        0.0
Yemen              5875.0
Zambia              605.0
Zimbabwe           4610.0
Name: total, Length: 213, dtype: float64

# (2) create DF for countries

#### information will be used to merge or generate vizualisations
source : https://api.worldbank.org/v2/country?format=json

In [19]:
country_list = imm_clean_df['country'].tolist()
country_list

['Afghanistan',
 'Albania',
 'Algeria',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahama Islands',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bhutan',
 'Bolivia',
 'Bosnia-Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina-Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Eritrea',
 'Ethiopia',
 'Fiji',
 'Finland',
 'France',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Grenada',
 'Guatemala',
 'Guinea',
 'Guyana',
 'Haiti',
 'Honduras',
 'Hong Kong SAR',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Israel',
 'Italy',
 'Ivory Coast',
 'Jamaica',
 'Japan',
 'Jordan',
 'Kazakhs

In [20]:
## retrieve all the countries from world bank API

### generate DF Countries

page = 1   ## there are 6 pages in the json file
data = []
url_countries = 'https://api.worldbank.org/v2/country?format=json'

for page in range(1,7):
    response = requests.get(f"{url_countries}&page={page}")
    json_data = response.json()
    
    # Add the data from the current page
    data.extend(json_data[1])  
    
## create DF and select columns

countries_wb_df = pd.DataFrame(data)
countries_wb_df['region'] = countries_wb_df['region'].apply(lambda x: x['value'])
countries_wb_df = countries_wb_df[['id', 'iso2Code', 'name', 'region','capitalCity', 'longitude', 'latitude']]
countries_wb_df = countries_wb_df.rename(columns={
                                                    'name':'country',
                                                    'id':'iso3Code'
                                                    })


regions = (countries_wb_df['region'] != "Aggregates")
countries_wb_df = countries_wb_df[regions]


countries_wb_df.head()

Unnamed: 0,iso3Code,iso2Code,country,region,capitalCity,longitude,latitude
0,ABW,AW,Aruba,Latin America & Caribbean,Oranjestad,-70.0167,12.5167
2,AFG,AF,Afghanistan,South Asia,Kabul,69.1761,34.5228
5,AGO,AO,Angola,Sub-Saharan Africa,Luanda,13.242,-8.81155
6,ALB,AL,Albania,Europe & Central Asia,Tirane,19.8172,41.3317
7,AND,AD,Andorra,Europe & Central Asia,Andorra la Vella,1.5218,42.5075


In [21]:
### Export to csv file in output

countries_wb_df.to_csv('Output/countries_list_UN_referential.csv')


# (3) Extract macro economic data
#### from world bank API
source : https://api.worldbank.org/v2/country/DZA/indicator/NY.GDP.PCAP.KD?date=2015:2024&format=json

In [22]:
# the API uses the iso3Code code of the country (example PAK for pakistan, AFG for Afghanistan)
# we need to identify the needed countries from from our immigration_df and retrieve these iso3Code from countries_df

countries_request = pd.merge(imm_clean_df, countries_wb_df, left_on='country', right_on='country')
countries_request

Unnamed: 0,year,quarter,month_str,total,month_int,country,iso3Code,iso2Code,region,capitalCity,longitude,latitude
0,2015,Q1,Feb,125.0,2,Afghanistan,AFG,AF,South Asia,Kabul,69.1761,34.5228
1,2015,Q1,Feb,25.0,2,Albania,ALB,AL,Europe & Central Asia,Tirane,19.8172,41.3317
2,2015,Q1,Feb,125.0,2,Algeria,DZA,DZ,Middle East & North Africa,Algiers,3.05097,36.7397
3,2015,Q1,Feb,5.0,2,Antigua and Barbuda,ATG,AG,Latin America & Caribbean,Saint John's,-61.8456,17.1175
4,2015,Q1,Feb,15.0,2,Argentina,ARG,AR,Latin America & Caribbean,Buenos Aires,-58.4173,-34.6118
...,...,...,...,...,...,...,...,...,...,...,...,...
15932,2024,Q4,Oct,0.0,10,United Arab Emirates,ARE,AE,Middle East & North Africa,Abu Dhabi,54.3705,24.4764
15933,2024,Q4,Oct,5.0,10,Uruguay,URY,UY,Latin America & Caribbean,Montevideo,-56.0675,-34.8941
15934,2024,Q4,Oct,15.0,10,Uzbekistan,UZB,UZ,Europe & Central Asia,Tashkent,69.269,41.3052
15935,2024,Q4,Oct,0.0,10,Zambia,ZMB,ZM,Sub-Saharan Africa,Lusaka,28.2937,-15.3982


In [23]:
# extrac the needed iso3code in a list to be used to iterate

country_list = countries_request['iso3Code'].unique().tolist()
country_list

['AFG',
 'ALB',
 'DZA',
 'ATG',
 'ARG',
 'ARM',
 'AUS',
 'AUT',
 'AZE',
 'BHR',
 'BGD',
 'BRB',
 'BLR',
 'BEL',
 'BLZ',
 'BEN',
 'BTN',
 'BOL',
 'BWA',
 'BRA',
 'BGR',
 'BDI',
 'KHM',
 'CMR',
 'CAF',
 'TCD',
 'CHL',
 'CHN',
 'COL',
 'COM',
 'CRI',
 'HRV',
 'CUB',
 'CYP',
 'DNK',
 'DJI',
 'DMA',
 'DOM',
 'ECU',
 'SLV',
 'ERI',
 'ETH',
 'FJI',
 'FIN',
 'FRA',
 'GEO',
 'DEU',
 'GHA',
 'GRC',
 'GRD',
 'GTM',
 'GIN',
 'GUY',
 'HTI',
 'HND',
 'HUN',
 'ISL',
 'IND',
 'IDN',
 'IRQ',
 'IRL',
 'ISR',
 'ITA',
 'JAM',
 'JPN',
 'JOR',
 'KAZ',
 'KEN',
 'XKX',
 'LVA',
 'LBN',
 'LBR',
 'LBY',
 'LTU',
 'MDG',
 'MYS',
 'MDV',
 'MLI',
 'MRT',
 'MUS',
 'MEX',
 'MDA',
 'MNG',
 'MNE',
 'MAR',
 'NAM',
 'NPL',
 'NLD',
 'NZL',
 'NIC',
 'NER',
 'NGA',
 'NOR',
 'PAK',
 'PAN',
 'PRY',
 'PER',
 'PHL',
 'POL',
 'PRT',
 'ROU',
 'RWA',
 'WSM',
 'SAU',
 'SEN',
 'SRB',
 'SLE',
 'SGP',
 'SVK',
 'SVN',
 'SOM',
 'ZAF',
 'ESP',
 'LKA',
 'LCA',
 'VCT',
 'SDN',
 'SWE',
 'CHE',
 'TJK',
 'TZA',
 'THA',
 'TGO',
 'TTO',
 'TUN',


In [24]:
import requests

In [25]:
# this line is just to check that we are connected to the APIT : O TEST before loop

link_pop = f'https://api.worldbank.org/v2/country/DZA/indicator/NY.GDP.PCAP.KD?date=2015:2024&format=json'
response_pop = requests.get(link_pop).json()

pprint(response_pop)

[{'lastupdated': '2024-12-16',
  'page': 1,
  'pages': 1,
  'per_page': 50,
  'sourceid': '2',
  'total': 9},
 [{'country': {'id': 'DZ', 'value': 'Algeria'},
   'countryiso3code': 'DZA',
   'date': '2023',
   'decimal': 1,
   'indicator': {'id': 'NY.GDP.PCAP.KD',
                 'value': 'GDP per capita (constant 2015 US$)'},
   'obs_status': '',
   'unit': '',
   'value': 4660.40545686886},
  {'country': {'id': 'DZ', 'value': 'Algeria'},
   'countryiso3code': 'DZA',
   'date': '2022',
   'decimal': 1,
   'indicator': {'id': 'NY.GDP.PCAP.KD',
                 'value': 'GDP per capita (constant 2015 US$)'},
   'obs_status': '',
   'unit': '',
   'value': 4544.46688059647},
  {'country': {'id': 'DZ', 'value': 'Algeria'},
   'countryiso3code': 'DZA',
   'date': '2021',
   'decimal': 1,
   'indicator': {'id': 'NY.GDP.PCAP.KD',
                 'value': 'GDP per capita (constant 2015 US$)'},
   'obs_status': '',
   'unit': '',
   'value': 4456.74687641147},
  {'country': {'id': 'DZ', 'valu

In [27]:
# this cell is used to check our code before to launch the loop 
# TO TEST before loop


link_pop = f'https://api.worldbank.org/v2/country/DZA/indicator/NY.GDP.PCAP.KD?date=2015:2024&format=json'
response_pop = requests.get(link_pop).json()

# pprint(response_pop)

pop_list = []
for item in response_pop[1]:  # Assuming response_pop[1] is a list of dictionaries
    pop_list.append({
        'iso3Code': item['countryiso3code'],
        'iso2Code': item['country']['id'],
        'Country': item['country']['value'],              
        'year': item['date'],
        'indicator': item['indicator']['value'],
        'value': item['value']
    })

pop_list

[{'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2023',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4660.40545686886},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2022',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4544.46688059647},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2021',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4456.74687641147},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2020',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4363.68533766912},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2019',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4672.66408682198},
 {'iso3Code': 'DZA',
  'iso2Code': 'DZ',
  'Country': 'Algeria',
  'year': '2018',
  'indicator': 'GDP per capita (constant 2015 US$)',
  'value': 4717.00358853556},
 {'i

In [36]:
# DEFINE THE VARIABLES : INDICATORS, TIME PERIOD

# create the list of indicators
            #  'NY.GDP.PCAP.KD' gdp per capita
            # 'SP.POP.TOTL' total population
            #  SL.UEM.ADVN.ZS : Unemployment with advanced education (% of total labor force with advanced education)
            # SL.UEM.TOTL.NE.ZS : Unemployment, total (% of total labor force) (national estimate)
            # SI.POV.DDAY  Poverty headcount ratio at $2.15 a day (2017 PPP) (% of population)


indicators = ['NY.GDP.PCAP.KD','SP.POP.TOTL' , 'SL.UEM.ADVN.ZS', 'SL.UEM.TOTL.NE.ZS', 'SI.POV.DDAY']

# define 1st and last year f the loop

year_min = 2015
year_max = 2024
years_list = [x for x in range(year_min, year_max +1)]   # COULD BE USED BUT NOT IN THE CURRENT CODE


In [37]:
# this code will loop through all the listed countries and the needed indicators to generate a list

print(f"start code")
print('---------------------------')

macro_eco_data_list = []

# the end point is defined by (i) country (II) indicator (III) year start and year end
        # => we need to create 1 end point of each country / indicator

for i in country_list :

    for j in indicators :

        print(f"--------------------------")
        print(f"country {i} indicator {j}")

        try:

            # define the end point by country (from list I) and indicator (from list J)

            link_pop = f'https://api.worldbank.org/v2/country/{i}/indicator/{j}?date={year_min}:{year_max}&format=json'
            response_pop = requests.get(link_pop).json()

            # loop through the response_pop (list) to extract the value for each year in the serie 

            for item in response_pop[1]:  
                macro_eco_data_list.append({
                    'iso3Code': item['countryiso3code'],
                    'iso2Code': item['country']['id'],
                    'Country': item['country']['value'],              
                    'year': item['date'],
                    'indicator': item['indicator']['value'],
                    'value': item['value']
                })
        except:   
            print(f"country {i} indicator {j}....... not found")

start code
---------------------------
--------------------------
country AFG indicator NY.GDP.PCAP.KD
--------------------------
country AFG indicator SP.POP.TOTL
--------------------------
country AFG indicator SL.UEM.ADVN.ZS
--------------------------
country AFG indicator SL.UEM.TOTL.NE.ZS
--------------------------
country AFG indicator SI.POV.DDAY
--------------------------
country ALB indicator NY.GDP.PCAP.KD
--------------------------
country ALB indicator SP.POP.TOTL
--------------------------
country ALB indicator SL.UEM.ADVN.ZS
--------------------------
country ALB indicator SL.UEM.TOTL.NE.ZS
--------------------------
country ALB indicator SI.POV.DDAY
--------------------------
country DZA indicator NY.GDP.PCAP.KD
--------------------------
country DZA indicator SP.POP.TOTL
--------------------------
country DZA indicator SL.UEM.ADVN.ZS
--------------------------
country DZA indicator SL.UEM.TOTL.NE.ZS
--------------------------
country DZA indicator SI.POV.DDAY
----------

In [38]:
## Create DF 1st indicator

macro_eco_data_df = pd.DataFrame(macro_eco_data_list)
macro_eco_data_df = macro_eco_data_df.dropna(how='all')

macro_eco_data_df.head()

Unnamed: 0,iso3Code,iso2Code,Country,year,indicator,value
0,AFG,AF,Afghanistan,2023,GDP per capita (constant 2015 US$),379.707497
1,AFG,AF,Afghanistan,2022,GDP per capita (constant 2015 US$),377.665627
2,AFG,AF,Afghanistan,2021,GDP per capita (constant 2015 US$),408.625855
3,AFG,AF,Afghanistan,2020,GDP per capita (constant 2015 US$),527.834554
4,AFG,AF,Afghanistan,2019,GDP per capita (constant 2015 US$),557.861533


In [42]:
print(macro_eco_data_df.shape)
print(macro_eco_data_df.dtypes)

(7515, 6)
iso3Code      object
iso2Code      object
Country       object
year          object
indicator     object
value        float64
dtype: object


In [43]:
macro_eco_data_df = macro_eco_data_df.astype({'year':'int64'})
macro_eco_data_df.dtypes

iso3Code      object
iso2Code      object
Country       object
year           int64
indicator     object
value        float64
dtype: object

In [44]:
# export to csv

macro_eco_data_df.to_csv('Output/macro_economic_data.csv')