In [1]:
import pandas as pd
import pathlib as path

import requests
import json
from pprint import pprint

import numpy as np
from scipy.stats import linregress

import matplotlib.pyplot as plt
import hvplot.pandas
import geopandas as gpd

# IMPORT FILES

In [61]:
# Load the monthly immigration-by-country table.
path = '../Resources_Output/immigrants_by_country_monthly.csv'
imm_df = pd.read_csv(path)

# Normalise a legacy 'total' column to the lowercase name 'immigration_flow',
# which is the spelling every later cell reads. This is a no-op when the file
# already carries 'immigration_flow'.
imm_df = imm_df.rename(columns={'total': 'immigration_flow'})

# Preview the first rows.
imm_df.head(5)

Unnamed: 0,index,country,year,month_str,month_int,quarter,immigration_flow
0,0,Afghanistan,2015,Feb,2,Q1,125
1,1,Albania,2015,Feb,2,Q1,25
2,2,Algeria,2015,Feb,2,Q1,125
3,3,Antigua and Barbuda,2015,Feb,2,Q1,5
4,4,Argentina,2015,Feb,2,Q1,15


In [64]:
# Load the country referential (ISO codes, region, continent, capital, coordinates).
path = '../Resources_Output/countries_UN_referential.csv'
countries_df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows (the default for head()).
countries_df.head()

Unnamed: 0,country,iso3Code,iso2Code,region,continent,capitalCity,longitude,latitude
0,Aruba,ABW,AW,Latin America & Caribbean,North America,Oranjestad,-70.0167,12.5167
1,Afghanistan,AFG,AF,South Asia,Asia,Kabul,69.1761,34.5228
2,Angola,AGO,AO,Sub-Saharan Africa,Africa,Luanda,13.242,-8.81155
3,Albania,ALB,AL,Europe & Central Asia,Europe,Tirane,19.8172,41.3317
4,Andorra,AND,AD,Europe & Central Asia,Europe,Andorra la Vella,1.5218,42.5075


In [63]:
# Load the macro-economic indicators in long format
# (one row per country / indicator / year).
path = '../Resources_Output/macro_economic_data_long_filtred.csv'
macro_data_df = pd.read_csv(filepath_or_buffer=path)

# Missing values are intentionally left in place at load time.

# Preview the first five rows (the default for head()).
macro_data_df.head()

Unnamed: 0.1,Unnamed: 0,index,country,indicator,Indicator Code,years,Value
0,0,22357720,Afghanistan,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,2016,28.8
1,1,22357723,Afghanistan,Access to electricity (% of population),EG.ELC.ACCS.ZS,2016,97.7
2,2,22357779,Afghanistan,Agricultural irrigated land (% of total agricu...,AG.LND.IRIG.AG.ZS,2016,6.48114
3,3,22357780,Afghanistan,Agricultural land (% of land area),AG.LND.AGRI.ZS,2016,58.123668
4,4,22357781,Afghanistan,Agricultural land (sq. km),AG.LND.AGRI.K2,2016,379100.0


# immigration flow

In [30]:
# Exclusive upper bound: only rows with year strictly below this value are kept.
year_max = 2025
imm_filtred = imm_df[imm_df['year'] < year_max]

# Report the year range that remains after filtering.
# Double quotes outside / single quotes inside: reusing the same quote character
# inside an f-string replacement field is a SyntaxError before Python 3.12.
print(f"year min : {imm_filtred['year'].min()}")
print(f"year max : {imm_filtred['year'].max()}")

year min : 2015
year max : 2024


### Yearly flow by country for period

In [31]:
# Sum the monthly flows into one total per (country, year) pair and
# turn the group keys back into regular columns.
immigration_country_year = (
    imm_filtred
    .groupby(['country', 'year'])['immigration_flow']
    .sum()
    .reset_index()
)

immigration_country_year

Unnamed: 0,country,year,immigration_flow
0,Afghanistan,2015,2625
1,Afghanistan,2016,2655
2,Afghanistan,2017,3460
3,Afghanistan,2018,3560
4,Afghanistan,2019,3890
...,...,...,...
1853,Zimbabwe,2020,335
1854,Zimbabwe,2021,505
1855,Zimbabwe,2022,615
1856,Zimbabwe,2023,555


In [32]:
# Write the per-country yearly totals to CSV.
# With the default settings the DataFrame index is written as the first column.
immigration_country_year.to_csv(path_or_buf='../Outputs/immigration_canada_country_year.csv')

### Cumulative flow 2015-2024

In [34]:
# Cumulative flow per country over the whole filtered period
# (all years summed together), with 'country' restored as a column.
immigration_country = (
    imm_filtred
    .groupby(['country'])['immigration_flow']
    .sum()
    .reset_index()
)

immigration_country

Unnamed: 0,country,immigration_flow
0,Afghanistan,82620
1,Albania,5615
2,Algeria,40685
3,American Samoa,0
4,Andorra,0
...,...,...
199,Viet Nam,41975
200,West Bank and Gaza,6025
201,"Yemen, Rep.",6030
202,Zambia,620


# Full indicators data
Indicators are aggregated by their mean.

### macro_data average
Calculate the average value of each indicator over the period.

In [45]:
# Indicator codes to keep from the macro-economic table.
# SP.POP.TOTL is the code of the "Population, total" indicator.
indicators_list = [
    'SP.POP.TOTL',
]

In [47]:
# Print the frame shape before and after cleaning to show how many rows are removed.
print(f'size before{macro_data_df.shape}')

# 1. keep only the rows whose indicator code is in indicators_list
# 2. turn empty strings into NaN
# 3. drop every row that has a missing value in any column
macro_data_df_clean = (
    macro_data_df
    .loc[lambda frame: frame['Indicator Code'].isin(indicators_list)]
    .replace('', np.nan)
    .dropna(how="any")
)

print(f'size after{macro_data_df_clean.shape}')

size before(215556, 7)
size after(1736, 7)


In [49]:
# Preview the first five rows of the cleaned indicator table.
macro_data_df_clean.head(n=5)

Unnamed: 0.1,Unnamed: 0,index,country,indicator,Indicator Code,years,Value
119,119,22358807,Afghanistan,"Population, total",SP.POP.TOTL,2016,34700612.0
270,270,22360303,Albania,"Population, total",SP.POP.TOTL,2016,2876101.0
423,423,22361799,Algeria,"Population, total",SP.POP.TOTL,2016,40850721.0
510,510,22363295,American Samoa,"Population, total",SP.POP.TOTL,2016,52245.0
611,611,22364791,Andorra,"Population, total",SP.POP.TOTL,2016,72181.0


In [53]:
# Average each indicator per country across all remaining years,
# then restore 'country' and 'indicator' as regular columns.
macro_data_avg = (
    macro_data_df_clean
    .groupby(['country', 'indicator'])['Value']
    .mean()
    .reset_index()
)
macro_data_avg

Unnamed: 0,country,indicator,Value
0,Afghanistan,"Population, total",3.826146e+07
1,Albania,"Population, total",2.830413e+06
2,Algeria,"Population, total",4.359805e+07
3,American Samoa,"Population, total",4.997462e+04
4,Andorra,"Population, total",7.673562e+04
...,...,...,...
212,Virgin Islands (U.S.),"Population, total",1.063696e+05
213,West Bank and Gaza,"Population, total",4.751461e+06
214,"Yemen, Rep.","Population, total",3.566066e+07
215,Zambia,"Population, total",1.879788e+07


### pivot the table to have the indicators as columns

In [56]:
# Reshape from long to wide: one row per country, one column per indicator,
# with 'country' kept as a regular column rather than as the index.
macro_data_avg_pivot = (
    macro_data_avg
    .pivot(index=['country'], columns='indicator', values='Value')
    .reset_index()
)
macro_data_avg_pivot

indicator,country,"Population, total"
0,Afghanistan,3.826146e+07
1,Albania,2.830413e+06
2,Algeria,4.359805e+07
3,American Samoa,4.997462e+04
4,Andorra,7.673562e+04
...,...,...
212,Virgin Islands (U.S.),1.063696e+05
213,West Bank and Gaza,4.751461e+06
214,"Yemen, Rep.",3.566066e+07
215,Zambia,1.879788e+07


### Add Immigration flow

In [66]:
# Join the cumulative flow with the averaged indicators on the country name.
# This is an inner join (the pandas default): only countries present in both
# tables are kept.
imm_data = immigration_country.merge(macro_data_avg_pivot, how='inner', on='country')

imm_data

Unnamed: 0,country,immigration_flow,"Population, total"
0,Afghanistan,82620,3.826146e+07
1,Albania,5615,2.830413e+06
2,Algeria,40685,4.359805e+07
3,American Samoa,0,4.997462e+04
4,Andorra,0,7.673562e+04
...,...,...,...
199,Viet Nam,41975,9.745442e+07
200,West Bank and Gaza,6025,4.751461e+06
201,"Yemen, Rep.",6030,3.566066e+07
202,Zambia,620,1.879788e+07


### Calculate a ratio of immigrants per 100K of the population
This makes the flows comparable between countries.

In [67]:
# Immigrants per 100,000 inhabitants: flow divided by the population column,
# then scaled by 100000.
imm_data['immigration_100k'] = (
    imm_data['immigration_flow']
    .div(imm_data['Population, total'])
    .mul(100000)
)

imm_data

Unnamed: 0,country,immigration_flow,"Population, total",immigration_100k
0,Afghanistan,82620,3.826146e+07,215.935289
1,Albania,5615,2.830413e+06,198.380969
2,Algeria,40685,4.359805e+07,93.318394
3,American Samoa,0,4.997462e+04,0.000000
4,Andorra,0,7.673562e+04,0.000000
...,...,...,...,...
199,Viet Nam,41975,9.745442e+07,43.071419
200,West Bank and Gaza,6025,4.751461e+06,126.803093
201,"Yemen, Rep.",6030,3.566066e+07,16.909389
202,Zambia,620,1.879788e+07,3.298244


In [70]:
# Attach the country reference attributes with an inner join on the country name
# (the pandas default), then keep only the columns needed for the export, in
# their final order.
imm_data_complete = (
    imm_data
    .merge(countries_df, on='country')
    .loc[:, ['country', 'iso3Code', 'region', 'continent', 'immigration_flow', 'immigration_100k']]
)

imm_data_complete.head(5)

Unnamed: 0,country,iso3Code,region,continent,immigration_flow,immigration_100k
0,Afghanistan,AFG,South Asia,Asia,82620,215.935289
1,Albania,ALB,Europe & Central Asia,Europe,5615,198.380969
2,Algeria,DZA,Middle East & North Africa,Africa,40685,93.318394
3,American Samoa,ASM,East Asia & Pacific,Asia,0,0.0
4,Andorra,AND,Europe & Central Asia,Europe,0,0.0


In [72]:
# Export the per-country summary to CSV.
# NOTE(review): the DataFrame index is written as the first column (to_csv default),
# which shows up as an "Unnamed: 0" column when the file is read back; confirm
# whether downstream readers rely on it before switching to index=False.
path = '../Resources_Output/immigration_data_for_period.csv'
imm_data_complete.to_csv(path)

# Report the year range covered by the exported totals.
# Double quotes outside / single quotes inside: reusing the same quote character
# inside an f-string replacement field is a SyntaxError before Python 3.12.
print(f"year min : {imm_filtred['year'].min()}")
print(f"year max : {imm_filtred['year'].max()}")

year min : 2015
year max : 2024
