In [1]:
## Dependencies

import pandas as pd
import pathlib as path

import requests
import json
from pprint import pprint

import numpy as np

from io import StringIO

# to read the ZIP file
import requests
import zipfile
import io



# (1) create DF for countries

import from csv file 

In [2]:

path = '../Resources_Output/countries_UN_referential.csv'
countries_wb_df = pd.read_csv(path)

countries_wb_df.head()

Unnamed: 0,iso3Code,iso2Code,country,region,capitalCity,longitude,latitude
0,ABW,AW,Aruba,Latin America & Caribbean,Oranjestad,-70.0167,12.5167
1,AFG,AF,Afghanistan,South Asia,Kabul,69.1761,34.5228
2,AGO,AO,Angola,Sub-Saharan Africa,Luanda,13.242,-8.81155
3,ALB,AL,Albania,Europe & Central Asia,Tirane,19.8172,41.3317
4,AND,AD,Andorra,Europe & Central Asia,Andorra la Vella,1.5218,42.5075


# (2) collect immigration data 

##### Data source from IRCC web site
this file is updated on a monthly basis by IRCC

https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-PR-Citz.csv <br>
from<br>
https://ouvert.canada.ca/data/fr/dataset/f7e5498e-0ad8-4417-85c9-9b8aff9b9eda/resource/5f5fa9ca-b621-4dac-91d1-25654a25735c

In [3]:
### import and read csv file from web page

path_imm = 'https://www.ircc.canada.ca/opendata-donneesouvertes/data/ODP-PR-Citz.csv'
imm_df = pd.read_csv(path_imm, delimiter='\t')

# imm_df.to_csv('../Output/immigration_raw_data.csv')
imm_df.head(5)

Unnamed: 0,EN_YEAR,EN_QUARTER,EN_MONTH,FR_ANNEÉ,FR_TRIMESTRE,FR_MOIS,EN_COUNTRY_OF_CITIZENSHIP,FR_PAYS_DE_CITOYENNETÉ,TOTAL
0,2015,Q1,Feb,2015,T1,fév.,Afghanistan,Afghanistan,125
1,2015,Q1,Feb,2015,T1,fév.,Albania,Albanie,25
2,2015,Q1,Feb,2015,T1,fév.,Algeria,Algérie,125
3,2015,Q1,Feb,2015,T1,fév.,Antigua and Barbuda,Antigua-et-Barbuda,5
4,2015,Q1,Feb,2015,T1,fév.,Argentina,Argentine,15


In [4]:
# drop useless columns

imm_df = imm_df.drop(columns=['FR_ANNEÉ','FR_TRIMESTRE','FR_MOIS','FR_PAYS_DE_CITOYENNETÉ'])
imm_df.head(5)

Unnamed: 0,EN_YEAR,EN_QUARTER,EN_MONTH,EN_COUNTRY_OF_CITIZENSHIP,TOTAL
0,2015,Q1,Feb,Afghanistan,125
1,2015,Q1,Feb,Albania,25
2,2015,Q1,Feb,Algeria,125
3,2015,Q1,Feb,Antigua and Barbuda,5
4,2015,Q1,Feb,Argentina,15


In [5]:
# check empty values with 1 example raw 42

print(imm_df.loc[42,])

EN_YEAR                          2015
EN_QUARTER                         Q1
EN_MONTH                          Feb
EN_COUNTRY_OF_CITIZENSHIP    Dominica
TOTAL                              --
Name: 42, dtype: object


In [6]:
imm_clean_df = imm_df.copy()

In [7]:
# convert Total to numeric

imm_clean_df['TOTAL'] = pd.to_numeric(imm_clean_df['TOTAL'], errors='coerce')

# replace NAN with 0 and change data type

imm_clean_df = imm_clean_df.fillna({'TOTAL': 0})
imm_clean_df = imm_clean_df.astype({'TOTAL': 'int64'})
print(imm_clean_df.loc[42,])

EN_YEAR                          2015
EN_QUARTER                         Q1
EN_MONTH                          Feb
EN_COUNTRY_OF_CITIZENSHIP    Dominica
TOTAL                               0
Name: 42, dtype: object


In [8]:
# check that total is numeric _ integer

imm_clean_df.dtypes

EN_YEAR                       int64
EN_QUARTER                   object
EN_MONTH                     object
EN_COUNTRY_OF_CITIZENSHIP    object
TOTAL                         int64
dtype: object

In [9]:
# rename columns 

imm_clean_df = imm_clean_df.rename(columns={
                                            'EN_YEAR'            :           'year' ,
                                            'EN_QUARTER'         :          'quarter' , 
                                            'EN_MONTH'           :          'month_str' ,
                                            'EN_COUNTRY_OF_CITIZENSHIP' :   'country' ,  
                                            'TOTAL'              :           'immigration_flow'
                                            })
imm_clean_df.dtypes

year                 int64
quarter             object
month_str           object
country             object
immigration_flow     int64
dtype: object

#### 2.1 the steps below are needed to match the name of the countries between this DF and countries_df

example : IRCC name is 'Bahama Islands, The'  
for world bank :   'Bahamas, The' ,
because we will use country as column to merge, we need to harmonize the countries names

1st we identify the countries names that mismatch
then we change them
the preparation of the mapping is done manually

In [10]:
# to identify the countries names that mismatch between IRCC and World bank

countries_wb = pd.DataFrame(countries_wb_df['country'].unique() , columns=['country'])
countries_ircc = pd.DataFrame(imm_clean_df['country'].unique(),  columns=['country'])

# Add a source column to indicate the origin
countries_wb['source'] = 'countries_wb'
countries_ircc['source'] = 'countries_ircc'

# Concatenate the two DataFrames
all_countries = pd.concat([countries_wb, countries_ircc], ignore_index=True)

# Find non-corresponding values
non_corresponding_values = all_countries.groupby('country').filter(lambda x: len(x) == 1)

# Display the non-corresponding values with their origin
print(non_corresponding_values)
# non_corresponding_values.to_csv('Output/non_corresponding_countries.csv')

            country          source
0             Aruba    countries_wb
8    American Samoa    countries_wb
15            Benin    countries_wb
16     Burkina Faso    countries_wb
20     Bahamas, The    countries_wb
..              ...             ...
423        Holy See  countries_ircc
424          Azores  countries_ircc
427      Guadeloupe  countries_ircc
431         Reunion  countries_ircc
432    Sint-Maarten  countries_ircc

[158 rows x 2 columns]


In [12]:
# Change countries name to match to th United Nations standard

imm_clean_df['country'] = imm_clean_df['country'].replace({
                            
                                    'Azores'  :  'Canada' ,
                                    'Bahama Islands, The'  :  'Bahamas, The' ,
                                    'Benin, Republic of'  :  'Benin' ,
                                    'Bosnia-Herzegovina'  :  'Bosnia and Herzegovina' ,
                                    'Botswana, Republic of'  :  'Botswana' ,
                                    'Brunei'  :  'Brunei Darussalam' ,
                                    'Burkina-Faso'  :  'Burkina Faso' ,
                                    'Cameroon, Federal Republic of'  :  'Cameroon' ,
                                    'Cape Verde Islands'  :  'Cabo Verde' ,
                                    'Chad, Republic of'  :  'Chad' ,
                                    "China, People's Republic of"  :  'China' ,
                                    'Congo, Democratic Republic of the'  :  'Congo, Dem. Rep.' ,
                                    "Congo, People's Republic of the"  :  'Congo, Rep.' ,
                                    'Country not stated'  :  'Canada' ,
                                    'Czech Republic'  :  'Czechia' ,
                                    'Djibouti, Republic of'  :  'Djibouti' ,
                                    'East Timor, Democratic Republic of'  :  'Timor-Leste' ,
                                    'Egypt'  :  'Egypt, Arab Rep.' ,
                                    'Equatorial Guinea, Republic of'  :  'Equatorial Guinea' ,
                                    'Gabon Republic'  :  'Gabon' ,
                                    'Gambia'  :  'Gambia, The' ,
                                    'Guadeloupe'  :  'France' ,
                                    'Guinea, Republic of'  :  'Guinea' ,
                                    'Holy See'  :  'Canada' ,
                                    'Hong Kong SAR'  :  'Hong Kong SAR, China' ,
                                    'Indonesia, Republic of'  :  'Indonesia' ,
                                    'Iran'  :  'Iran, Islamic Rep.' ,
                                    'Ireland, Republic of'  :  'Ireland' ,
                                    'Ivory Coast, Republic of'  :  "Cote d'Ivoire" ,
                                    "Korea, People's Democratic Republic of"  :  "Korea, Dem. People's Rep." ,
                                    'Korea, Republic of'  :  'Korea, Rep.' ,
                                    'Kosovo, Republic of'  :  'Kosovo' ,
                                    'Kyrgyzstan'  :  'Kyrgyz Republic' ,
                                    'Laos'  :  'Lao PDR' ,
                                    'Macau SAR'  :  'Macao SAR, China' ,
                                    'Macedonia'  :  'North Macedonia' ,
                                    'Maldives, Republic of'  :  'Maldives' ,
                                    'Mali, Republic of'  :  'Mali' ,
                                    'Marshall Islands, Republic of the'  :  'Marshall Islands' ,
                                    'Micronesia, Federated States of'  :  'Canada' ,
                                    "Mongolia, People's Republic of"  :  'Mongolia' ,
                                    'Montenegro, Republic of'  :  'Montenegro' ,
                                    'Myanmar (Burma)'  :  'Myanmar' ,
                                    'Netherlands Antilles, The'  :  'Canada' ,
                                    'Netherlands, The'  :  'Netherlands' ,
                                    'Nevis'  :  'Canada' ,
                                    'Niger, Republic of the'  :  'Niger' ,
                                    'Northern Mariana Islands, Commonwealth of the'  :  'Northern Mariana Islands' ,
                                    'Other'  :  'Canada' ,
                                    'Palau, Republic of'  :  'Palau' ,
                                    'Palestinian Authority (Gaza/West Bank)'  :  'West Bank and Gaza' ,
                                    'Panama, Republic of'  :  'Panama' ,
                                    'Reunion'  :  'France' ,
                                    'Russia'  :  'Russian Federation' ,
                                    'Samoa, American'  :  'Samoa' ,
                                    'Samoa, Independent State of'  :  'American Samoa' ,
                                    'Serbia, Republic of'  :  'Serbia' ,
                                    'Sint-Maarten'  :  'Sint Maarten (Dutch part)' ,
                                    'Somalia, Democratic Republic of'  :  'Somalia' ,
                                    'South Africa, Republic of'  :  'South Africa' ,
                                    'South Sudan, Republic of'  :  'South Sudan' ,
                                    'St. Kitts-Nevis'  :  'Canada' ,
                                    'Stateless'  :  'Canada' ,
                                    'Sudan, Democratic Republic of'  :  'Sudan' ,
                                    'Surinam'  :  'Suriname' ,
                                    'Swaziland'  :  'Somalia' ,
                                    'Syria'  :  'Syrian Arab Republic' ,
                                    'Taiwan'  :  'China' ,
                                    'Tanzania, United Republic of'  :  'Tanzania' ,
                                    'Togo, Republic of'  :  'Togo' ,
                                    'Trinidad and Tobago, Republic of'  :  'Trinidad and Tobago' ,
                                    'Turkey'  :  'Turkiye' ,
                                    'United Kingdom and Overseas Territories'  :  'United Kingdom' ,
                                    'United States of America'  :  'United States' ,
                                    'Venezuela'  :  'Venezuela, RB' ,
                                    'Vietnam'  :  'Viet Nam' ,
                                    'Virgin Islands, British'  :  'British Virgin Islands' ,
                                    'Western Sahara'  :  'Canada' ,
                                    'Yemen'  :  'Yemen, Rep.' ,


                                     })



In [13]:
# chech if the modification is effiscinet : we should have only countries_wb source

countries_wb = pd.DataFrame(countries_wb_df['country'].unique() , columns=['country'])
countries_ircc = pd.DataFrame(imm_clean_df['country'].unique(),  columns=['country'])

# Add a source column to indicate the origin
countries_wb['source'] = 'countries_wb'
countries_ircc['source'] = 'countries_ircc'

# Concatenate the two DataFrames
all_countries = pd.concat([countries_wb, countries_ircc], ignore_index=True)

# Find non-corresponding values
non_corresponding_values = all_countries.groupby('country').filter(lambda x: len(x) == 1)

# Display the non-corresponding values with their origin
print(non_corresponding_values)  ## you must have only countries_wb source


                      country        source
0                       Aruba  countries_wb
34            Channel Islands  countries_wb
47             Cayman Islands  countries_wb
65              Faroe Islands  countries_wb
66      Micronesia, Fed. Sts.  countries_wb
71                  Gibraltar  countries_wb
78                  Greenland  countries_wb
80                       Guam  countries_wb
88                Isle of Man  countries_wb
104       St. Kitts and Nevis  countries_wb
119  St. Martin (French part)  countries_wb
183                  Eswatini  countries_wb
208     Virgin Islands (U.S.)  countries_wb


#### 2.2 other cleaning steps

In [14]:
# add months with numerical values
 ## check that there is no strange value 

months_str = imm_clean_df['month_str'].unique()
months_str

array(['Feb', 'Jan', 'Mar', 'Apr', 'Jun', 'May', 'Aug', 'Jul', 'Sep',
       'Dec', 'Nov', 'Oct'], dtype=object)

In [15]:
# use of Mapping to add integers to month abbreviations

month_mapping = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Map month abbreviations to integers using the dictionary
imm_clean_df['month_int'] = imm_clean_df['month_str'].map(month_mapping)

imm_clean_df.head(5)

Unnamed: 0,year,quarter,month_str,country,immigration_flow,month_int
0,2015,Q1,Feb,Afghanistan,125,2
1,2015,Q1,Feb,Albania,25,2
2,2015,Q1,Feb,Algeria,125,2
3,2015,Q1,Feb,Antigua and Barbuda,5,2
4,2015,Q1,Feb,Argentina,15,2


In [16]:
# organize columns

imm_clean_df = imm_clean_df[['country', 'year', 'month_str', 
                            'month_int','quarter', 'immigration_flow']]
imm_clean_df.head(5)

Unnamed: 0,country,year,month_str,month_int,quarter,immigration_flow
0,Afghanistan,2015,Feb,2,Q1,125
1,Albania,2015,Feb,2,Q1,25
2,Algeria,2015,Feb,2,Q1,125
3,Antigua and Barbuda,2015,Feb,2,Q1,5
4,Argentina,2015,Feb,2,Q1,15


In [17]:
# this step is usefull to set a key during the DB building

imm_new_df = imm_clean_df.reset_index()
imm_new_df.head(5)

Unnamed: 0,index,country,year,month_str,month_int,quarter,immigration_flow
0,0,Afghanistan,2015,Feb,2,Q1,125
1,1,Albania,2015,Feb,2,Q1,25
2,2,Algeria,2015,Feb,2,Q1,125
3,3,Antigua and Barbuda,2015,Feb,2,Q1,5
4,4,Argentina,2015,Feb,2,Q1,15


In [18]:
# export to CSV

imm_new_df.to_csv('../Resources_Output/immigrants_by_country_monthly.csv', index=False)