In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

In [2]:
# Input files:
#   - migrant stock figures by origin and destination
#   - catalog of countries (used later to look up each country's region)
org_dest_csv = "Resources/UN_MigrantStockByOriginAndDestination_2017.csv"
country_cat_csv = "Resources/Region_country_catalog.csv"

# Load both files into Pandas data frames (migration data first, then the catalog)
org_dest_df, country_cat_df = map(pd.read_csv, (org_dest_csv, country_cat_csv))

In [6]:
# Cells with NO data are encoded as ".." in the source file; substitute 0 for them
org_dest_df = org_dest_df.replace(to_replace='..', value=0)

# Inner-join the country catalog with the migration data on "Country":
#   - rows whose "Country" value is not in the catalog are dropped
#   - each remaining (destination) row gains its "Region"
new_country_data = country_cat_df[["Country", "Region"]].merge(
    org_dest_df, how="inner", on="Country"
)

In [14]:
# Reshape the wide table (one column per origin country) into long form:
# one row per destination country / year / origin country.

# Identifier columns are kept as they are; every other column is treated as an origin
# country. Selecting by name rather than by position keeps the melt correct even if the
# column order of the merged frame changes, and fails loudly if an id column is missing.
id_cols = ["Country", "Region", "Year"]
lst_ctry = new_country_data.columns.drop(id_cols)

# melt columns into rows
origin_data_df = pd.melt(new_country_data, id_vars=id_cols,
                         value_vars=lst_ctry, var_name='Origin', value_name='Tot Immigrants')

# Remove thousands separators so the values can be parsed as integers
origin_data_df['Tot Immigrants'] = origin_data_df['Tot Immigrants'].replace(',', '', regex=True)
# Convert to integers so they can be aggregated. A single vectorized astype replaces the
# element-wise DataFrame.applymap (deprecated in recent pandas); 64-bit integers leave
# ample headroom for the summed totals.
origin_data_df = origin_data_df.astype({'Year': 'int64', 'Tot Immigrants': 'int64'})

# Rename columns to make more sense and help with next merge: the id columns describe the
# destination, and the melted origin names become the "Country" join key.
origin_data_df = origin_data_df.rename(
    columns={"Country": "Destination", "Region": "DRegion", "Origin": "Country"}
)

# merge with countries' catalog to get "Region", "Dev Level" and "Income Range" for countries of origin
origin_data_df = pd.merge(country_cat_df, origin_data_df, on="Country")
# rename the columns once we have added the region for the origination country
# NOTE(review): positional assignment assumes the catalog has exactly four columns in the
# order country, region, dev level, income range -- confirm against the catalog file.
origin_data_df.columns = ["Origin", "ORegion", "Dev Level", "Income Range",
                          "Destination", "DRegion", "Year", "Tot Immigrants"]

# save the names of the top 25 countries of origination for filtering later on
# (totals are summed over every year present in the data)
top25_ctrys_lst = (
    origin_data_df.groupby("Origin")
    .agg({'Tot Immigrants': 'sum'})
    .sort_values('Tot Immigrants', ascending=False)
    .head(25)
    .index
)

Unnamed: 0,Origin,Tot Immigrants
0,India,55416860
1,Mexico,48741010
2,Russian Federation,41589224
3,China,35527020
4,Bangladesh,27254648
5,Ukraine,22810405
6,Pakistan,20804187
7,Philippines,19509515
8,Afghanistan,18497304
9,United Kingdom,18249798


In [10]:
# Pivot the years to be columns to help with plotting.
#
# top25_ctrys_lst only holds the names of the top 25 origin countries, so the yearly
# totals are computed from origin_data_df restricted to those countries. The result is
# stored under its own name so the list of names stays intact and this cell can be
# re-run safely.
top25_by_year_df = (
    origin_data_df.loc[origin_data_df["Origin"].isin(top25_ctrys_lst)]
    # total immigrants per origin country (with its attributes) and year
    .groupby(["Origin", "Dev Level", "Income Range", "Year"])["Tot Immigrants"]
    .sum()
    # one column per year; unstacking a single Series gives flat column labels,
    # so there is no extra column level to drop afterwards
    .unstack("Year")
    # Reset the index so country of origin (and its attributes) become regular columns
    .reset_index()
)

In [35]:
# Write the dataframe to a new CSV file (no index column, header row included).
# NOTE(review): whereto_20_ctrys_df is not defined in any of the visible cells, so this
# export depends on state created elsewhere -- confirm which frame should be written here.
# Earlier export variants, kept for reference:
# top_20_ctrys_df.to_csv("Output/Clean_Top20_migrant.csv", index=False, header=True)
# whereto_20_ctrys_df.to_csv("Output/whereto_20_ctrys.csv", index=False, header=True)
whereto_20_ctrys_df.to_csv("Output/top20_ctrys.csv", index=False, header=True)

In [None]:
# # Top 20 origination countries (highest total number of migrants)
# top20_org_df = origin_data_df.groupby(["Origin"]).agg({'Tot Immigrants': 'sum'}).\
#      sort_values('Tot Immigrants', ascending=False).head(20).reset_index()

# # Top 20 countries receiving people
# top20_dest_df = origin_data_df.groupby("Destination").agg({'Tot Immigrants': 'sum'}).\
#    sort_values('Tot Immigrants', ascending=False).head(20)
