In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np

In [2]:
# UN migrant-stock table (one row per destination/year, one column per origin):
# keep the path in a variable and load it straight away
org_dest_csv = "Resources/UN_MigrantStockByOriginAndDestination_2017.csv"
org_dest_df = pd.read_csv(org_dest_csv)

# Catalog of countries (used later to filter out non-country rows and to
# attach region information): same pattern, path first, then load
country_cat_csv = "Resources/Region_country_catalog.csv"
country_cat_df = pd.read_csv(country_cat_csv)

In [3]:
# Cells without data hold the placeholder "..": swap it for 0 so the values
# can be converted to numbers further down
org_dest_df = org_dest_df.replace(to_replace='..', value=0)

# Inner join against the catalog on "Country": rows whose "Country" value is
# not in the catalog are discarded, and each remaining (destination) row
# gains its "Region"
new_country_data = country_cat_df[["Country", "Region"]].merge(org_dest_df, on="Country")

In [4]:
# Reshape the wide table (one column per origin country) into a long table with
# one row per (destination, year, origin), then attach the origin's catalog data.

# Every column after the first three is melted into rows.
# NOTE(review): assumes the first three columns of new_country_data are
# "Country", "Region" and "Year" -- confirm against the CSV layout.
lst_ctry = new_country_data.columns[3:]

# Melt the origin columns into rows, keeping "Country", "Region" and "Year" as identifiers
origin_data_df = pd.melt(
    new_country_data,
    id_vars=["Country", "Region", "Year"],
    value_vars=lst_ctry,
    var_name='Origin',
    value_name='Tot Immigrants',
)

# Strip the commas out of the counts so they can be converted
# (plain assignment instead of an inplace replace keeps the step explicit)
origin_data_df['Tot Immigrants'] = origin_data_df['Tot Immigrants'].replace(',', '', regex=True)

# Convert to 32-bit integers so the counts can be aggregated.
# A vectorised astype replaces the element-wise applymap, which is deprecated
# in recent pandas releases and runs a Python call per cell.
origin_data_df = origin_data_df.astype({'Year': np.int32, 'Tot Immigrants': np.int32})

# Rename positionally: the melted "Country" is really the destination, and the
# melted "Origin" becomes "Country" so it can serve as the key of the next merge
origin_data_df.columns = ["Destination", "DRegion", "Year", "Country", "Tot Immigrants"]

# Merge with the countries' catalog to bring in the catalog columns for the countries of origin
origin_data_df = pd.merge(country_cat_df, origin_data_df, on="Country")

# Final positional rename now that the origin's catalog columns are in place.
# NOTE(review): this assumes the catalog has exactly four columns, in the order
# country, region, development level, income range -- confirm against the catalog file.
origin_data_df.columns = ["Origin", "ORegion", "Dev Level", "Income Range",
                          "Destination", "DRegion", "Year", "Tot Immigrants"]

In [5]:
# Total migrants sent by each country of origin, largest total first
origination_df = (
    origin_data_df
    .groupby("Origin")[["Tot Immigrants"]]
    .sum()
    .sort_values(by="Tot Immigrants", ascending=False)
    .reset_index()
)

# Total migrants received by each destination country, largest total first
destination_df = (
    origin_data_df
    .groupby("Destination")[["Tot Immigrants"]]
    .sum()
    .sort_values(by="Tot Immigrants", ascending=False)
    .reset_index()
)

In [6]:
# Turn the long table into one row per origin country with a column per year
# (handy for plotting). This overwrites origin_data_df with the pivoted frame.
origin_data_df = (
    origin_data_df
    .groupby(["Origin", "Dev Level", "Income Range", "Year"])
    .agg({'Tot Immigrants': 'sum'})
    .sort_values(by='Tot Immigrants', ascending=False)
    .unstack()
    # unstack leaves a two-level column index; drop the outer level so only
    # the years remain as column labels
    .droplevel(0, axis=1)
    # move the grouping keys out of the index and back into regular columns
    # so they are kept when the data is saved
    .reset_index()
)

In [7]:
# GDP clean-up, step 1: load the per-country GDP indicators
gdp_data_csv = "Resources/CountryGDP.csv"
gdp_data_df = pd.read_csv(gdp_data_csv)

# Inner join against the catalog's "Country" column so that only rows for
# catalogued countries survive
gdp_total_df = country_cat_df[["Country"]].merge(gdp_data_df, on="Country")

In [8]:
# GDP clean-up, step 2: drop incomplete countries, rescale amounts, merge with migration data.
# NOTE: this cell overwrites gdp_total_df and rescales it in place, so re-running
# it without first re-running the cell that builds gdp_total_df divides the amounts again.

# Year columns that must all be populated for a country to be kept
gdp_year_cols = ["2005", "2010", "2015", "2017"]

# Countries with at least one row that is missing a value in any of those years
ctrs_to_drop = gdp_total_df.loc[gdp_total_df[gdp_year_cols].isna().any(axis=1), "Country"].unique()

# Delete every row of those countries. The explicit copy makes the result an
# independent frame, so the in-place rescaling below does not write into a
# slice of the previous frame (avoids pandas' SettingWithCopyWarning).
gdp_total_df = gdp_total_df.loc[~gdp_total_df["Country"].isin(ctrs_to_drop)].copy()

# Indicators whose name does not contain '%' hold amounts: express them in billions
gdp_total_df.loc[~gdp_total_df["Indicator"].str.contains('%'), gdp_year_cols] /= 1000000000

# Merge migration data with GDP data (inner join on the country name), dropping the duplicated key
all_info_df = pd.merge(origin_data_df, gdp_total_df, left_on="Origin", right_on="Country").drop("Country", axis=1)

In [9]:
# Determine the top 30 countries of origination (origination_df is already
# sorted from highest to lowest migrant total). Since some countries did NOT
# have indicator data, fewer than 30 may remain after filtering.
top30_ctrys_lst = origination_df["Origin"].head(30)

# Keep only the rows of those countries.
# Possible indicators: Personal remittances, received (% of GDP), Personal remittances, received (current US$),
# GDP growth (annual %), GDP (current US$)
all_info_df = all_info_df.loc[all_info_df.Origin.isin(top30_ctrys_lst)]

# Order the rows by the country's migrant-total rank. The row order inherited
# from the earlier pivot and merge is not guaranteed to follow migrant totals,
# yet the following cells take the first rows (head(80)) as the "top" countries.
# A stable sort keeps each country's indicator rows in their existing order.
origin_rank = {ctry: pos for pos, ctry in enumerate(top30_ctrys_lst)}
all_info_df = (
    all_info_df
    .assign(_origin_rank=all_info_df["Origin"].map(origin_rank))
    .sort_values("_origin_rank", kind="mergesort")
    .drop(columns="_origin_rank")
)

In [10]:
# Re-derive the list of origin countries now that some were dropped for lack
# of indicators: keep only the origins present in the first 80 rows of all_info_df.
# NOTE(review): 80 rows presumably means 20 countries x 4 indicator rows each -- confirm.
kept_origins = all_info_df["Origin"].head(80).unique()
origination_df = origination_df.loc[origination_df["Origin"].isin(kept_origins)]

In [12]:
# Export the three result sets as CSV files (no index column, header row included):
#   1. the retained countries of origination,
#   2. the 20 destination countries with the highest totals,
#   3. the first 80 rows of migrant + economic data used for plotting.
for frame, target in (
    (origination_df, "Output/Top20_Origin.csv"),
    (destination_df.head(20), "Output/Top20_Dest.csv"),
    (all_info_df.head(80), "Output/Data_for_plotting.csv"),
):
    frame.to_csv(target, header=True, index=False)