In [1]:
# Dependencies and Setup
import os

import numpy as np
import pandas as pd

In [2]:
# Migrant stock numbers by origin and destination: remember the path, then load it
org_dest_csv = "Resources/UN_MigrantStockByOriginAndDestination_2017.csv"
org_dest_df = pd.read_csv(org_dest_csv)

# Catalog that pairs each country with its region: remember the path, then load it
country_cat_csv = "Resources/Region_country_catalog.csv"
country_cat_df = pd.read_csv(country_cat_csv)

In [3]:
# Cells with NO data are stored as ".."; swap every such cell for 0
org_dest_df = org_dest_df.replace(to_replace='..', value=0)

# Inner-join the catalog with the numbers on "Country": rows that have no match in the
# catalog are dropped, and the matching rows pick up the catalog columns
# (the region of the destination country)
new_country_data = country_cat_df.merge(org_dest_df, on="Country")

In [4]:
# Reshape the wide table (one column per origin country) into long rows,
# keeping "Country", "Region" and "Year" as identifier columns

# Columns to melt: everything after the first three columns
# (assumes those three are Country, Region and Year -- TODO confirm against the CSV layout)
lst_ctry = new_country_data.columns[3:]

# melt columns into rows: one row per (destination country, year, origin country)
origin_data_df = pd.melt(new_country_data, id_vars=["Country","Region","Year"], value_vars=lst_ctry,
                         var_name='Origin', value_name='Net Immigrants')

# Remove commas from the numbers so they can be converted.
# The result is assigned back explicitly instead of mutating the frame with inplace=True.
origin_data_df['Net Immigrants'] = origin_data_df['Net Immigrants'].replace(',', '', regex=True)

# Convert to integers so the values can be used in mathematical operations.
# astype is used because DataFrame.applymap is deprecated in recent pandas releases.
origin_data_df = origin_data_df.astype({'Year': np.int64, 'Net Immigrants': np.int64})

# Rename by column name (not by position): the melted "Country"/"Region" describe the
# destination, and "Origin" becomes "Country" so it can be the key of the next merge
origin_data_df = origin_data_df.rename(
    columns={"Country": "Destination", "Region": "DRegion", "Origin": "Country"}
)

# merge with country catalog to get region for country of origin
origin_data_df = pd.merge(country_cat_df, origin_data_df, on="Country")
# rename the columns once we have added the region for the origination country
# (positional: relies on the catalog contributing exactly two leading columns,
# country then region -- TODO confirm against the catalog file)
origin_data_df.columns = ["Origin","ORegion","Destination","DRegion","Year","Net Immigrants"]

# save the top 20 countries of origination for filtering later on
# (totals are summed over every year and destination present in the data)
top20_ctrys_lst = (
    origin_data_df.groupby("Origin")
    .agg({'Net Immigrants': 'sum'})
    .sort_values('Net Immigrants', ascending=False)
    .head(20)
    .index
)

#origin_data_df.nlargest(20, "Net Immigrants")


# origin_data_df.groupby("Country").agg({'Net Immigrants': 'sum'}).\
#     sort_values('Net Immigrants', ascending=False).head(20)

#origin_data_df.head()

In [5]:
# total number of origination migrants per region 
#origin_data_df.groupby("ORegion").agg({'Net Immigrants': 'sum'}).sort_values('Net Immigrants', ascending=False)

# Filter
#origin_data_df.head()

In [6]:
# Top 10 countries receiving people
#origin_data_df.groupby("Destination").agg({'Net Immigrants': 'sum'}).\
#    sort_values('Net Immigrants', ascending=False).head(10)
# Show the destination regions sorted by net number of migrants
#origin_data_df.groupby("DRegion").agg({'Net Immigrants': 'sum'}).sort_values('Net Immigrants', ascending=False)

In [13]:
# Filter the dataframe to continue with the top 20 ONLY
# Keep only the rows whose origin country is one of the top 20
top_20_ctrys_df = origin_data_df[origin_data_df["Origin"].isin(top20_ctrys_lst)]

# For each of the top 20 origins, find the 3 destination regions receiving the most people:
# total per (destination, origin) pair first, then roll up to (origin, destination region),
# order each origin's regions from largest to smallest and keep the first three
whereto_20_ctrys_df = (
    top_20_ctrys_df
    .groupby(["DRegion","Destination","ORegion","Origin"])
    .agg({'Net Immigrants': 'sum'})
    .sort_values(['Origin','Net Immigrants'], ascending=[True,False])
    .reset_index()
    .groupby(["Origin","DRegion"])
    .agg({'Net Immigrants': 'sum'})
    .sort_values(['Origin','Net Immigrants'], ascending=[True,False])
    .reset_index()
    .groupby(["Origin"])
    .head(3)
)


# whereto_20_ctrys_df.head()

In [14]:
# Pivot the top-20 origins into one row per origin country and one column per year,
# each cell holding the total of 'Net Immigrants' for that origin and year.
#
# The long-format rows are rebuilt from origin_data_df here instead of reading
# top_20_ctrys_df: this cell overwrites that name with the pivoted table, so reading it
# back would make a second run of the cell fail (the pivoted table has no "Year" column).
top20_long_df = origin_data_df.loc[origin_data_df.Origin.isin(top20_ctrys_lst)]

top_20_ctrys_df = (
    top20_long_df
    .groupby(["Origin","Year"])['Net Immigrants']
    .sum()
    # unstack returns rows/columns in sorted label order, so no sort is needed beforehand
    .unstack("Year")
    # turn the "Origin" index back into a regular column for the export
    .reset_index()
)

In [17]:
# Write the pivoted top-20 table to a new CSV file (no index column, header row included).
# The output folder is created first because to_csv does not create missing directories.
os.makedirs("Output", exist_ok=True)
top_20_ctrys_df.to_csv("Output/Clean_Top20_migrant.csv", index=False, header=True)

AttributeError: 'Index' object has no attribute 'to_csv'

pandas.core.indexes.base.Index