## Exploration of UN migration dataset
### We use migration data of 2013, both genders. See [document](http://www.un.org/en/development/desa/population/migration/data/estimates/docs/OriginMIgrantStocks_Documentation.pdf).

In [7]:
#Import packages
import pandas as pd
import numpy as np
import networkx as nx
import binet as bt

import matplotlib.pyplot as plt
%matplotlib inline



## Countries data
### GDP PC and GDP PC growth

In [86]:
#Country facts
# Join the two country reference tables. The key column of CountryData1 is
# renamed to 'M49' before the merge; NOTE(review): this presumably matches a
# column of the same name in CountryData2 -- confirm against the file.
c_table1 = pd.read_csv('CountryData1.csv')
c_table1 = c_table1.rename(columns={'Country code': 'M49'})
c_table2 = pd.read_table('CountryData2.txt', delimiter=',')
c_table = c_table1.merge(c_table2)[['Country or area','ISO3166-1-Alpha-2','ISO3166-1-Alpha-3','Major area','Region','Capital']]
# Lower-cased alpha-3 code; 'ccode' is the column later merged with the
# complexity table.
c_table['ccode'] = c_table['ISO3166-1-Alpha-3'].str.lower()
# Pass the axis by keyword: pandas >= 2.0 no longer accepts drop(label, 1).
c_table = c_table.drop('ISO3166-1-Alpha-3', axis=1)

#GDP per capita, current US$.
GDPPC = pd.read_csv('GDPPC-countries.csv')
# Keep the country name plus the 2002-2014 yearly columns, and only the
# countries that have a value for every one of those years.
GDPPC = GDPPC[['Country'] + [str(year) for year in range(2002, 2015)]].dropna()

#Build period-mean GDPPC dataset
# One 'GDPPC_mean_<y>' column per window: the mean of years y, y+1 and y+2.
# A None sentinel replaces the former DataFrame.append() on an empty frame;
# append() was removed in pandas 2.0, and the old `.empty` test also could
# not tell "first iteration" apart from "GDPPC has no rows".
pmGDP = None

for y in [2003, 2006, 2009, 2012]:
    window = [str(y), str(y + 1), str(y + 2)]
    # The unnamed mean Series becomes column 0 in the concat, hence the rename.
    GDPPC_ = pd.concat([GDPPC.Country, GDPPC[window].mean(axis=1)], axis=1)
    GDPPC_ = GDPPC_.rename(columns={0: 'GDPPC_mean_' + str(y)})
    if pmGDP is None:
        pmGDP = GDPPC_
    else:
        pmGDP = pmGDP.merge(GDPPC_, on='Country')

pmGDP = pmGDP.rename(columns={'Country': 'Country or area'})
c_table = c_table.merge(pmGDP)

#Build period GDPPC growth dataset
# One 'GDPPC_tot_growth_<y>' column per window: relative change of GDP per
# capita between year y-1 and year y+2, i.e. GDPPC[y+2] / GDPPC[y-1] - 1.
# A None sentinel replaces the former DataFrame.append() on an empty frame,
# which pandas 2.0 removed.
pmGDPgrowth = None

for y in [2003, 2006, 2009, 2012]:
    growth = GDPPC[str(y + 2)].divide(GDPPC[str(y - 1)]) - 1
    # The unnamed growth Series becomes column 0 in the concat, hence the rename.
    GDPPCgrowth_ = pd.concat([GDPPC.Country, growth], axis=1)
    GDPPCgrowth_ = GDPPCgrowth_.rename(columns={0: 'GDPPC_tot_growth_' + str(y)})
    if pmGDPgrowth is None:
        pmGDPgrowth = GDPPCgrowth_
    else:
        pmGDPgrowth = pmGDPgrowth.merge(GDPPCgrowth_, on='Country')

pmGDPgrowth = pmGDPgrowth.rename(columns={'Country': 'Country or area'})
c_table = c_table.merge(pmGDPgrowth)


### Lat long

In [87]:
# Latitude/longitude of each capital city, used when plotting the network.
# Columns are renamed so the table shares 'ISO3166-1-Alpha-2' with c_table;
# the left join keeps countries that have no coordinates.
capital_coords = (
    pd.read_csv('CountryCapitals.csv')
    .loc[:, ['CapitalLatitude', 'CapitalLongitude', 'CountryCode']]
    .rename(columns={
        'CountryCode': 'ISO3166-1-Alpha-2',
        'CapitalLongitude': 'lng',
        'CapitalLatitude': 'lat',
    })
)
c_table = c_table.merge(capital_coords, how='left')

### Complexity indices

In [None]:
#Import data
# Load the trade tables through binet. Only `wt` is used further down, where
# its 'year', 'ccode', 'pcode' and 'x' columns are read.
# NOTE(review): 'hs02' presumably selects the HS 2002 product classification,
# and `pr` / `co` are not used in the visible cells -- confirm against binet.
wt, pr, co = bt.trade_data('hs02')

In [90]:
# Economic Complexity Index per three-year window, one 'ECI_<y>' column each.
# A None sentinel replaces the former DataFrame.append() on an empty frame,
# which pandas 2.0 removed.
comp = None
eci_years = [2003, 2006, 2009, 2012]

for y in eci_years:
    # Total of 'x' per (country, product) pair over years y .. y+2 inclusive.
    data = wt[wt['year'].between(y,y + 2)][['ccode','pcode','x']].groupby(['ccode','pcode']).sum()[['x']].reset_index()
    M = bt.mcp(data=data,c='ccode',p='pcode')
    ECI,PCI = M.CalculateComplexity()
    ECI = ECI.rename(columns={'ECI': 'ECI_'+str(y)})
    if comp is None:
        comp = ECI
    else:
        comp = comp.merge(ECI)

# Rank 1 = highest ECI. The rank column names are derived from the same year
# list as the loop so the two cannot drift apart.
comp[['ECI_' + str(y) + '_rank' for y in eci_years]] = comp.drop('ccode', axis = 1).rank(ascending = False).astype(int)

c_table = c_table.merge(comp)

# Keep only fully populated countries and renumber rows 0..n-1.
c_table = c_table.dropna().reset_index(drop = True)

## Network stats
### From migration data
#### Here we can introduce a weighting variable. For example, what if we weight in-migration by the GDP per capita of the country of origin?

In [98]:
#Main dataset
# Raw UN migration tables for 2013 and 2010; both are reshaped by
# cleanup_migrationUN() below before any statistics are computed.
migration2013 = pd.read_csv('migrationUN2013.csv')
migration2010 = pd.read_csv('migrationUN2010.csv')

def cleanup_migrationUN(df, countries=None):
    """
    Clean up to give it the shape of an adjacency matrix (ideal for network statistics)

    Parameters:
        df: raw UN table with a
            'Major area, region, country or area of destination' column and
            one column per country name.
        countries: ordered country names to keep. Defaults to
            c_table['Country or area'].

    Returns:
        A square DataFrame whose rows and columns both follow the order of
        `countries`, with spaces stripped from text cells and missing
        cells set to 0. Text cells are cleaned but not converted to numbers;
        callers cast with astype(int).

    Raises:
        KeyError: if a requested country has no column in `df`.
    """
    if countries is None:
        countries = c_table['Country or area']
    countries = list(countries)

    dest_col = 'Major area, region, country or area of destination'
    df = df.loc[df[dest_col].isin(countries)]
    df = df.set_index(dest_col)
    # Copy so the column assignments below never write into a view of the input.
    df = df[countries].copy()

    # Text columns hold numbers written with spaces (e.g. "1 234").
    for col in df.columns[df.dtypes == object]:
        df[col] = df[col].str.replace(" ","")

    # Put the rows in the same order as the columns. Sorting the rows
    # alphabetically (as was done before) only lines up with the columns when
    # `countries` itself is alphabetical; otherwise entry [i, j] would pair
    # two different orderings. Countries without a row become a row of zeros.
    df = df.reindex(countries)
    df = df.fillna(0)
    return df

# Replace the raw tables with their country-by-country matrix form.
migration2010 = cleanup_migrationUN(migration2010)
migration2013 = cleanup_migrationUN(migration2013)

def weight_columns(df, series):
    """
    Multiply columns (inmigrants in UN migration datasets) with some variable.

    Every cell in column j is multiplied by the j-th entry of `series`
    (matched by position, not by label). Both the cells and the weights are
    truncated to integers first.

    Parameters:
        df: DataFrame whose cells can be cast to integers.
        series: one weight per column of `df`, in column order.

    Returns:
        A DataFrame of int64 products with the index and columns of `df`.

    Raises:
        ValueError: if `series` does not have exactly one entry per column.

    Example:
        migration2010_w = weight_columns(migration2010, c_table['GDPPC_mean_2012'])

    """
    # int64 is requested explicitly: plain `int` is 32-bit on some platforms,
    # where counts times weights can overflow.
    weights = np.asarray(series).astype(np.int64)
    counts = df.values.astype(np.int64)
    if weights.shape != (counts.shape[1],):
        raise ValueError(
            'expected %d weights (one per column), got shape %s'
            % (counts.shape[1], weights.shape)
        )
    # A single broadcast product instead of a row-wise apply(): apply() with
    # an array-returning function gives a Series of arrays on pandas >= 0.23
    # rather than a DataFrame.
    df_w = pd.DataFrame(counts * weights, index=df.index, columns=df.columns)
    return df_w

# GDP-weighted versions of both matrices. Both years use the same weights,
# c_table['GDPPC_mean_2012'] (the mean of 2012-2014 built earlier).
migration2010_w = weight_columns(migration2010, c_table['GDPPC_mean_2012'])
migration2013_w = weight_columns(migration2013, c_table['GDPPC_mean_2012'])

### Function that computes a bunch of network statistics out of the adjacency matrix dataframe

In [4]:
def nw_stats_from_adj_dataframe(df, label = ''):
    """
    Compute per-node network statistics from an adjacency-matrix DataFrame.

    The cells of `df` are cast to int and turned into a weighted directed
    graph; networkx reads entry [i, j] as an edge from node i to node j, so
    `in_size` is a weighted column sum and `out_size` a weighted row sum.

    Parameters:
        df: square DataFrame; its index supplies the 'Country or area' names.
        label: suffix appended to every statistic column name.

    Returns:
        One row per node with 'Country or area', degree / closeness /
        betweenness / eigenvector centrality, 'avg_cent_rank', 'in_size',
        'out_size' and the two size ranks (all but the name carry `label`).
    """
    # .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    adj_mat = df.values.astype(int)
    # from_numpy_matrix was removed in networkx 3.0; from_numpy_array is its
    # replacement from 2.0 on. Fall back for 1.x installations.
    graph_from_array = getattr(nx, 'from_numpy_array', None) or nx.from_numpy_matrix
    G = graph_from_array(adj_mat, create_using=nx.DiGraph())

    def set_attr(name, values):
        # Keywords on purpose: the positional order is (G, name, values) in
        # networkx 1.x but (G, values, name) from 2.0 on.
        nx.set_node_attributes(G, name=name, values=values)

    set_attr('Country or area', dict(enumerate(df.index)))
    # Degree and closeness ignore edge weights; betweenness and eigenvector
    # centrality use them.
    set_attr('degree_centrality'+label, nx.degree_centrality(G))
    set_attr('closeness_centrality'+label, nx.closeness_centrality(G))
    set_attr('betweenness_centrality'+label, nx.betweenness_centrality(G, weight='weight'))
    set_attr('eigenvector_centrality'+label, nx.eigenvector_centrality(G, weight='weight'))

    centralities = pd.DataFrame([r[1] for r in G.nodes(data = True)])

    # Mean of the four centrality ranks (rank 1 = most central).
    centralities['avg_cent_rank'+label] = centralities.drop('Country or area', axis=1).rank(ascending = False).astype(int).mean(axis = 1)

    # dict() makes the degree views of networkx >= 2.0 usable as attribute
    # mappings; on 1.x the result already is a dict.
    set_attr('in_size'+label, dict(G.in_degree(weight='weight')))
    set_attr('out_size'+label, dict(G.out_degree(weight='weight')))

    sizes = pd.DataFrame([r[1] for r in G.nodes(data = True)])[['Country or area','in_size'+label,'out_size'+label]]
    sizes[['in_size_rank'+label,'out_size_rank'+label]] = sizes[['in_size'+label,'out_size'+label]].rank(ascending = False).astype(int)

    # The two frames share only 'Country or area', which becomes the merge key.
    stats = centralities.merge(sizes)

    return stats

In [147]:
# Network statistics per year, on the raw and on the GDP-weighted flows.
stats2010 = nw_stats_from_adj_dataframe(migration2010, label='_2010')
stats2010_w = nw_stats_from_adj_dataframe(migration2010_w, label='_2010GDPw')

stats2013 = nw_stats_from_adj_dataframe(migration2013, label='_2013')
stats2013_w = nw_stats_from_adj_dataframe(migration2013_w, label='_2013GDPw')

# Put the four tables side by side, then drop repeated columns by comparing
# values on the transposed frame. NOTE(review): this removes every column
# whose values equal an earlier column, not only the repeated
# 'Country or area'; later positional slicing of `stats` depends on it.
combined = pd.concat([stats2010, stats2010_w, stats2013, stats2013_w], axis=1)
stats = combined.T.drop_duplicates().T

# log10 of zero or negative values is left as -inf / NaN without warnings.
np.seterr(divide='ignore', invalid='ignore')

# Log-scale the heavy-tailed 2013 measures, unweighted first, then weighted.
for suffix in ('_2013', '_2013GDPw'):
    for measure in ('betweenness_centrality', 'eigenvector_centrality',
                    'in_size', 'out_size'):
        column = measure + suffix
        stats[column] = np.log10(stats[column].astype(float))

# Export the resulting table
# stats.to_csv('MigrationNwStats.csv')

### Compare the variables to be used as X and Y against each other, in order to select a meaningful subset

In [1]:
import itertools

def main(first_col=17):
    """Show a scatterplot matrix of the columns of `stats` from `first_col` on.

    Reads the module-level `stats` table; each selected column becomes one
    variable (one row and one column) of the plot grid.

    Parameters:
        first_col: position of the first column of `stats` to plot. The
            default of 17 keeps the original selection.
    """
    # .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
    # Transposed so that each row of `data` is one variable.
    data = stats.values.T[first_col:]
    scatterplot_matrix(data, list(stats.columns[first_col:]),
            linestyle='none', marker='.', color='black', mfc='none')
    plt.show()

def scatterplot_matrix(data, names, **kwargs):
    """Plots a scatterplot matrix of subplots.  Each row of "data" is plotted
    against other rows, resulting in a nrows by nrows grid of subplots with the
    diagonal subplots labeled with "names".  Additional keyword arguments are
    passed on to matplotlib's "plot" command. Returns the matplotlib figure
    object containing the subplot grid.

    "data" must be 2-D, shaped (number of variables, number of observations);
    "names" supplies one label per variable."""
    numvars, _ = data.shape
    # squeeze=False keeps `axes` a 2-D array even for a single variable;
    # otherwise plt.subplots returns a bare Axes and the indexing below fails.
    fig, axes = plt.subplots(nrows=numvars, ncols=numvars,
                             figsize=(numvars, numvars), squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    for ax in axes.flat:
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

        # The is_first_col()/... predicates moved from the Axes to its
        # SubplotSpec in Matplotlib 3.4 and left the Axes in 3.6; use
        # whichever object provides them.
        spec = ax.get_subplotspec()
        grid_pos = spec if hasattr(spec, 'is_first_col') else ax

        # Set up ticks only on one side for the "edge" subplots...
        if grid_pos.is_first_col():
            ax.yaxis.set_ticks_position('left')
        if grid_pos.is_last_col():
            ax.yaxis.set_ticks_position('right')
        if grid_pos.is_first_row():
            ax.xaxis.set_ticks_position('top')
        if grid_pos.is_last_row():
            ax.xaxis.set_ticks_position('bottom')

    # Plot the data: every off-diagonal cell, both above and below the diagonal.
    for i, j in zip(*np.triu_indices_from(axes, k=1)):
        for x, y in [(i,j), (j,i)]:
            axes[x,y].plot(data[x], data[y], **kwargs)

    # Label the diagonal subplots...
    for i, label in enumerate(names):
        axes[i,i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                ha='center', va='center')

    # Turn on the proper x or y axes ticks: alternate between the last and
    # the first row/column so neighbouring tick labels do not collide.
    for i, j in zip(range(numvars), itertools.cycle((-1, 0))):
        axes[j,i].xaxis.set_visible(True)
        axes[i,j].yaxis.set_visible(True)

    return fig

# Draw the scatterplot matrix; requires `stats` to exist in this session.
main()

NameError: global name 'stats' is not defined

### Create graph (network) and export to plot it easily.

In [7]:
# NOTE(review): this cell used to read `df_w`, which only exists as a local
# inside weight_columns() and raises NameError here. The GDP-weighted 2013
# matrix is used instead, going by the '_w' output file name -- confirm this
# is the intended dataset.
# .values replaces DataFrame.as_matrix(), which pandas 1.0 removed.
adj_mat = migration2013_w.values.astype(int) #adjacency matrix
# from_numpy_matrix was removed in networkx 3.0; fall back to it only on 1.x.
graph_from_array = getattr(nx, 'from_numpy_array', None) or nx.from_numpy_matrix
G = graph_from_array(adj_mat, create_using=nx.DiGraph())

# Node attributes: name/label, longitude and latitude, looked up by matching
# the node number against the row label of c_table.
# NOTE(review): this is only correct if row i of the adjacency matrix is the
# country in row i of c_table -- confirm the row order of the matrix.
node_name = {node: c_table.loc[node, 'ISO3166-1-Alpha-2'] for node in G.nodes()}
# float() hands the GEXF writer plain Python numbers.
node_lng = {node: float(c_table.loc[node, 'lng']) for node in G.nodes()}
node_lat = {node: float(c_table.loc[node, 'lat']) for node in G.nodes()}

#set the attributes to the graph nodes
# Keywords on purpose: the positional order is (G, name, values) in networkx
# 1.x but (G, values, name) from 2.0 on.
nx.set_node_attributes(G, name='Label', values=node_name)
nx.set_node_attributes(G, name='lng', values=node_lng)
nx.set_node_attributes(G, name='lat', values=node_lat)

#export as .gexf
nx.write_gexf(G, 'migrationnw_w.gexf')


lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
node
{}
static
Label
string
node
{}
static
lat
double
node
{}
static
lng
double
nod