In [1]:
import plotly.plotly as py
import pandas as pd
import pycountry
import numpy as np

Our goal is to understand how different socio-economic factors correlate with tax evasion. To help us visualize this, we will use a "World-Map" (choropleth) graph to display the various inputs.

In [15]:
# Offline plotting helpers; `iplot` is what the figure cell uses to draw inline.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's notebook mode before any figure is drawn with `iplot`.
init_notebook_mode(connected=True)

We get a list of country codes from the working example, and use those to draw the interactive country maps:

In [3]:
# Country-code lookup table, keyed by the COUNTRY column so that the UN
# tables can be joined against it by country name.
df_countries_codes = (
    pd.read_csv('data/countries_codes.csv', low_memory=False)
    .set_index('COUNTRY')
)

In [4]:
# Load datasets
# Every file is read with low_memory=False, i.e. in one pass rather than in
# chunks, so each column gets a single inferred dtype.
## Load panama papers datasets
# Judging by the file names: one edge list plus one table per node type
# (address, entity, intermediary, officer).
pp_edges = pd.read_csv('data/panama_papers/panama_papers.edges.csv', low_memory=False)
pp_nodes_address = pd.read_csv('data/panama_papers/panama_papers.nodes.address.csv', low_memory=False)
pp_nodes_entity = pd.read_csv('data/panama_papers/panama_papers.nodes.entity.csv', low_memory=False)
pp_nodes_intermediary = pd.read_csv('data/panama_papers/panama_papers.nodes.intermediary.csv', low_memory=False)
pp_nodes_officer = pd.read_csv('data/panama_papers/panama_papers.nodes.officer.csv', low_memory=False)
## Load UN datasets
# Each UN table has a 'Country' column, which is later used to attach country codes.
un_hdi_components_2014 = pd.read_csv('data/un/hdi_components.csv', low_memory=False)
un_gdp_per_capita = pd.read_csv('data/un/gdp_per_capita.csv', low_memory=False)
un_gdp_per_capita_ppp = pd.read_csv('data/un/gdp_per_capita_PPP.csv', low_memory=False)

Let's look at a few of the UN datasets:

#### GDP per Capita

In [6]:
# Preview the first rows of the GDP-per-capita table (one row per country and year).
un_gdp_per_capita.head()

Unnamed: 0,Country,Year,Item,Value
0,Afghanistan,2016,Gross Domestic Product (GDP),583.882867
1,Afghanistan,2015,Gross Domestic Product (GDP),610.854517
2,Afghanistan,2014,Gross Domestic Product (GDP),651.158326
3,Afghanistan,2013,Gross Domestic Product (GDP),681.033974
4,Afghanistan,2012,Gross Domestic Product (GDP),694.885886


#### HDI Components (2014)

In [7]:
# Preview the first rows of the HDI components table (one row per country).
un_hdi_components_2014.head()

Unnamed: 0,HDI rank,Country,Human Development Index (HDI),Life expectancy at birth,Expected years of schooling,Mean years of schooling,Gross national income (GNI) per capita,GNI per capita rank minus HDI rank
0,1,Norway,0.944,81.6,17.5,12.6,64992,5
1,2,Australia,0.935,82.4,20.2,13.0,42261,17
2,3,Switzerland,0.93,83.0,15.8,12.8,56431,6
3,4,Denmark,0.923,80.2,18.7,12.7,44025,11
4,5,Netherlands,0.922,81.6,17.9,11.9,45435,9


#### Panama papers

In order to display the values on the maps, we need to join the UN datasets with the corresponding country code. We'll start by trying to automate this process, before looking at possible exceptions:

In [8]:
# Attach the country-code columns to every UN table by matching its
# 'Country' column against the index of the code lookup table. Rows whose
# country name has no exact match keep a missing code for now.
un_hdi_components_2014, un_gdp_per_capita, un_gdp_per_capita_ppp = (
    un_table.join(df_countries_codes, on='Country')
    for un_table in (un_hdi_components_2014, un_gdp_per_capita, un_gdp_per_capita_ppp)
)

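Some UN country names have no exact match in the country-code table, often because they carry a parenthesized qualifier (for instance, Iran (Islamic Republic of)). For those rows we fall back to pycountry: we first try an exact match on the country name, and otherwise accept a country whose name (or common name) is contained in the UN name, but only when that match is unambiguous. Rows that still have no code are removed.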

In [11]:


def _lookup_country_code(name, name_to_code):
    """Return the alpha-3 code for a country name, or None if it cannot be resolved.

    Parameters
    ----------
    name : object
        Value taken from a 'Country' column (normally a string).
    name_to_code : dict
        Maps pycountry country names to their alpha-3 codes.

    An exact match on the pycountry name wins. Otherwise the name is accepted
    only when exactly one pycountry entry has its ``name`` or ``common_name``
    contained in it; zero or several candidates are treated as unresolved.
    """
    if name in name_to_code:
        return name_to_code[name]

    text = str(name)
    candidates = [
        entry.alpha_3
        for entry in pycountry.countries
        if entry.name in text
        or (hasattr(entry, 'common_name') and entry.common_name in text)
    ]
    return candidates[0] if len(candidates) == 1 else None


def fill_missing_country_codes(frame, name_to_code):
    """Return a copy of ``frame`` with missing 'CODE' values resolved via pycountry.

    Rows whose code still cannot be determined are dropped from the result.
    The input frame is left untouched, so the cell can be re-run safely.
    """
    missing = frame['CODE'].isna()
    filled = frame.copy()

    if missing.any():
        # Resolve every distinct name only once: some tables repeat each
        # country on many rows and the substring scan over pycountry is slow.
        resolved = {}
        codes = []
        for name in filled.loc[missing, 'Country'].values:
            if name not in resolved:
                resolved[name] = _lookup_country_code(name, name_to_code)
            codes.append(resolved[name])
        filled.loc[missing, 'CODE'] = codes

    # Remove rows that were not found
    return filled[filled['CODE'].notnull()]


# pycountry name -> alpha-3 code
countries = {country.name: country.alpha_3 for country in pycountry.countries}

# The previous loop did `df = df[...]`, which only rebound the loop variable
# and never removed the unmatched rows from the real tables. Rebind the
# named frames explicitly so later cells see the cleaned data.
un_dfs = [
    fill_missing_country_codes(frame, countries)
    for frame in (un_hdi_components_2014, un_gdp_per_capita, un_gdp_per_capita_ppp)
]
un_hdi_components_2014, un_gdp_per_capita, un_gdp_per_capita_ppp = un_dfs

In [14]:
# World map of the Human Development Index, one colour per country.
data = [dict(
    type='choropleth',
    locations=un_hdi_components_2014['CODE'],   # country codes locate each country
    z=un_hdi_components_2014['Human Development Index (HDI)'],
    text=un_hdi_components_2014['Country'],     # hover label
    colorscale=[
        [0, "rgb(5, 10, 172)"],
        [0.35, "rgb(40, 60, 190)"],
        [0.5, "rgb(70, 100, 245)"],
        [0.6, "rgb(90, 120, 245)"],
        [0.7, "rgb(106, 137, 247)"],
        [1, "rgb(220, 220, 220)"],
    ],
    autocolorscale=False,
    reversescale=True,
    marker=dict(
        line=dict(
            color='rgb(180,180,180)',
            width=0.5,
        ),
    ),
    # HDI is a unitless index between 0 and 1, so the ticks carry no '%'
    # prefix; the legacy `autotick` attribute is dropped as well.
    colorbar=dict(
        title='HDI',
    ),
)]

layout = dict(
    title='Human Development Index (HDI) by country, 2014',
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection=dict(
            # Projection names are lowercase; 'Mercator' is not a valid value.
            type='mercator',
        ),
    ),
)

fig = dict(data=data, layout=layout)

# Validation is left on (the default): `validate=False` was hiding the invalid
# attributes fixed above.
iplot(fig)