# Applied Data Science
## Data transformation and integration
### Tom Diethe

In [None]:
import pandas as pd
%matplotlib inline
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from IPython import display
from time import sleep

# Presentation-sized plots: seaborn's "talk" context with fonts scaled 2x and
# a default line width of 4.
sns.set_context("talk", font_scale=2, rc={"lines.linewidth": 4})

In [None]:
# Column of the downloaded tables that holds the country name; it is kept as
# the identifier column when reshaping and becomes the second index level.
country = 'Country Name'
# Countries picked out of every indicator table for plotting and comparison.
countries = ['United States', 'China', 'India']

def get_worldbank_dataframe(indicator_id):
    """Download one World Bank indicator and return it as a DataFrame.

    Fetches the zipped CSV export for ``indicator_id`` from the World Bank
    API, prints the names of the archive members, and parses the first
    member whose name does not start with ``'Metadata'``.

    Args:
        indicator_id: World Bank indicator code, e.g. ``'SP.DYN.LE00.IN'``.

    Returns:
        pandas.DataFrame parsed from the data CSV, with its first four
        lines skipped.

    Raises:
        IndexError: if every archive member is a ``Metadata`` file.
    """
    from contextlib import closing

    address = 'http://api.worldbank.org/v2/en/indicator/{0}?downloadformat=csv'.format(indicator_id)

    # closing() releases the HTTP handle as soon as the payload is read,
    # whether or not the response object supports ``with`` on its own.
    with closing(urlopen(address)) as response:
        payload = response.read()

    # Likewise make sure the archive is closed once the CSV is extracted.
    with closing(ZipFile(StringIO(payload))) as archive:
        members = archive.namelist()
        print(members)
        fname = [f for f in members if not f.startswith('Metadata')][0]
        data = StringIO(archive.read(fname))

    return pd.read_csv(data, skiprows=4)

def rearrange_dataframe(df, indicator_name):
    """Reshape a wide per-year table into long form indexed by (year, country).

    Columns whose label starts with ``'1'`` or ``'2'`` are treated as year
    columns. They are melted into a ``'year'`` column, the melted values are
    stored in a column named ``indicator_name``, and the result is indexed by
    ``['year', country]`` (``country`` is the module-level column name).

    Args:
        df: wide DataFrame with string column labels, containing the
            ``country`` column plus one column per year.
        indicator_name: name to give the value column of the result.

    Returns:
        A new DataFrame with a (year, country) MultiIndex and the single
        column ``indicator_name``. ``df`` itself is not modified.
    """
    # startswith() with a tuple also copes with an empty column label,
    # which would raise IndexError if indexed with [0].
    years = [c for c in df.columns if c.startswith(('1', '2'))]

    long_df = pd.melt(df[[country] + years], id_vars=country, var_name='year')
    long_df = long_df.rename(columns={'value': indicator_name})
    return long_df.set_index(['year', country])

def plot_indicator(df, indicator_name):
    """Draw one line per country for a single indicator.

    Args:
        df: DataFrame indexed by (year, country) that contains a column
            named ``indicator_name``.
        indicator_name: column to plot; also used as the y-axis label.
    """
    fig, ax = plt.subplots(figsize=[15, 10])
    for label, dfi in df.groupby(level=1):
        # Plot the column that was asked for, not whatever the notebook-level
        # ``indicator`` variable happens to hold when this is called.
        dfi[indicator_name].plot(ax=ax, label=label)
    ax.legend()
    ax.set_ylabel(indicator_name)

    # Label the x-axis from the frame being plotted. Tick positions are fixed
    # before the labels are assigned so each label sits on its own year; about
    # ten evenly spaced years are shown to keep the axis readable.
    # NOTE(review): assumes every country's rows are in year order and cover
    # every year, so the n-th point of each line is the n-th year - confirm.
    years = df.index.levels[0].values
    step = max(1, len(years) // 10)
    positions = list(range(0, len(years), step))
    ax.set_xticks(positions)
    ax.set_xticklabels([years[p] for p in positions])
    ax.set_xlabel('year')

# Life expectancy at birth, total (years)
## Data from: http://data.worldbank.org/indicator/SP.DYN.LE00.IN

In [None]:
# Download the life-expectancy indicator table and preview its first rows.
df1 = get_worldbank_dataframe('SP.DYN.LE00.IN')
df1.head()

In [None]:
# Column label used for this indicator; later cells look it up by this exact
# spelling.
indicator = 'Life expectency at birth'
# Long form, one row per (year, country), sorted by index.
df1r = rearrange_dataframe(df1, indicator).sort_index()
# Every year, but only the countries of interest.
df1c = df1r.loc[(slice(None), countries), :]
df1c.head()

In [None]:
# Line plot of the selected countries' values for this indicator.
plot_indicator(df1c, indicator)

# Total Population
## Data from: http://data.worldbank.org/indicator/SP.POP.TOTL

In [None]:
# Download the total-population indicator table and preview its first rows.
df2 = get_worldbank_dataframe('SP.POP.TOTL')
df2.head()

In [None]:
# Column label used for this indicator; later cells look it up by this name.
indicator = 'Total population'
# Long form, one row per (year, country), sorted by index.
df2r = rearrange_dataframe(df2, indicator).sort_index()
# Every year, but only the countries of interest.
df2c = df2r.loc[(slice(None), countries), :]
df2c.head()

In [None]:
# Line plot of the selected countries' values for this indicator.
plot_indicator(df2c, indicator)

# GDP per Capita
## Data from: http://api.worldbank.org/v2/en/indicator/NY.GDP.PCAP.CD?downloadformat=csv

In [None]:
# Download the GDP-per-capita indicator table and preview its first rows.
df3 = get_worldbank_dataframe('NY.GDP.PCAP.CD')
df3.head()

In [None]:
# Column label used for this indicator; later cells look it up by this name.
indicator = 'GDP per capita'
# Long form, one row per (year, country), sorted by index.
df3r = rearrange_dataframe(df3, indicator).sort_index()
# Every year, but only the countries of interest.
df3c = df3r.loc[(slice(None), countries), :]
df3c.head()

In [None]:
# Line plot of the selected countries' values for this indicator.
plot_indicator(df3c, indicator)

In [None]:
# Combine the three per-indicator frames into one table by joining on their
# shared (year, country) index. merge() defaults to an inner join, so only
# index entries present in all three frames are kept.
df4 = df1c.merge(df2c, left_index=True, right_index=True).merge(df3c, left_index=True, right_index=True)
df4.head()

In [None]:
# Columns of df4 used for the horizontal axis, the vertical axis and the
# marker size respectively.
x = 'GDP per capita'
y = 'Life expectency at birth'
s = 'Total population'

fig, ax = plt.subplots(figsize=[15, 10])
for c in countries:
    g = df4.loc['1980', c]
    # One marker per country; its size is the population divided by 1e7.
    ax.plot(g[x], g[y], marker='o', linestyle='', ms=g[s] / 1e7, label=c)

# Give every legend marker the same fixed size instead of the size of the
# marker it describes.
lgnd = ax.legend()
for handle in lgnd.legendHandles:
    handle._legmarker.set_markersize(20)

ax.set_xlabel(x)
ax.set_ylabel(y)

# Axis limits come from the whole table, not only from the year shown.
ax.set_xlim([0, df4[x].max()])
ax.set_ylim([df4[y].min()*0.9, df4[y].max()*1.1])
ax.set_title('1980')

In [None]:
def wb_scatter(df, year):
    """Add one year's scatter points to the current notebook axes.

    Draws one marker per entry of the module-level ``countries`` onto the
    module-level ``ax``, using the module-level column names ``x``, ``y`` and
    ``s`` for position and marker size. Marker opacity grows linearly with
    the year, from 0 at the earliest year in ``df`` to 0.5 at the latest.

    Args:
        df: DataFrame indexed by (year, country) containing the ``x``, ``y``
            and ``s`` columns.
        year: label of the year to draw; must be convertible with ``float``.
    """
    current_palette = sns.color_palette()

    year_labels = df.index.levels[0]
    first_year = float(year_labels.min())
    rng = float(year_labels.max()) - first_year
    cur = float(year) - first_year
    # With a single year the span is zero; use the opacity the latest year
    # normally gets rather than dividing by zero.
    alpha = (cur / rng) / 2 if rng else 0.5

    for i, c in enumerate(countries):
        g = df.loc[year, c]
        ax.plot(g[x], g[y], marker='o', linestyle='', color=current_palette[i], ms=g[s] / 1e7, label=c, alpha=alpha)

    ax.set_xlabel(x)
    ax.set_ylabel(y)
    # Limits are taken from the whole frame so they stay fixed across years.
    ax.set_xlim([0, df[x].max()])
    ax.set_ylim([df[y].min()*0.9, df[y].max()*1.1])
    ax.set_title(year)

# Animated version: every year is drawn onto the same axes and the figure is
# re-displayed after each one.
fig, ax = plt.subplots(figsize=[15, 10])

do_legend = True

for year in df4.index.levels[0].values:
    wb_scatter(df4, year)

    # The legend is built once, after the first year has been drawn, with
    # fixed-size, fully opaque markers.
    if do_legend:
        lgnd = ax.legend()
        for handle in lgnd.legendHandles:
            marker = handle._legmarker
            marker.set_markersize(20)
            marker.set_alpha(1.0)
        do_legend = False

    # Replace the previous output with the updated figure, then pause briefly.
    display.clear_output(wait=True)
    display.display(plt.gcf())
    sleep(0.05)

## Some other things to try:
* Different sets of countries
* Group countries into continents
* Fit curves (e.g. using Gaussian processes) to the trajectories
* Combine with other World Bank datasets
* Combine with other external datasets