### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action = 'ignore', category = FutureWarning)
# Remove the column limit so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None

### Understanding the Data
In this exercise, we will reproduce Hans Rosling’s famous animation by following the steps outlined below:
1. We are going to read in 3 datasets obtained from http://www.gapminder.org/data :
    * worldwide fertility rate data
    * worldwide life expectancy data
    * worldwide population data


2. Merge all the data into a single DataFrame.

3. Generate a scatterplot for a single year.

4. Combine images of scatterplots of all years to make an animation / gif.

In [2]:
# Read files
# Each table is loaded with its first column as the index (index_col=0).
# Fertility is stored as CSV; population and life expectancy are Excel
# workbooks, read through the openpyxl engine.
fert = pd.read_csv("../data/gapminder_total_fertility.csv", index_col=0)
population = pd.read_excel("../data/gapminder_population.xlsx", index_col = 0, engine='openpyxl')
life = pd.read_excel("../data/gapminder_lifeexpectancy.xlsx", index_col=0, engine='openpyxl')

### Preprocessing the Data
* Transforming Dataframe
* Handling NaNs

In [3]:
def transform_data(df1, df2, df3, min_year=1950):
    """Combine three wide country-by-year tables into one long DataFrame.

    Each input is expected to have one row per country (held in the index)
    and one column per year. The inputs are not modified, so the cell that
    calls this function can be re-run safely.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Fertility table. Its column labels are cast to ``int`` (e.g. year
        labels that were read from a CSV header as strings).
    df2 : pandas.DataFrame
        Population table; column labels are used as-is.
    df3 : pandas.DataFrame
        Life-expectancy table; column labels are used as-is.
    min_year : int, default 1950
        Rows with ``year`` below this value are dropped from the result.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with columns ``country``, ``year``,
        ``fertility_rate``, ``total_population`` and ``life_expectancy``,
        restricted to ``year >= min_year`` and carrying a fresh 0..n-1 index.
        Only (country, year) pairs present in all three inputs are kept
        (inner merge).

    Raises
    ------
    ValueError
        If a column label of ``df1`` cannot be converted to ``int``.
    """
    # Work on a copy: assigning new column labels directly on the argument
    # would change the caller's DataFrame as a side effect.
    df1 = df1.copy()
    # Cast the year labels to int so the 'year' key has a numeric dtype in the merge.
    df1.columns = df1.columns.astype(int)

    tables = (
        (df1, 'fertility_rate'),
        (df2, 'total_population'),
        (df3, 'life_expectancy'),
    )

    # Wide -> long. rename_axis returns a new frame, so the callers' index
    # names are left untouched.
    long_frames = [
        frame.rename_axis(index='country')
             .reset_index()
             .melt(id_vars='country', var_name='year', value_name=value_name)
        for frame, value_name in tables
    ]
    fertility_long, population_long, life_long = long_frames

    # Name the join keys explicitly instead of relying on "all shared columns".
    keys = ['country', 'year']
    df = fertility_long.merge(population_long, on=keys).merge(life_long, on=keys)

    # Keep only the requested period and renumber the rows.
    df = df[df['year'] >= min_year].reset_index(drop=True)

    return df

# Build the combined long-format table from the fertility, population and life-expectancy frames.
df = transform_data(fert,population,life)

In [4]:
def missing_data(df):
    """Return ``df`` without the rows that contain at least one missing value.

    Dropping rows is only one of several possible strategies; alternatives
    are imputation by interpolation, forward/backward fill, or filling with
    the mean, median or mode.
    """
    return df.dropna()

# Drop rows with missing values; note that this rebinds `df` to the cleaned frame.
df = missing_data(df)