# Import and Cleaning

In [None]:
import os

import imageio
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load the four raw input tables. index_col=0 makes the first column of each
# file the row index.
# Life expectancy (Excel workbook).
life = pd.read_excel("./data/gapminder_lifeexpectancy.xlsx", index_col=0)
# Total fertility (CSV).
fert = pd.read_csv("./data/gapminder_total_fertility.csv", index_col=0)
# Population (Excel workbook).
pop = pd.read_excel("./data/gapminder_population.xlsx", index_col=0)
# Continent lookup table; this CSV is semicolon-delimited.
cont = pd.read_csv("./data/continents.csv", index_col=0, sep = ";")

In [None]:
def _wide_to_long(wide, value_name):
    """Reshape a wide table into long (country, year, value) rows.

    The index is named "country" and moved into a regular column; every
    remaining column header becomes a value in the "year" column, and the cell
    contents go into a column called *value_name*.
    """
    wide.index.name = "country"
    return wide.reset_index().melt(
        id_vars="country", var_name="year", value_name=value_name
    )


# Fertility: cast the column headers to int before reshaping.
fert.columns = fert.columns.astype(int)
fert = _wide_to_long(fert, "Fertility Rate")

# Life expectancy and population get the same wide-to-long reshape.
life = _wide_to_long(life, "Life Expectancy")
pop = _wide_to_long(pop, "Population")

# Continents: move the index back into an ordinary column so that it can take
# part in the merge below, and capitalise the "continent" header.
cont = cont.reset_index().rename(columns={"continent": "Continent"})

# Combine everything. merge() is called without explicit keys, so each step
# joins on whatever columns the two frames have in common.
merged = fert.merge(pop).merge(life).merge(cont)
df = merged.sort_values(["Continent", "country"])

# Discard incomplete rows, then exact duplicate rows.
df = df.dropna().drop_duplicates()

# Make Scatterplots

In [None]:
# Render one scatterplot per year (life expectancy vs. fertility rate, bubble
# size by population, colour by continent) and save each as a PNG frame.

# Make sure the output directory exists before the first frame is written.
os.makedirs("./figs", exist_ok=True)

# Pin one colour per continent and hand the same mapping to both the scatter
# points and the legend, so the two cannot disagree (e.g. if a continent has
# no rows in some year).
continent_names = df["Continent"].unique()
continent_colors = dict(
    zip(continent_names, sns.color_palette(n_colors=len(continent_names)))
)

for year in range(1960, 2016):  # selected from 1960 for more consistency/stability of data
    df_subset = df.loc[df["year"] == year]

    minsize = df_subset["Population"].min()
    maxsize = df_subset["Population"].max()

    # A single figure per frame; no separate plt.figure() call, which would
    # only leave an extra empty figure behind on every iteration.
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.set(xlim=(0, 90), ylim=(0, 10))
    sns.scatterplot(
        x="Life Expectancy",
        y="Fertility Rate",
        data=df_subset,
        size="Population",
        sizes=(0.001 * minsize, 0.000005 * maxsize),
        hue="Continent",
        hue_order=list(continent_names),
        palette=continent_colors,
        legend=False,
        alpha=0.6,
        ax=ax,
    )
    ax.set_title("Life Expectancy vs. Fertility Rate " + str(year))

    # Build the legend by hand from empty scatter artists: seaborn's own
    # legend is disabled above because it would also list the size levels.
    legend_handles = [
        ax.scatter([], [], s=100, color=continent_colors[name], label=name)
        for name in continent_names
    ]
    ax.legend(handles=legend_handles, fontsize="medium", loc="upper left")

    # Label only the ten most populous countries of this year. The column is
    # selected explicitly before aggregating; passing its name to mean() would
    # bind it to the `numeric_only` parameter instead.
    top_countries = (
        df_subset.groupby("country")["Population"]
        .mean()
        .sort_values(ascending=False)
        .head(10)
        .index
    )
    labelled = df_subset[df_subset["country"].isin(top_countries)]
    for life_exp, fertility, country in zip(
        labelled["Life Expectancy"], labelled["Fertility Rate"], labelled["country"]
    ):
        # Shift the text left so it does not sit on top of the bubble centre.
        ax.text(life_exp - 3, fertility, country)

    fig.savefig("./figs/fig" + str(year) + ".png")
    # Release the figure once it is on disk; otherwise every frame stays open
    # until the end of the run. (In a notebook this also means the frames are
    # no longer rendered inline, only saved.)
    plt.close(fig)

# Make GIF

In [None]:
# Read the saved frames back in year order and stitch them into one animation.
frame_paths = ["figs/fig{}.png".format(year) for year in range(1960, 2016)]
images = [imageio.imread(path) for path in frame_paths]

imageio.mimsave("output.gif", images, fps=10)