# 01. Visual Data Analysis - Animated Scatterplot

The goal of this project is to create an animated scatterplot that visualizes how countries' fertility rate, life expectancy and population change over time.  

In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import os
import imageio

## 1. Loading data

In [2]:
# 1. Fertility rate: read the CSV and use its first column (the country names)
#    as the row index; every remaining column holds one year.
fertility_path = "data/gapminder_total_fertility.csv"
fert = pd.read_csv(fertility_path, index_col=0)

In [3]:
fert.head()

Unnamed: 0_level_0,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Total fertility rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abkhazia,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,6.7,6.46,6.2,5.93,5.66,5.4,5.14,4.9,4.68,4.47
Akrotiri and Dhekelia,,,,,,,,,,,...,,,,,,,,,,
Albania,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,...,1.85,1.8,1.76,1.74,1.74,1.75,1.76,1.77,1.78,1.78
Algeria,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,...,2.58,2.66,2.73,2.78,2.82,2.83,2.82,2.8,2.76,2.71


In [4]:
# 2. Life expectancy: an Excel workbook with one row per country (first column,
#    used as the index) and one column per year.
life_expectancy_path = "data/gapminder_lifeexpectancy.xlsx"
life = pd.read_excel(life_expectancy_path, index_col=0)

In [5]:
life.head()

Unnamed: 0_level_0,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
Life expectancy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abkhazia,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,28.21,28.2,28.19,28.18,28.17,28.16,28.15,28.14,28.13,28.12,...,52.4,52.8,53.3,53.6,54.0,54.4,54.8,54.9,53.8,52.72
Akrotiri and Dhekelia,,,,,,,,,,,...,,,,,,,,,,
Albania,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,...,76.6,76.8,77.0,77.2,77.4,77.5,77.7,77.9,78.0,78.1
Algeria,28.82,28.82,28.82,28.82,28.82,28.82,28.82,28.82,28.82,28.82,...,75.3,75.5,75.7,76.0,76.1,76.2,76.3,76.3,76.4,76.5


In [6]:
# 3. Population: an Excel workbook with one row per country (first column, used
#    as the index). Besides the year columns, the sheet also yields empty
#    trailing columns that pandas labels 'Unnamed: N'; a later cell removes them.
population_path = "data/gapminder_population.xlsx"
pop = pd.read_excel(population_path, index_col=0)

In [7]:
pop.head()

Unnamed: 0_level_0,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,...,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87,Unnamed: 88,Unnamed: 89,Unnamed: 90,Unnamed: 91
Total population,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abkhazia,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,3280000.0,3280000.0,3323519.0,3448982.0,3625022.0,3810047.0,3973968.0,4169690.0,4419695.0,4710171.0,...,,,,,,,,,,
Akrotiri and Dhekelia,,,,,,,,,,,...,,,,,,,,,,
Albania,410445.0,423591.0,438671.0,457234.0,478227.0,506889.0,552800.0,610036.0,672544.0,741688.0,...,,,,,,,,,,
Algeria,2503218.0,2595056.0,2713079.0,2880355.0,3082721.0,3299305.0,3536468.0,3811028.0,4143163.0,4525691.0,...,,,,,,,,,,


In [8]:
# 4. Country -> continent lookup table. The file is semicolon-separated, hence
#    the explicit delimiter.
continents_path = "data/continents.csv"
continent = pd.read_csv(continents_path, sep=';')

In [9]:
continent.head()

Unnamed: 0,continent,country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina


## 2. Data Wrangling 

In [10]:
# (rows, columns) of each of the three wide tables, printed in the order
# life expectancy, fertility rate, population:
for table in (life, fert, pop):
    print(table.shape)

(260, 217)
(260, 216)
(275, 91)


### 2.1. Convert table columns into the same variable format

In [11]:
# the columns of tables:
fert.columns

Index(['1800', '1801', '1802', '1803', '1804', '1805', '1806', '1807', '1808',
       '1809',
       ...
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015'],
      dtype='object', length=216)

In [12]:
life.columns

Int64Index([1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809,
            ...
            2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016],
           dtype='int64', length=217)

In [13]:
fert.columns

Index(['1800', '1801', '1802', '1803', '1804', '1805', '1806', '1807', '1808',
       '1809',
       ...
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015'],
      dtype='object', length=216)

In [14]:
# The year labels of the fertility table are strings ('1800', ...), whereas the
# life expectancy table uses integers. Cast them to integers so that the year
# values of the tables compare equal when the tables are merged later.
fert.columns = fert.columns.map(int)
fert.columns

Int64Index([1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809,
            ...
            2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015],
           dtype='int64', length=216)

In [15]:
pop.columns

Index([         1800,          1810,          1820,          1830,
                1840,          1850,          1860,          1870,
                1880,          1890,          1900,          1910,
                1920,          1930,          1940,          1950,
                1951,          1952,          1953,          1954,
                1955,          1956,          1957,          1958,
                1959,          1960,          1961,          1962,
                1963,          1964,          1965,          1966,
                1967,          1968,          1969,          1970,
                1971,          1972,          1973,          1974,
                1975,          1976,          1977,          1978,
                1979,          1980,          1981,          1982,
                1983,          1984,          1985,          1986,
                1987,          1988,          1989,          1990,
                1991,          1992,          1993,          1

In [16]:
# The population sheet contains empty trailing columns (an artefact of the file
# having been opened in Excel) whose headers pandas reports as 'Unnamed: N'.
# Keep only the real year columns.
#
# The columns are selected by label pattern instead of dropping a hard-coded
# list in place, so that
#   * re-running this cell is harmless (an in-place drop of already removed
#     labels raises KeyError), and
#   * the cell does not depend on exactly how many empty columns there are.
is_placeholder = pop.columns.astype(str).str.startswith('Unnamed')
pop = pop.loc[:, ~is_placeholder]

In [17]:
# Once the 'Unnamed: N' columns are gone every remaining label is a year; store
# the labels as integers, like the year labels of the other two tables.
pop.columns = pop.columns.map(int)

In [18]:
pop.columns

Int64Index([1800, 1810, 1820, 1830, 1840, 1850, 1860, 1870, 1880, 1890, 1900,
            1910, 1920, 1930, 1940, 1950, 1951, 1952, 1953, 1954, 1955, 1956,
            1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967,
            1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978,
            1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
            1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
            2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
            2012, 2013, 2014, 2015],
           dtype='int64')

### 2.2. Changing the index name of the table

In [19]:
# checking the index name of the table:
fert.index

Index(['Abkhazia', 'Afghanistan', 'Akrotiri and Dhekelia', 'Albania',
       'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla',
       'Antigua and Barbuda',
       ...
       'Vietnam', 'Virgin Islands (U.S.)', 'North Yemen (former)',
       'South Yemen (former)', 'Yemen', 'Yugoslavia', 'Zambia', 'Zimbabwe',
       'Åland', 'Åland'],
      dtype='object', name='Total fertility rate', length=260)

In [20]:
# 1. The fertility table's index is named 'Total fertility rate' (taken from the
#    file); name it 'country' instead, because that is what it contains.
fert = fert.rename_axis('country')

In [21]:
fert.index

Index(['Abkhazia', 'Afghanistan', 'Akrotiri and Dhekelia', 'Albania',
       'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla',
       'Antigua and Barbuda',
       ...
       'Vietnam', 'Virgin Islands (U.S.)', 'North Yemen (former)',
       'South Yemen (former)', 'Yemen', 'Yugoslavia', 'Zambia', 'Zimbabwe',
       'Åland', 'Åland'],
      dtype='object', name='country', length=260)

In [22]:
fert.head()

Unnamed: 0_level_0,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abkhazia,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,7.0,...,6.7,6.46,6.2,5.93,5.66,5.4,5.14,4.9,4.68,4.47
Akrotiri and Dhekelia,,,,,,,,,,,...,,,,,,,,,,
Albania,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,4.6,...,1.85,1.8,1.76,1.74,1.74,1.75,1.76,1.77,1.78,1.78
Algeria,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,6.99,...,2.58,2.66,2.73,2.78,2.82,2.83,2.82,2.8,2.76,2.71


In [23]:
# 2. Same for the life expectancy table: its index is named 'Life expectancy';
#    name it 'country'.
life = life.rename_axis('country')

In [24]:
life.head()

Unnamed: 0_level_0,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abkhazia,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,28.21,28.2,28.19,28.18,28.17,28.16,28.15,28.14,28.13,28.12,...,52.4,52.8,53.3,53.6,54.0,54.4,54.8,54.9,53.8,52.72
Akrotiri and Dhekelia,,,,,,,,,,,...,,,,,,,,,,
Albania,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,...,76.6,76.8,77.0,77.2,77.4,77.5,77.7,77.9,78.0,78.1
Algeria,28.82,28.82,28.82,28.82,28.82,28.82,28.82,28.82,28.82,28.82,...,75.3,75.5,75.7,76.0,76.1,76.2,76.3,76.3,76.4,76.5


In [25]:
# 3. And for the population table: its index is named 'Total population';
#    name it 'country'.
pop = pop.rename_axis('country')

In [26]:
pop.head()

Unnamed: 0_level_0,1800,1810,1820,1830,1840,1850,1860,1870,1880,1890,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abkhazia,,,,,,,,,,,...,,,,,,,,,,
Afghanistan,3280000.0,3280000.0,3323519.0,3448982.0,3625022.0,3810047.0,3973968.0,4169690.0,4419695.0,4710171.0,...,25183615.0,25877544.0,26528741.0,27207291.0,27962207.0,28809167.0,29726803.0,30682500.0,31627506.0,32526562.0
Akrotiri and Dhekelia,,,,,,,,,,,...,15700.0,15700.0,15700.0,,,,,,,
Albania,410445.0,423591.0,438671.0,457234.0,478227.0,506889.0,552800.0,610036.0,672544.0,741688.0,...,3050741.0,3010849.0,2968026.0,2929886.0,2901883.0,2886010.0,2880667.0,2883281.0,2889676.0,2896679.0
Algeria,2503218.0,2595056.0,2713079.0,2880355.0,3082721.0,3299305.0,3536468.0,3811028.0,4143163.0,4525691.0,...,33749328.0,34261971.0,34811059.0,35401790.0,36036159.0,36717132.0,37439427.0,38186135.0,38934334.0,39666519.0


### 2.3. Converting the tables into long format

In [27]:
# Step 1 of the reshape: turn the 'country' index into an ordinary column so
# that melt can use it as the identifier column.
fert = fert.reset_index(drop=False)

In [28]:
# Step 2 of the reshape: unpivot the year columns. Every (country, year) pair
# becomes its own row, with the former column label in 'year' and the cell
# value in 'fertility_rate'.
fert = pd.melt(fert, id_vars=['country'], var_name='year', value_name='fertility_rate')

In [29]:
fert.head()

Unnamed: 0,country,year,fertility_rate
0,Abkhazia,1800,
1,Afghanistan,1800,7.0
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,4.6
4,Algeria,1800,6.99


In [30]:
# Life expectancy, step 1: move the 'country' index into a regular column.
life = life.reset_index(drop=False)

In [31]:
# Life expectancy, step 2: one row per (country, year); the value column is
# called 'life-expectancy'.
life = pd.melt(life, id_vars=['country'], var_name='year', value_name='life-expectancy')

In [32]:
life.head()

Unnamed: 0,country,year,life-expectancy
0,Abkhazia,1800,
1,Afghanistan,1800,28.21
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,35.4
4,Algeria,1800,28.82


In [33]:
# Population, step 1: move the 'country' index into a regular column.
pop = pop.reset_index(drop=False)

In [34]:
# Population, step 2: one row per (country, year); the value column is called
# 'population'.
pop = pd.melt(pop, id_vars=['country'], var_name='year', value_name='population')

In [35]:
pop.head()

Unnamed: 0,country,year,population
0,Abkhazia,1800,
1,Afghanistan,1800,3280000.0
2,Akrotiri and Dhekelia,1800,
3,Albania,1800,410445.0
4,Algeria,1800,2503218.0


### 2.4. Merging tables

In [36]:
# Combine fertility rate and population. The two long tables share exactly the
# columns 'country' and 'year', which are spelled out as join keys here. The
# inner join keeps only (country, year) pairs that occur in both tables.
df = pd.merge(fert, pop, how='inner', on=['country', 'year'])

In [37]:
df

Unnamed: 0,country,year,fertility_rate,population
0,Abkhazia,1800,,
1,Afghanistan,1800,7.00,3280000.0
2,Akrotiri and Dhekelia,1800,,
3,Albania,1800,4.60,410445.0
4,Algeria,1800,6.99,2503218.0
...,...,...,...,...
20974,Yugoslavia,2015,,
20975,Zambia,2015,5.59,16211767.0
20976,Zimbabwe,2015,3.35,15602751.0
20977,Åland,2015,,


In [38]:
# Add life expectancy, again joining on the shared 'country' and 'year'
# columns and keeping only pairs present on both sides.
df = pd.merge(df, life, how='inner', on=['country', 'year'])

In [39]:
df

Unnamed: 0,country,year,fertility_rate,population,life-expectancy
0,Abkhazia,1800,,,
1,Afghanistan,1800,7.00,3280000.0,28.21
2,Akrotiri and Dhekelia,1800,,,
3,Albania,1800,4.60,410445.0,35.40
4,Algeria,1800,6.99,2503218.0,28.82
...,...,...,...,...,...
20974,Yugoslavia,2015,,,
20975,Zambia,2015,5.59,16211767.0,56.70
20976,Zimbabwe,2015,3.35,15602751.0,59.30
20977,Åland,2015,,,


In [40]:
# Attach the continent of every country. 'country' is the only column the two
# tables have in common. Because this is an inner join, countries that do not
# appear in the continent table are removed from the result.
df = pd.merge(df, continent, how='inner', on='country')

In [41]:
df

Unnamed: 0,country,year,fertility_rate,population,life-expectancy,continent
0,Afghanistan,1800,7.00,3280000.0,28.21,Asia
1,Afghanistan,1810,7.00,3280000.0,28.11,Asia
2,Afghanistan,1820,7.00,3323519.0,28.01,Asia
3,Afghanistan,1830,7.00,3448982.0,27.90,Asia
4,Afghanistan,1840,7.00,3625022.0,27.80,Asia
...,...,...,...,...,...,...
14170,Zimbabwe,2011,3.64,14255592.0,51.60,Africa
14171,Zimbabwe,2012,3.56,14565482.0,54.20,Africa
14172,Zimbabwe,2013,3.49,14898092.0,55.70,Africa
14173,Zimbabwe,2014,3.41,15245855.0,57.00,Africa


## 3. Creating the animated scatterplot

### 3.1. Creating the scatterplot for 1 selected year

In [42]:
# Pick one year and keep only that year's rows.
# NOTE: the frame is called df_2015 although it holds the rows of whichever
# year is selected here (currently 1960).
year = 1960
df_2015 = df[df['year'].eq(year)]

In [43]:
# Bubble chart for the selected year: fertility rate on x, life expectancy on
# y, bubble size driven by population, colour by continent.
# Optional extras: pass text="country" to px.scatter to label the bubbles, or
# export this frame with fig.write_html(...) / fig.write_image(...).
fig = px.scatter(
    df_2015,
    x='fertility_rate',
    y='life-expectancy',
    color='continent',
    size='population',
    size_max=120,
    hover_data={"continent": False},
    title=f'Population of countries in {year}',
    custom_data=['country', 'population'],
    height=800,
    width=1000,
)

# Hover box built from custom_data (country, population) and the x/y values.
hover_template = (
    'Country: %{customdata[0]} <br>'
    'Population: %{customdata[1]:,} <br>'
    'Fertility rate: %{x} <br>'
    'Life expectancy: %{y:.1f}<extra></extra>'
)
fig.update_traces(
    textposition='top center',
    marker_sizemin=5,
    hovertemplate=hover_template,
)

# Axis titles, fixed axis ranges and a horizontal legend above the plot area
# whose markers are drawn at a constant size.
fig.update_layout(
    xaxis_title='Fertility Rate',
    yaxis_title='Life Expectancy',
    xaxis_range=[-0.5, 10],
    yaxis_range=[20, 90],
    legend_title_text='',
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
        itemsizing='constant',
    ),
)
fig.show()

### 3.2. Creating one image per year (1960–2015)

In [44]:
# Render one PNG frame per year from 1960 to 2015 (inclusive). Each frame is
# written to images/lifeexp_<year>.png.

# Make sure the output folder exists before the first frame is written, so the
# export also works on a fresh checkout that has no images/ directory yet.
os.makedirs('images', exist_ok=True)

# Settings that are identical for every frame are built once, outside the loop.
# Hover box built from custom_data (country, population) and the x/y values.
hover_template = (
    'Country: %{customdata[0]} <br>'
    'Population: %{customdata[1]:,} <br>'
    'Fertility rate: %{x} <br>'
    'Life expectancy: %{y:.1f}<extra></extra>'
)
# The axis ranges are fixed so that all frames share the same coordinate
# system; the legend sits horizontally above the plot with constant-size markers.
frame_layout = dict(
    xaxis_title='Fertility Rate',
    yaxis_title='Life Expectancy',
    xaxis_range=[-0.5, 10],
    yaxis_range=[20, 90],
    legend_title_text='',
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
        itemsizing='constant',
    ),
)

for year in range(1960, 2016):
    # Rows of the current year only.
    df_subset = df.loc[df['year'] == year]
    fig = px.scatter(
        df_subset,
        x='fertility_rate',
        y='life-expectancy',
        color='continent',
        size='population',
        size_max=120,
        hover_data={"continent": False},
        title=f'Population of countries in {year}',
        custom_data=['country', 'population'],
        height=800,
        width=1100,
    )
    fig.update_traces(marker_sizemin=5, hovertemplate=hover_template)
    fig.update_layout(**frame_layout)
    fig.write_image(f'images/lifeexp_{year}.png', scale=1)

### 3.3. Generating a gif animation

In [45]:
# Assemble the animation: read the yearly PNG frames (1960 to 2015, in
# chronological order) and write them out as a single GIF at 12 frames per
# second.
frame_paths = [f'images/lifeexp_{year}.png' for year in range(1960, 2016)]
images = [imageio.imread(path) for path in frame_paths]

imageio.mimsave('images/animation.gif', images, fps=12)

![output](images/animation.gif "output")