In [164]:
import pandas as pd
import plotly.express as px

In [165]:
df = pd.read_csv("gapminder_after1952.csv")

In [166]:
df.head(20)

Unnamed: 0,country,continent,year,lifeExp,population,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,AFG,4
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,AFG,4
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,AFG,4
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,AFG,4
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,AFG,4
5,Afghanistan,Asia,1977,38.438,14880372,786.11336,AFG,4
6,Afghanistan,Asia,1982,39.854,12881816,978.011439,AFG,4
7,Afghanistan,Asia,1987,40.822,13867957,852.395945,AFG,4
8,Afghanistan,Asia,1992,41.674,16317921,649.341395,AFG,4
9,Afghanistan,Asia,1997,41.763,22227415,635.341351,AFG,4


In [167]:
df.shape

(1692, 8)

In [168]:
df.country.unique().shape

(141,)

In [169]:
df.year.unique()

array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
       2007])

In [170]:
df.year.unique().shape

(12,)

In [171]:
141 * 12

1692

In [172]:
df.shape

(1692, 8)

In [173]:
#check for null values 

In [174]:
df.isnull().sum()

country       0
continent     0
year          0
lifeExp       0
population    0
gdpPercap     0
iso_alpha     0
iso_num       0
dtype: int64

### Let's create database for each country 

In [175]:
countries = df[['country', 'iso_alpha', 'continent']].drop_duplicates()

In [176]:
countries.head()


Unnamed: 0,country,iso_alpha,continent
0,Afghanistan,AFG,Asia
12,Albania,ALB,Europe
24,Algeria,DZA,Africa
36,Angola,AGO,Africa
48,Argentina,ARG,Americas


In [177]:
countries.shape

(141, 3)

### EDA on the dataset is always informative  

In [178]:
fig = px.histogram(df, x="lifeExp", histnorm='percent', 
                  title="Life Expectancy")
fig.show()

In [179]:
fig = px.histogram(df, x="population", histnorm='percent', 
                  title="Population")
fig.show()

In [180]:
#are there values equal to zero?? 

In [181]:
#outliers? error values? 

In [182]:
df[df.population == 0].shape

(0, 8)

### Let's visualize the trajectories of a two countries

Create a temporary dataset that contains only two countries

In [183]:
tmp = df.query("country in ['United States', 'India']")

In [184]:
tmp

Unnamed: 0,country,continent,year,lifeExp,population,gdpPercap,iso_alpha,iso_num
696,India,Asia,1952,37.373,372000000,546.565749,IND,356
697,India,Asia,1957,40.249,409000000,590.061996,IND,356
698,India,Asia,1962,43.605,454000000,658.347151,IND,356
699,India,Asia,1967,47.193,506000000,700.770611,IND,356
700,India,Asia,1972,50.651,567000000,724.032527,IND,356
701,India,Asia,1977,54.208,634000000,813.337323,IND,356
702,India,Asia,1982,56.596,708000000,855.723538,IND,356
703,India,Asia,1987,58.553,788000000,976.512676,IND,356
704,India,Asia,1992,60.223,872000000,1164.406809,IND,356
705,India,Asia,1997,61.765,959000000,1458.817442,IND,356


One should improve the chart below by providing better labels and titles

In [185]:
fig = px.line(tmp, x="population", y="lifeExp", color="country", text="year")

fig.update_traces(textposition="bottom right")
fig.show()

## Prepare the dataset for HD visualization and cluster Analyses 

Each country should be a line. Columns should be income and life expectancy 

In [186]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,population,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,AFG,4
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,AFG,4
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,AFG,4
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,AFG,4
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,AFG,4


In [187]:
lifeExp = df[['country', 'year', 'lifeExp']].set_index(['country', 'year']).unstack()
lifeExp

Unnamed: 0_level_0,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp
year,1952,1957,1962,1967,1972,1977,1982,1987,1992,1997,2002,2007
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Afghanistan,28.801,30.332,31.997,34.020,36.088,38.438,39.854,40.822,41.674,41.763,42.129,43.828
Albania,55.230,59.280,64.820,66.220,67.690,68.930,70.420,72.000,71.581,72.950,75.651,76.423
Algeria,43.077,45.685,48.303,51.407,54.518,58.014,61.368,65.799,67.744,69.152,70.994,72.301
Angola,30.015,31.999,34.000,35.985,37.928,39.483,39.942,39.906,40.647,40.963,41.003,42.731
Argentina,62.485,64.399,65.142,65.634,67.065,68.481,69.942,70.774,71.868,73.275,74.340,75.320
...,...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,40.412,42.887,45.363,47.838,50.254,55.764,58.816,62.820,67.662,70.672,73.017,74.249
West Bank and Gaza,43.160,45.671,48.127,51.631,56.532,60.765,64.406,67.046,69.718,71.096,72.370,73.422
"Yemen, Rep.",32.548,33.970,35.180,36.984,39.848,44.175,49.113,52.922,55.599,58.020,60.308,62.698
Zambia,42.038,44.077,46.023,47.768,50.107,51.386,51.821,50.821,46.100,40.238,39.193,42.384


In [188]:
population = df[['country', 'year', 'population']].set_index(['country', 'year']).unstack()
gdpPercap

Unnamed: 0_level_0,population,population,population,population,population,population,population,population,population,population,population,population
year,1952,1957,1962,1967,1972,1977,1982,1987,1992,1997,2002,2007
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Afghanistan,8425333,9240934,10267083,11537966,13079460,14880372,12881816,13867957,16317921,22227415,25268405,31889923
Albania,1282697,1476505,1728137,1984060,2263554,2509048,2780097,3075321,3326498,3428038,3508512,3600523
Algeria,9279525,10270856,11000948,12760499,14760787,17152804,20033753,23254956,26298373,29072015,31287142,33333216
Angola,4232095,4561361,4826015,5247469,5894858,6162675,7016384,7874230,8735988,9875024,10866106,12420476
Argentina,17876956,19610538,21283783,22934225,24779799,26983828,29341374,31620918,33958947,36203463,38331121,40301927
...,...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,26246839,28998543,33796140,39463910,44655014,50533506,56142181,62826491,69940728,76048996,80908147,85262356
West Bank and Gaza,1030585,1070439,1133134,1142636,1089572,1261091,1425876,1691210,2104779,2826046,3389578,4018332
"Yemen, Rep.",4963829,5498090,6120081,6740785,7407075,8403990,9657618,11219340,13367997,15826497,18701257,22211743
Zambia,2672000,3016000,3421000,3900000,4506497,5216550,6100407,7272406,8381163,9417789,10595811,11746035


Create a new dataset that that concatenates the life expectancy and GDP Percentage Cap 

In [189]:
df2 = pd.concat([lifeExp,population], axis=1) 

In [190]:
df2.head()

Unnamed: 0_level_0,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,...,population,population,population,population,population,population,population,population,population,population
year,1952,1957,1962,1967,1972,1977,1982,1987,1992,1997,...,1962,1967,1972,1977,1982,1987,1992,1997,2002,2007
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Afghanistan,28.801,30.332,31.997,34.02,36.088,38.438,39.854,40.822,41.674,41.763,...,10267083,11537966,13079460,14880372,12881816,13867957,16317921,22227415,25268405,31889923
Albania,55.23,59.28,64.82,66.22,67.69,68.93,70.42,72.0,71.581,72.95,...,1728137,1984060,2263554,2509048,2780097,3075321,3326498,3428038,3508512,3600523
Algeria,43.077,45.685,48.303,51.407,54.518,58.014,61.368,65.799,67.744,69.152,...,11000948,12760499,14760787,17152804,20033753,23254956,26298373,29072015,31287142,33333216
Angola,30.015,31.999,34.0,35.985,37.928,39.483,39.942,39.906,40.647,40.963,...,4826015,5247469,5894858,6162675,7016384,7874230,8735988,9875024,10866106,12420476
Argentina,62.485,64.399,65.142,65.634,67.065,68.481,69.942,70.774,71.868,73.275,...,21283783,22934225,24779799,26983828,29341374,31620918,33958947,36203463,38331121,40301927


### The new dataset has variables with different scales. It needs to be standardized or normalized. 

In [191]:
df2.shape 

(141, 24)

In [192]:
from sklearn.preprocessing import StandardScaler

In [193]:
df2_scaled =  StandardScaler().fit(df2).transform(df2)

df2_scaled is just an array. Let's put in a dataframe

In [194]:
df2_scaled = pd.DataFrame(df2_scaled, columns=df2.columns, index=df2.index)

In [195]:
df2_scaled.head()

Unnamed: 0_level_0,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,lifeExp,...,population,population,population,population,population,population,population,population,population,population
year,1952,1957,1962,1967,1972,1977,1982,1987,1992,1997,...,1962,1967,1972,1977,1982,1987,1992,1997,2002,2007
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Afghanistan,-1.654754,-1.729224,-1.784572,-1.84659,-1.893195,-1.881147,-2.012301,-2.121891,-2.00276,-2.011336,...,-0.147578,-0.143924,-0.138601,-0.133232,-0.166832,-0.169025,-0.160026,-0.126518,-0.116954,-0.084197
Albania,0.509149,0.639903,0.931813,0.906854,0.891091,0.842129,0.833991,0.843049,0.670225,0.695627,...,-0.269968,-0.265858,-0.260645,-0.260176,-0.262974,-0.263098,-0.264402,-0.267463,-0.271489,-0.275886
Algeria,-0.485891,-0.472722,-0.435111,-0.359816,-0.269425,-0.132792,-0.008927,0.253351,0.327287,0.365969,...,-0.137059,-0.128322,-0.119629,-0.109915,-0.098764,-0.087204,-0.079842,-0.075201,-0.07421,-0.074417
Angola,-1.555356,-1.592795,-1.618806,-1.678561,-1.731083,-1.787817,-2.004106,-2.209,-2.09455,-2.080775,...,-0.225566,-0.224208,-0.21967,-0.222686,-0.222655,-0.221268,-0.220941,-0.219128,-0.219237,-0.216122
Argentina,1.10316,1.058846,0.958461,0.856745,0.836025,0.802028,0.78948,0.72646,0.695876,0.723837,...,0.010327,0.001523,-0.006577,-0.009037,-0.010179,-0.014282,-0.018295,-0.021734,-0.024185,-0.027197


In [196]:
df2[df2_scaled.columns[0]]

country
Afghanistan           28.801
Albania               55.230
Algeria               43.077
Angola                30.015
Argentina             62.485
                       ...  
Vietnam               40.412
West Bank and Gaza    43.160
Yemen, Rep.           32.548
Zambia                42.038
Zimbabwe              48.451
Name: (lifeExp, 1952), Length: 141, dtype: float64

In [197]:
df2_scaled[df2_scaled.columns[0]]

country
Afghanistan          -1.654754
Albania               0.509149
Algeria              -0.485891
Angola               -1.555356
Argentina             1.103160
                        ...   
Vietnam              -0.704091
West Bank and Gaza   -0.479095
Yemen, Rep.          -1.347964
Zambia               -0.570960
Zimbabwe             -0.045889
Name: (lifeExp, 1952), Length: 141, dtype: float64

In [198]:
df2_scaled[df2_scaled.columns[0]].std()

1.0035650736961987

#### It is a good idea to know how many PCA components you need to describe most of the variability of the dataset but it is not necessary. 

In [199]:
from sklearn.decomposition import PCA

In [200]:
pca = PCA(n_components = 2).fit(df2_scaled)

In [201]:
pca.explained_variance_ratio_.sum()

0.9607575744280017

### Project the data using MDS

In [202]:
from sklearn.manifold import MDS

In [203]:
coords_MDS = MDS(n_components=2).fit_transform(df2_scaled)

In [204]:
coords_MDS

array([[ 2.74763182e+00,  6.05488912e+00],
       [-7.09703239e-01, -2.66223814e+00],
       [ 1.12195057e+00, -3.53110217e-01],
       [ 2.53680228e+00,  6.05150697e+00],
       [-1.45009720e+00, -2.50185134e+00],
       [-1.81965377e+00, -4.22547723e+00],
       [-1.46180730e+00, -3.87896715e+00],
       [ 3.94443173e-02, -2.03977477e+00],
       [ 3.87391260e+00,  1.46561250e+00],
       [-1.66711904e+00, -3.97677909e+00],
       [ 1.60157019e+00,  2.82076072e+00],
       [ 1.88122648e+00,  1.47535542e+00],
       [-6.17393245e-01, -2.46104661e+00],
       [-6.86745249e-01,  1.88958393e+00],
       [ 2.98196418e+00, -2.54158633e+00],
       [-1.64225759e+00, -2.72675084e+00],
       [ 2.00343318e+00,  3.94657497e+00],
       [ 1.51427846e+00,  4.28549215e+00],
       [ 6.56748405e-01,  3.68760769e+00],
       [ 1.21725829e+00,  3.15360284e+00],
       [-2.08668026e+00, -4.18493184e+00],
       [ 1.44900287e+00,  4.51826291e+00],
       [ 1.55440738e+00,  3.51244753e+00],
       [-2.

Create a dataset that contains all projections

In [205]:
projections = pd.DataFrame(index=df2_scaled.index)

In [206]:
projections['MDS_x'] = coords_MDS[:,0]
projections['MDS_y'] = coords_MDS[:,1]

In [207]:
projections

Unnamed: 0_level_0,MDS_x,MDS_y
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,2.747632,6.054889
Albania,-0.709703,-2.662238
Algeria,1.121951,-0.353110
Angola,2.536802,6.051507
Argentina,-1.450097,-2.501851
...,...,...
Vietnam,2.410722,-0.408029
West Bank and Gaza,1.104682,-0.784360
"Yemen, Rep.",3.085812,2.759421
Zambia,2.851664,3.710203


Add country information to the projections dataset 

In [208]:
projections = projections.merge(countries, left_index=True, right_on="country")

In [209]:
projections.head()

Unnamed: 0,MDS_x,MDS_y,country,iso_alpha,continent
0,2.747632,6.054889,Afghanistan,AFG,Asia
12,-0.709703,-2.662238,Albania,ALB,Europe
24,1.121951,-0.35311,Algeria,DZA,Africa
36,2.536802,6.051507,Angola,AGO,Africa
48,-1.450097,-2.501851,Argentina,ARG,Americas


In [210]:
fig = px.scatter(projections, x="MDS_x", y="MDS_y", hover_name="country", 
                  color="continent",
                 width=800, height=800, 
                 title="MDS projection of the country trajectories")

fig.update_yaxes(
    scaleanchor = "x",
    scaleratio = 1,
  )



fig.show()

### Project the countries using TSNE

In [211]:
from sklearn.manifold import TSNE

In [212]:
tsne = TSNE(perplexity=30)

tsne_coords = tsne.fit_transform(df2_scaled)

projections['tsne_x'] = tsne_coords[:,0]
projections['tsne_y'] = tsne_coords[:,1]



The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



In [213]:
projections.head()

Unnamed: 0,MDS_x,MDS_y,country,iso_alpha,continent,tsne_x,tsne_y
0,2.747632,6.054889,Afghanistan,AFG,Asia,-6.123759,-7.739768
12,-0.709703,-2.662238,Albania,ALB,Europe,9.789649,3.811364
24,1.121951,-0.35311,Algeria,DZA,Africa,3.762601,-0.42865
36,2.536802,6.051507,Angola,AGO,Africa,-5.838491,-7.917568
48,-1.450097,-2.501851,Argentina,ARG,Americas,9.321169,6.12031


In [214]:
fig = px.scatter(projections, x="tsne_x", y="tsne_y", hover_name="iso_alpha", color="continent",
                 width=800, height=800, 
                title="TNSE projection of the country trajectories")


fig.update_yaxes(
    scaleanchor = "x",
    scaleratio = 1,
  )



fig.show()

In [215]:
df.population.min()

60011

In [216]:
df.lifeExp.min()

23.599

In [217]:
projections.to_clipboard()

### Select a number of countries from different portions of the MDS visualize their paths

In [218]:
tmp = df.query("iso_alpha in ['GHA', 'MRT',  'AUS', 'DEU',  'LKA', 'JPN', 'ITA', 'SAU', 'USA','IND']")
#tmp contains only a portion of the dataset with only the countries in the list above 

fig = px.line(tmp, x="population", y="lifeExp", color="country", 
             title="selected country trajctories from different portions of the MDS and TSNE charts")# text="year")
fig.update_traces(textposition="bottom right")

fig.update_xaxes(
    range=[0,1500000000],  # sets the range of xaxis

)
fig.update_yaxes(
    range=[0,85],  # sets the range of xaxis

)

fig.show()

In [219]:
tmp = df.query("iso_alpha in ['GHA', 'MRT',  'AUS', 'DEU',  'LKA', 'JPN', 'ITA', 'SAU', 'USA','IND']")
#tmp contains only a portion of the dataset with only the countries in the list above 

fig = px.line(tmp, x="population", y="lifeExp", color="country", log_x=True, 
             title="country trajectories using a logaritmic x axis")# text="year")
fig.update_traces(textposition="bottom right")


fig.show()

## Try Clustering 

In [220]:
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering

The optimal number of clusters is something to investigate. Here we will just try clustering with some plausible values 

# Applying the spectral clustering algorithm

In [221]:
clustering = SpectralClustering(n_clusters=6,
         assign_labels='discretize',
         random_state=0).fit(df2_scaled)
clustering.labels_

array([1, 3, 1, 1, 3, 5, 5, 3, 5, 5, 0, 0, 3, 0, 3, 3, 1, 1, 0, 0, 5, 1,
       0, 3, 2, 3, 0, 1, 0, 3, 0, 3, 3, 5, 5, 0, 1, 1, 1, 1, 1, 1, 1, 5,
       5, 0, 1, 5, 0, 5, 1, 1, 1, 0, 1, 5, 3, 5, 5, 5, 1, 0, 5, 5, 5, 3,
       5, 1, 0, 3, 3, 3, 0, 1, 1, 0, 1, 3, 1, 0, 3, 3, 0, 3, 1, 1, 0, 0,
       0, 5, 5, 1, 1, 1, 5, 1, 5, 3, 3, 1, 1, 3, 3, 5, 3, 3, 1, 0, 1, 0,
       3, 1, 3, 3, 5, 1, 0, 5, 3, 0, 0, 5, 5, 1, 3, 0, 1, 0, 3, 1, 1, 0,
       5, 5, 3, 3, 1, 1, 0, 0, 0])

The algorithm attaches cluster lable to each country 

In [222]:
projections['scluster6'] = clustering.labels_

In [223]:
projections['scluster6'] =  projections['scluster6'].astype('str')

In [224]:
projections

Unnamed: 0,MDS_x,MDS_y,country,iso_alpha,continent,tsne_x,tsne_y,scluster6
0,2.747632,6.054889,Afghanistan,AFG,Asia,-6.123759,-7.739768,1
12,-0.709703,-2.662238,Albania,ALB,Europe,9.789649,3.811364,3
24,1.121951,-0.353110,Algeria,DZA,Africa,3.762601,-0.428650,1
36,2.536802,6.051507,Angola,AGO,Africa,-5.838491,-7.917568,1
48,-1.450097,-2.501851,Argentina,ARG,Americas,9.321169,6.120310,3
...,...,...,...,...,...,...,...,...
1632,2.410722,-0.408029,Vietnam,VNM,Asia,2.512561,0.393656,1
1644,1.104682,-0.784360,West Bank and Gaza,PSE,Asia,4.896789,-0.938667,1
1656,3.085812,2.759421,"Yemen, Rep.",YEM,Asia,-1.966498,-6.824316,0
1668,2.851664,3.710203,Zambia,ZMB,Africa,-4.995934,-4.512364,0


#### Let's try Kmeans with six clusters


In [225]:
kmeans = KMeans(n_clusters = 7, random_state = 0)

km2 = kmeans.fit(df2_scaled)

km2.labels_

array([6, 0, 3, 6, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 3, 0, 6, 6, 1, 1, 0, 6,
       1, 0, 5, 3, 1, 6, 1, 0, 1, 0, 0, 0, 0, 6, 3, 3, 3, 3, 6, 6, 6, 0,
       0, 1, 6, 0, 1, 0, 3, 6, 6, 1, 3, 0, 0, 0, 2, 3, 3, 3, 0, 0, 0, 0,
       4, 3, 1, 3, 3, 0, 1, 6, 3, 1, 6, 3, 6, 1, 0, 3, 3, 0, 3, 6, 1, 1,
       1, 0, 0, 3, 6, 6, 0, 3, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 6, 3, 3, 1,
       0, 6, 0, 0, 0, 6, 1, 0, 0, 1, 1, 0, 0, 3, 0, 1, 3, 1, 0, 3, 3, 1,
       0, 4, 0, 0, 3, 3, 1, 6, 1], dtype=int32)

In [226]:
projections['kmeans7'] = km2.labels_

In [227]:
projections.head()

Unnamed: 0,MDS_x,MDS_y,country,iso_alpha,continent,tsne_x,tsne_y,scluster6,kmeans7
0,2.747632,6.054889,Afghanistan,AFG,Asia,-6.123759,-7.739768,1,6
12,-0.709703,-2.662238,Albania,ALB,Europe,9.789649,3.811364,3,0
24,1.121951,-0.35311,Algeria,DZA,Africa,3.762601,-0.42865,1,3
36,2.536802,6.051507,Angola,AGO,Africa,-5.838491,-7.917568,1,6
48,-1.450097,-2.501851,Argentina,ARG,Americas,9.321169,6.12031,3,0


In [228]:
projections['kmeans7'] =  projections['kmeans7'].astype('str')

In [229]:
fig = px.scatter(projections, x="MDS_x", y="MDS_y",
                 hover_name="iso_alpha", color="scluster6",
                 width=800, height=800)


fig.update_yaxes(
    scaleanchor = "x",
    scaleratio = 1,
  )



fig.show()

In [230]:
fig = px.scatter(projections, x="MDS_x", y="MDS_y",
                 hover_name="iso_alpha", color="kmeans7",
                 width=800, height=800)


fig.update_yaxes(
    scaleanchor = "x",
    scaleratio = 1,
  )



fig.show()

In [231]:
tmp = df.query("iso_alpha in ['MOZ', 'COM', 'IRN', 'JAM',  'ESP', 'NLD', 'USA','IND']")

fig = px.line(tmp, x="population", y="lifeExp", color="country",  log_x=True, width=800, height=400)

fig.update_traces(textposition="bottom right")

fig.show()

### Calculate Cluster Centroids 

#### Add cluster information to the initial dataset

In [232]:
df.head(20)

Unnamed: 0,country,continent,year,lifeExp,population,gdpPercap,iso_alpha,iso_num
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,AFG,4
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,AFG,4
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,AFG,4
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,AFG,4
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,AFG,4
5,Afghanistan,Asia,1977,38.438,14880372,786.11336,AFG,4
6,Afghanistan,Asia,1982,39.854,12881816,978.011439,AFG,4
7,Afghanistan,Asia,1987,40.822,13867957,852.395945,AFG,4
8,Afghanistan,Asia,1992,41.674,16317921,649.341395,AFG,4
9,Afghanistan,Asia,1997,41.763,22227415,635.341351,AFG,4


In [233]:
df3 = df.merge(projections[['country', 'scluster6']], on="country")

In [234]:
df3.head(20)

Unnamed: 0,country,continent,year,lifeExp,population,gdpPercap,iso_alpha,iso_num,scluster6
0,Afghanistan,Asia,1952,28.801,8425333,779.445314,AFG,4,1
1,Afghanistan,Asia,1957,30.332,9240934,820.85303,AFG,4,1
2,Afghanistan,Asia,1962,31.997,10267083,853.10071,AFG,4,1
3,Afghanistan,Asia,1967,34.02,11537966,836.197138,AFG,4,1
4,Afghanistan,Asia,1972,36.088,13079460,739.981106,AFG,4,1
5,Afghanistan,Asia,1977,38.438,14880372,786.11336,AFG,4,1
6,Afghanistan,Asia,1982,39.854,12881816,978.011439,AFG,4,1
7,Afghanistan,Asia,1987,40.822,13867957,852.395945,AFG,4,1
8,Afghanistan,Asia,1992,41.674,16317921,649.341395,AFG,4,1
9,Afghanistan,Asia,1997,41.763,22227415,635.341351,AFG,4,1


#### Calculate the average life expectancy and GDP per capita per year

In [235]:
cluster_centroidss = df3[['year', 'scluster6', 'lifeExp',
                         'population']].groupby(['scluster6', 'year']).mean().reset_index()

In [236]:
cluster_centroidss

Unnamed: 0,scluster6,year,lifeExp,population
0,0,1952,40.398788,3989774.0
1,0,1957,42.707242,4443013.0
2,0,1962,44.939879,4991047.0
3,0,1967,47.231727,5657290.0
4,0,1972,49.398455,6418620.0
5,0,1977,51.434667,7284743.0
6,0,1982,54.255939,8335121.0
7,0,1987,56.140273,9679854.0
8,0,1992,56.727,11038720.0
9,0,1997,55.651848,12436870.0


In [237]:
fig = px.line(cluster_centroidss, x="population", y="lifeExp", color="scluster6",  log_x=True, 
             title="Cluster Trajectories. Each line represents the population and life expectancy by year",
             width=800, height=600)

fig.update_traces(textposition="bottom right")


fig.show()

In [238]:
projections

Unnamed: 0,MDS_x,MDS_y,country,iso_alpha,continent,tsne_x,tsne_y,scluster6,kmeans7
0,2.747632,6.054889,Afghanistan,AFG,Asia,-6.123759,-7.739768,1,6
12,-0.709703,-2.662238,Albania,ALB,Europe,9.789649,3.811364,3,0
24,1.121951,-0.353110,Algeria,DZA,Africa,3.762601,-0.428650,1,3
36,2.536802,6.051507,Angola,AGO,Africa,-5.838491,-7.917568,1,6
48,-1.450097,-2.501851,Argentina,ARG,Americas,9.321169,6.120310,3,0
...,...,...,...,...,...,...,...,...,...
1632,2.410722,-0.408029,Vietnam,VNM,Asia,2.512561,0.393656,1,3
1644,1.104682,-0.784360,West Bank and Gaza,PSE,Asia,4.896789,-0.938667,1,3
1656,3.085812,2.759421,"Yemen, Rep.",YEM,Asia,-1.966498,-6.824316,0,1
1668,2.851664,3.710203,Zambia,ZMB,Africa,-4.995934,-4.512364,0,6


In [239]:
projections.to_csv("gapminder_after1952_projections.csv")