# Data Exploration

In [1]:
# global imports
import numpy as np
import pandas as pd
import altair as alt
from sklearn.preprocessing import StandardScaler

# supress scientific notation
pd.options.display.float_format = '{:.2f}'.format
# show all columns
pd.set_option('display.max_columns', None)

## Import Data

In [2]:
# import data
%store -r dfs

In [3]:
# output train
train = dfs[0]
train.head()

Unnamed: 0,Year,Status,LifeExpectancy,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,Thinness5_9,IncomeComposition,Schooling,Region
2453,2001-01-01,Developing,72.7,1.68,53.61,98.0,309,14.6,98.0,3.81,98.0,0.1,837.7,18797.0,15.4,15.6,0.69,12.5,South Asia
377,2006-01-01,Developing,76.3,0.67,24.98,99.0,0,32.2,94.0,2.24,97.0,0.1,398.97,609480.5,6.3,5.8,0.84,14.3,East Asia & Pacific
1802,2009-01-01,Developing,62.4,7.99,575.67,83.5,4076,31.0,83.0,8.5,83.0,8.7,4153.5,21374.0,1.9,1.9,0.6,11.4,Sub-Saharan Africa
2786,2005-01-01,Developing,52.2,3.94,0.0,9.0,23,18.1,91.0,4.66,9.0,10.0,459.37,1681495.0,7.7,7.6,0.44,9.2,Sub-Saharan Africa
2583,2015-01-01,Developing,75.7,1.74,0.0,92.0,1,6.3,92.0,6.85,91.0,0.1,5330.86,1518566.0,2.1,2.1,0.75,12.9,Europe & Central Asia


In [4]:
# output test
test = dfs[1]
test.head()

Unnamed: 0,Year,Status,LifeExpectancy,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,Thinness5_9,IncomeComposition,Schooling,Region
2008,2012-01-01,Developing,74.9,5.14,885.99,95.0,0,53.6,94.0,5.18,95.0,0.1,6387.79,3158966.0,1.1,1.1,0.72,13.4,Latin America & Caribbean
825,2007-01-01,Developing,71.2,2.83,52.3,99.0,0,5.8,99.0,6.32,99.0,0.3,334.84,683475.0,1.8,1.7,0.66,12.9,Latin America & Caribbean
605,2002-01-01,Developing,59.5,0.08,29.42,54.5,0,18.0,98.0,3.39,89.0,0.1,433.27,569479.0,7.7,7.7,0.0,8.8,Sub-Saharan Africa
2844,2011-01-01,Developing,71.2,0.85,457.97,63.0,0,49.9,65.0,3.85,65.0,0.1,3275.92,241871.0,1.5,1.4,0.59,10.8,East Asia & Pacific
1861,2014-01-01,Developing,74.5,3.55,473.12,98.0,0,53.2,99.0,9.4,98.0,0.1,1975.46,613997.0,1.8,1.7,0.64,11.6,Latin America & Caribbean


## Data Visualization

### Time Series of `Year` Grouped by `Region`

In [5]:
# create line chart
alt.Chart(train).mark_line().encode(
    x=alt.X('Year:T',axis=alt.Axis(format=('%Y'))),
    y=alt.Y('mean(LifeExpectancy):Q',scale=alt.Scale(zero=False)),
    color='Region'
).properties(
    width=600,
    height=300
).display()

The time series plot shows that life expectancy increases slightly as time increases. On a broader scale, life expectancy is definitely correlated with time as new medicines and new technologies get created.

However, for the scope of this project, I chose to assume that the data is uncorrelated with time and ignore the `Year` column. This assumption lets me build machine learning models. 

In [6]:
# drop year column
train.drop(columns=['Year'],inplace=True)
# output data frame
train.head()

Unnamed: 0,Status,LifeExpectancy,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,Thinness5_9,IncomeComposition,Schooling,Region
2453,Developing,72.7,1.68,53.61,98.0,309,14.6,98.0,3.81,98.0,0.1,837.7,18797.0,15.4,15.6,0.69,12.5,South Asia
377,Developing,76.3,0.67,24.98,99.0,0,32.2,94.0,2.24,97.0,0.1,398.97,609480.5,6.3,5.8,0.84,14.3,East Asia & Pacific
1802,Developing,62.4,7.99,575.67,83.5,4076,31.0,83.0,8.5,83.0,8.7,4153.5,21374.0,1.9,1.9,0.6,11.4,Sub-Saharan Africa
2786,Developing,52.2,3.94,0.0,9.0,23,18.1,91.0,4.66,9.0,10.0,459.37,1681495.0,7.7,7.6,0.44,9.2,Sub-Saharan Africa
2583,Developing,75.7,1.74,0.0,92.0,1,6.3,92.0,6.85,91.0,0.1,5330.86,1518566.0,2.1,2.1,0.75,12.9,Europe & Central Asia


In [7]:
# drop year column
test.drop(columns=['Year'],inplace=True)
# output data frame
test.head()

Unnamed: 0,Status,LifeExpectancy,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,Thinness5_9,IncomeComposition,Schooling,Region
2008,Developing,74.9,5.14,885.99,95.0,0,53.6,94.0,5.18,95.0,0.1,6387.79,3158966.0,1.1,1.1,0.72,13.4,Latin America & Caribbean
825,Developing,71.2,2.83,52.3,99.0,0,5.8,99.0,6.32,99.0,0.3,334.84,683475.0,1.8,1.7,0.66,12.9,Latin America & Caribbean
605,Developing,59.5,0.08,29.42,54.5,0,18.0,98.0,3.39,89.0,0.1,433.27,569479.0,7.7,7.7,0.0,8.8,Sub-Saharan Africa
2844,Developing,71.2,0.85,457.97,63.0,0,49.9,65.0,3.85,65.0,0.1,3275.92,241871.0,1.5,1.4,0.59,10.8,East Asia & Pacific
1861,Developing,74.5,3.55,473.12,98.0,0,53.2,99.0,9.4,98.0,0.1,1975.46,613997.0,1.8,1.7,0.64,11.6,Latin America & Caribbean


### Counts of `Status` Levels

In [8]:
# create table
pd.DataFrame(train.groupby('Status')['Status'].agg('count')).rename(columns={'Status':'Count'})

Unnamed: 0_level_0,Count
Status,Unnamed: 1_level_1
Developed,378
Developing,1824


### Counts of `Region` Levels

***Table***

In [9]:
# create table
pd.DataFrame(train.groupby('Region')['Region'].agg('count')).rename(columns={'Region':'Count'})

Unnamed: 0_level_0,Count
Region,Unnamed: 1_level_1
East Asia & Pacific,301
Europe & Central Asia,576
Latin America & Caribbean,379
Middle East & North Africa,239
North America,25
South Asia,100
Sub-Saharan Africa,582


***Bar Chart***

In [10]:
# create bar chart
alt.Chart(train,title='Bar Chart of Region').mark_bar().encode(
    x=alt.X('Region:N',sort='y'),
    y='count():Q'
).properties(
    width=500,
    height=300
).display()

### Distribution of Numeric Variables

In [11]:
# output distribution of numeric variables
train.describe()

Unnamed: 0,LifeExpectancy,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,Thinness5_9,IncomeComposition,Schooling
count,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0,2202.0
mean,69.17,4.57,765.45,79.65,2584.45,37.87,82.58,5.88,82.26,1.73,7008.78,9942215.15,4.96,4.97,0.63,11.93
std,9.51,4.04,2070.02,25.22,12366.88,19.95,23.32,2.46,23.72,5.02,13840.34,49966587.11,4.47,4.54,0.21,3.29
min,36.3,0.01,0.0,1.0,0.0,1.9,3.0,0.37,2.0,0.1,1.68,34.0,0.1,0.1,0.0,0.0
25%,63.2,0.91,5.31,74.0,0.0,19.3,78.0,4.27,78.0,0.1,459.37,463108.25,1.6,1.6,0.48,10.0
50%,72.1,3.64,64.85,91.0,19.0,42.6,93.0,5.67,93.0,0.1,1783.72,1518566.0,3.5,3.4,0.67,12.2
75%,75.7,7.57,430.45,96.0,394.0,55.9,97.0,7.46,97.0,0.8,5330.86,4652812.0,7.38,7.4,0.78,14.3
max,89.0,17.87,19479.91,99.0,212183.0,87.3,99.0,17.6,99.0,50.6,119172.74,1293859294.0,27.7,28.6,0.94,20.7


In [12]:
# create lists
lst_of_charts = []
columns = train.select_dtypes(include=np.number).columns.tolist()
# create charts
for col in columns:
    chart = alt.Chart(train, title='Histogram of '+str(col)).mark_bar().encode(
        alt.X(col,bin=True),
        y='count():Q'
    ).properties(
        width=165,
        height=165
    )
    lst_of_charts.append(chart)
# output charts
alt.vconcat((lst_of_charts[0]|lst_of_charts[1]|lst_of_charts[2]|lst_of_charts[3]),
            (lst_of_charts[4]|lst_of_charts[5]|lst_of_charts[6]|lst_of_charts[7]),
            (lst_of_charts[8]|lst_of_charts[9]|lst_of_charts[10]|lst_of_charts[11]),
            (lst_of_charts[12]|lst_of_charts[13]|lst_of_charts[14]))

Outliers in variables like `PercentExpenditure`, `Measles`, `Polio`, `HIV/AIDS`, `GDP`, and `Population` may influence analysis. However, I will keep all outliers because it is best to build robust models.

### Violin Plot of `Status` and `LifeExpectancy`

In [13]:
# create violin plot
alt.Chart(train,title='Violin Plot of Status and Life Expectancy').transform_density(
    'LifeExpectancy',
    as_=['LifeExpectancy', 'density'],
    groupby=['Status']
).mark_area(orient='horizontal').encode(
    y=alt.Y('LifeExpectancy:Q',title='Life Expectancy (Years)'),
    color='Status:N',
    x=alt.X(
        'density:Q',
        stack='center',
        impute=None,
        title=None,
        axis=alt.Axis(labels=False, values=[0],grid=False, ticks=True),
    ),
    column=alt.Column(
        'Status:N',
        header=alt.Header(
            titleOrient='bottom',
            labelOrient='bottom',
            labelPadding=0,
        ),
    )
).properties(
    height=300
).configure_facet(
    spacing=0
).configure_view(
    stroke=None
).display()

The violin plot shows that life expectancy is lower for developing nations.

### Density Facet of `Region` and `LifeExpectancy`

In [14]:
# create density facet
alt.Chart(train, title='Life Expectancy by Region').transform_density(
    density='LifeExpectancy',
    as_=['LifeExpectancy','density'],
    groupby=['Region']
).mark_area().encode(
    x=alt.X('LifeExpectancy:Q',title='Life Expectancy'),
    y=alt.Y('density:Q'),facet=alt.Facet('Region:N', columns=4)
).properties(
    height=175,
    width=175
).display()

### Correlation Heatmap of Numeric Variables

In [15]:
# create correlation data
corr_df = train.drop(columns=['Status','Region'])\
               .corr()\
               .stack()\
               .reset_index()\
               .rename(columns={0:'correlation','level_0':'var1','level_1':'var2'})
# output data frame
corr_df.head()

Unnamed: 0,var1,var2,correlation
0,LifeExpectancy,LifeExpectancy,1.0
1,LifeExpectancy,Alcohol,0.4
2,LifeExpectancy,PercentExpenditure,0.38
3,LifeExpectancy,HepatitisB,0.3
4,LifeExpectancy,Measles,-0.17


In [16]:
# create heatmap
alt.Chart(corr_df,title='Correlation Heatmap').mark_rect().encode(
    x=alt.X('var1:N',title='Numeric Variables'),
    y=alt.Y('var2:N',title='Numeric Variables'),
    color=alt.Color('correlation:Q',scale=alt.Scale(scheme='redblue'))
).properties(
    height=350,
    width=350
).display()

Life expectancy has a negative correlation with HIV/AIDS, measles, population size, and thinness. Life expectancy has a positive correlation with alcohol consumption, BMI, diphtheria immunization coverage, GDP, income composition of resources, expenditure on health as a percent of GDP, polio immunization coverage, and schooling. 

#### Examine Variables Related to Weight

In [17]:
# output the correlations of these variables
corr_df[((corr_df['var1']=='Thinness1_19')|(corr_df['var1']=='Thinness5_9')) 
        & ((corr_df['var2']=='Thinness5_9')|(corr_df['var2']=='BMI')|(corr_df['var2']=='LifeExpectancy'))]

Unnamed: 0,var1,var2,correlation
192,Thinness1_19,LifeExpectancy,-0.48
197,Thinness1_19,BMI,-0.54
205,Thinness1_19,Thinness5_9,0.94
208,Thinness5_9,LifeExpectancy,-0.47
213,Thinness5_9,BMI,-0.54
221,Thinness5_9,Thinness5_9,1.0


I examined `Thinness1_19`, `Thinness5_9`, and `BMI` because these variables relate to weight so they may be redundant `Thinness1_19` and `Thinness5_9` have a strong, positive correlation so only one of these variables should be included in the model. `Thinness1_19` has a higher correlation with `LifeExpectancy`, so I will keep this variable. 

In [18]:
# drop Thinness5_9 column
train.drop(columns=['Thinness5_9'],inplace=True)
# output data frame
train.head()

Unnamed: 0,Status,LifeExpectancy,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,IncomeComposition,Schooling,Region
2453,Developing,72.7,1.68,53.61,98.0,309,14.6,98.0,3.81,98.0,0.1,837.7,18797.0,15.4,0.69,12.5,South Asia
377,Developing,76.3,0.67,24.98,99.0,0,32.2,94.0,2.24,97.0,0.1,398.97,609480.5,6.3,0.84,14.3,East Asia & Pacific
1802,Developing,62.4,7.99,575.67,83.5,4076,31.0,83.0,8.5,83.0,8.7,4153.5,21374.0,1.9,0.6,11.4,Sub-Saharan Africa
2786,Developing,52.2,3.94,0.0,9.0,23,18.1,91.0,4.66,9.0,10.0,459.37,1681495.0,7.7,0.44,9.2,Sub-Saharan Africa
2583,Developing,75.7,1.74,0.0,92.0,1,6.3,92.0,6.85,91.0,0.1,5330.86,1518566.0,2.1,0.75,12.9,Europe & Central Asia


In [19]:
# drop Thinness5_9 column
test.drop(columns=['Thinness5_9'],inplace=True)
# output data frame
test.head()

Unnamed: 0,Status,LifeExpectancy,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,IncomeComposition,Schooling,Region
2008,Developing,74.9,5.14,885.99,95.0,0,53.6,94.0,5.18,95.0,0.1,6387.79,3158966.0,1.1,0.72,13.4,Latin America & Caribbean
825,Developing,71.2,2.83,52.3,99.0,0,5.8,99.0,6.32,99.0,0.3,334.84,683475.0,1.8,0.66,12.9,Latin America & Caribbean
605,Developing,59.5,0.08,29.42,54.5,0,18.0,98.0,3.39,89.0,0.1,433.27,569479.0,7.7,0.0,8.8,Sub-Saharan Africa
2844,Developing,71.2,0.85,457.97,63.0,0,49.9,65.0,3.85,65.0,0.1,3275.92,241871.0,1.5,0.59,10.8,East Asia & Pacific
1861,Developing,74.5,3.55,473.12,98.0,0,53.2,99.0,9.4,98.0,0.1,1975.46,613997.0,1.8,0.64,11.6,Latin America & Caribbean


## Principal Component Analysis

### Indicator Variables of Categorical Features

In [20]:
# create a set of indicator variables for categorical variable Region
train_ohe = pd.get_dummies(train, columns=['Region','Status'], drop_first=True)
train_ohe.head()

Unnamed: 0,LifeExpectancy,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,IncomeComposition,Schooling,Region_Europe & Central Asia,Region_Latin America & Caribbean,Region_Middle East & North Africa,Region_North America,Region_South Asia,Region_Sub-Saharan Africa,Status_Developing
2453,72.7,1.68,53.61,98.0,309,14.6,98.0,3.81,98.0,0.1,837.7,18797.0,15.4,0.69,12.5,0,0,0,0,1,0,1
377,76.3,0.67,24.98,99.0,0,32.2,94.0,2.24,97.0,0.1,398.97,609480.5,6.3,0.84,14.3,0,0,0,0,0,0,1
1802,62.4,7.99,575.67,83.5,4076,31.0,83.0,8.5,83.0,8.7,4153.5,21374.0,1.9,0.6,11.4,0,0,0,0,0,1,1
2786,52.2,3.94,0.0,9.0,23,18.1,91.0,4.66,9.0,10.0,459.37,1681495.0,7.7,0.44,9.2,0,0,0,0,0,1,1
2583,75.7,1.74,0.0,92.0,1,6.3,92.0,6.85,91.0,0.1,5330.86,1518566.0,2.1,0.75,12.9,1,0,0,0,0,0,1


In [21]:
# create a set of indicator variables for categorical variable Region
test_ohe = pd.get_dummies(test, columns=['Region','Status'], drop_first=True)
test_ohe.head()

Unnamed: 0,LifeExpectancy,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,IncomeComposition,Schooling,Region_Europe & Central Asia,Region_Latin America & Caribbean,Region_Middle East & North Africa,Region_North America,Region_South Asia,Region_Sub-Saharan Africa,Status_Developing
2008,74.9,5.14,885.99,95.0,0,53.6,94.0,5.18,95.0,0.1,6387.79,3158966.0,1.1,0.72,13.4,0,1,0,0,0,0,1
825,71.2,2.83,52.3,99.0,0,5.8,99.0,6.32,99.0,0.3,334.84,683475.0,1.8,0.66,12.9,0,1,0,0,0,0,1
605,59.5,0.08,29.42,54.5,0,18.0,98.0,3.39,89.0,0.1,433.27,569479.0,7.7,0.0,8.8,0,0,0,0,0,1,1
2844,71.2,0.85,457.97,63.0,0,49.9,65.0,3.85,65.0,0.1,3275.92,241871.0,1.5,0.59,10.8,0,0,0,0,0,0,1
1861,74.5,3.55,473.12,98.0,0,53.2,99.0,9.4,98.0,0.1,1975.46,613997.0,1.8,0.64,11.6,0,1,0,0,0,0,1


### Split Data into Features and Response

In [22]:
# train features and response
train_X = train_ohe.drop(columns=['LifeExpectancy']).copy()
train_y = train_ohe[['LifeExpectancy']].copy()

In [23]:
# test features and response
test_X = test_ohe.drop(columns=['LifeExpectancy']).copy()
test_y = test_ohe[['LifeExpectancy']].copy()

### Scale Data Frame

I made each column have a mean of 0 and a standard deviation of 1 by centering and scaling the data in each column: $z_{Ai}=\frac{x_{Ai}-\mu_A}{\sigma_A}$. To aviod data leakage, I used the mean and standard deviation of the train data on the test set. 

In [24]:
# fit standard scaler
scaler = StandardScaler()
scaler.fit(train_X)
# standardize train data
train_scaled_series = scaler.transform(train_X)
train_X_scaled = pd.DataFrame(train_scaled_series, index=train_X.index, columns=train_X.columns)
# output data frame
train_X_scaled.head()

Unnamed: 0,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,IncomeComposition,Schooling,Region_Europe & Central Asia,Region_Latin America & Caribbean,Region_Middle East & North Africa,Region_North America,Region_South Asia,Region_Sub-Saharan Africa,Status_Developing
2453,-0.72,-0.34,0.73,-0.18,-1.17,0.66,-0.84,0.66,-0.32,-0.45,-0.2,2.33,0.29,0.17,-0.6,-0.46,-0.35,-0.11,4.58,-0.6,0.46
377,-0.97,-0.36,0.77,-0.21,-0.28,0.49,-1.48,0.62,-0.32,-0.48,-0.19,0.3,1.0,0.72,-0.6,-0.46,-0.35,-0.11,-0.22,-0.6,0.46
1802,0.85,-0.09,0.15,0.12,-0.34,0.02,1.06,0.03,1.39,-0.21,-0.2,-0.68,-0.13,-0.16,-0.6,-0.46,-0.35,-0.11,-0.22,1.67,0.46
2786,-0.16,-0.37,-2.8,-0.21,-0.99,0.36,-0.5,-3.09,1.65,-0.47,-0.17,0.61,-0.87,-0.83,-0.6,-0.46,-0.35,-0.11,-0.22,1.67,0.46
2583,-0.7,-0.37,0.49,-0.21,-1.58,0.4,0.39,0.37,-0.32,-0.12,-0.17,-0.64,0.57,0.29,1.68,-0.46,-0.35,-0.11,-0.22,-0.6,0.46


In [25]:
# standardize train data
test_scaled_series = scaler.transform(test_X)
test_X_scaled = pd.DataFrame(test_scaled_series, index=test_X.index, columns=test_X.columns)
# output data frame
test_X_scaled.head()

Unnamed: 0,Alcohol,PercentExpenditure,HepatitisB,Measles,BMI,Polio,TotalExpenditure,Diphtheria,HIV/AIDS,GDP,Population,Thinness1_19,IncomeComposition,Schooling,Region_Europe & Central Asia,Region_Latin America & Caribbean,Region_Middle East & North Africa,Region_North America,Region_South Asia,Region_Sub-Saharan Africa,Status_Developing
2008,0.14,0.06,0.61,-0.21,0.79,0.49,-0.29,0.54,-0.32,-0.04,-0.14,-0.86,0.47,0.45,-0.6,2.19,-0.35,-0.11,-0.22,-0.6,0.46
825,-0.43,-0.34,0.77,-0.21,-1.61,0.7,0.18,0.71,-0.29,-0.48,-0.19,-0.71,0.15,0.29,-0.6,2.19,-0.35,-0.11,-0.22,-0.6,0.46
605,-1.11,-0.36,-1.0,-0.21,-1.0,0.66,-1.01,0.28,-0.32,-0.48,-0.19,0.61,-2.96,-0.95,-0.6,-0.46,-0.35,-0.11,-0.22,1.67,0.46
2844,-0.92,-0.15,-0.66,-0.21,0.6,-0.75,-0.83,-0.73,-0.32,-0.27,-0.19,-0.77,-0.16,-0.34,-0.6,-0.46,-0.35,-0.11,-0.22,-0.6,0.46
1861,-0.25,-0.14,0.73,-0.21,0.77,0.7,1.43,0.66,-0.32,-0.36,-0.19,-0.71,0.05,-0.1,-0.6,2.19,-0.35,-0.11,-0.22,-0.6,0.46


### Singular Value Decomposition

Decompose `train_X_scaled` into $\mathbb{X} = U\Sigma V^T$ where $\mathbb{X}$ is a $2202\times 21$ matrix, $U$ is a $2202\times r$ matrix, $\Sigma$ is a $r\times r$ matrix, and $V^T$ is a $r\times 21$ matrix.

In [26]:
# SVD
u,s,vt = np.linalg.svd(train_X_scaled, full_matrices=False)

In [27]:
u.shape

(2202, 21)

***Singular Value Decomposition and Principal Component Analysis***

The singular value decomposed matrix can be expressed as the principal component matrix $\mathbb{X}=\mathbb{Z}\Phi$ where $\mathbb{Z}=U\Sigma$ are the principal components and $\Phi=V$ are the loadings (weights).

### Scree Plots

***Total Variance***

The total variance is $Var(\mathbb{X})=\sum_{i=1}^p\frac{\lambda_i}{N}$ where $\lambda_i=\Sigma_i^2$ are the variances for each principal component. Note that $\sum_{i=1}^pVar(X_i)=\sum_{i=1}^pVar(Z_i)$

In [28]:
print('Total variance: ',np.sum(s**2)/len(train_X_scaled))

Total variance:  21.00000000000001


***Proportion of Variance Explained***

The proportion of variance explained by each principal component is $PVE_m=\frac{\lambda_m}{\sum_{j=1}^p\lambda_j}$.

In [29]:
# obtain proportion of variance explained
pc_var = pd.DataFrame({'Dimension':[i for i in range(1,22)],
                      'Proportion of Variance Explained':[i**2/sum(s**2) for i in s]})
# create a scree plot
alt.Chart(pc_var, title='Scree Plot').mark_line().encode(
    x = 'Dimension:N',
    y = 'Proportion of Variance Explained:Q'
).properties(
    width=500,
    height=300
).display()

Each principal component accounts for a small proporiton of the total variance. 

***Cumulative Proportion of Variance Explained***

The cumulative proportion of varaince explained by each principal component is $CumPVE_m=\frac{\sum_{i=1}^m\lambda_i}{\sum_{j=1}^p\lambda_j}$.

In [30]:
# obtain cumulative proportion of variance explained
cum_prop = []
for i in range(0,21):
    current = []
    for j in range(0,i+1):
        current.append(s[j]**2/sum(s**2))
    cum_prop.append(sum(current))
# create a data frame for dimension and cumulative proportion of variance explained
pc_var_cum = pd.DataFrame({'Dimension':[i for i in range(1,22)],
                           'Cumulative Proportion of Variance Explained':cum_prop})
# create a cumulative scree plot
alt.Chart(pc_var_cum, title='Cumulative Scree Plot').mark_line().encode(
    x = 'Dimension:N',
    y = 'Cumulative Proportion of Variance Explained:Q'
).properties(
    width=500,
    height=300
).display()

90% of the variance is explained by the first 13 principal components, and 95% of the variance is explained by the first 15 principal components. 

### Loadings

50% of the variance is explained by the first four principal components. I examined the loading matrix for these components.

In [31]:
# create data frame of first four loadings
PCs_4 = pd.DataFrame(columns=['PC','Variable','Loading'])
for k in range(0,4):
    pc = str(k+1)
    PCs_4 = pd.concat([PCs_4,pd.DataFrame({'PC':pc,'Variable':train_X_scaled.columns,'Loading':vt[k,:]})])

In [32]:
# create stacked bar chart
alt.Chart(PCs_4, title='Loadings of First Four PCs').mark_bar().encode(
    x='Variable',
    y='sum(Loading)',
    color='PC'
).properties(
    width=500,
    height=300
).display()

Most variables have significant loadings in all of the four principal components. Therefore, it seems like all variables are important for describing the variance. 

In [33]:
# obtain loadings for PC1
PC_1 = pd.DataFrame({'Variable':train_X_scaled.columns,'Loading':vt[0,:]})
# create chart
alt.Chart(PC_1, title='Loadings of First PC').mark_bar().encode(
    x='Variable',
    y='Loading',
).properties(
    width=500,
    height=300
).display()

Loadings with the same sign are positively correlated, and loadings with opposite signs are negatively correlated. The positive and negative loadings of the first principal component match the positive and negative correlations of each variable with `LifeExpectancy` from the correlation matrix. Therefore, the first principal component captures the correlation of each variable with life expectancy. 

Furthermore, the magnitude of the loadings is similar to the strength of the correlation. For instance, `Schooling` has the largest loading weight and it was highly correlated with `LifeExpectancy`.

### Dimension Reduction

***Reduced Dimension Dataset***

I considered reducing the dimension to first 13 principal components. To do this, I used the formula $\mathbb{X}V=U\Sigma$.

In [34]:
# reduced dimensional dataset
# X @ vt[:r,:].T also reduces the dimension of the dataset
PC_13_series = u[:,:13] @ np.diag(s[:13])
# convert to data frame
PC_13 = pd.DataFrame(PC_13_series, columns=['PC'+str(i) for i in range(1,14)])
# output data frame
PC_13.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13
0,-1.46,2.0,-3.82,-1.21,0.54,0.85,0.16,2.25,0.66,-0.39,1.62,-0.06,0.13
1,0.03,1.61,-0.4,0.04,-0.45,-0.48,0.25,-0.04,-0.12,0.97,1.25,-0.48,-0.24
2,-0.58,-1.0,0.97,-1.19,-0.92,0.86,-0.05,-0.86,0.41,-0.5,-0.14,-0.08,-0.3
3,-3.37,-2.48,0.87,0.46,-0.14,-0.2,-0.15,0.06,1.16,0.34,0.61,1.01,-1.4
4,0.57,0.38,-0.09,-0.9,-0.38,-0.55,-0.53,0.58,-0.59,0.5,-0.27,0.07,-0.77


***Heatmap of `PC_13`***

In [35]:
# create correlation data
PC_corr_df = PC_13.corr()\
               .stack()\
               .reset_index()\
               .rename(columns={0:'correlation','level_0':'var1','level_1':'var2'})
# output data frame
PC_corr_df.head()

Unnamed: 0,var1,var2,correlation
0,PC1,PC1,1.0
1,PC1,PC2,-0.0
2,PC1,PC3,0.0
3,PC1,PC4,-0.0
4,PC1,PC5,-0.0


In [36]:
# create heatmap
alt.Chart(PC_corr_df,title='Correlation Heatmap').mark_rect().encode(
    x=alt.X('var1:N',title='Numeric Variables'),
    y=alt.Y('var2:N',title='Numeric Variables'),
    color=alt.Color('correlation:Q',scale=alt.Scale(scheme='blues'))
).properties(
    height=350,
    width=350
).display()

The heatmap shows that all principal components are uncorrelated with each other. 

***Conclusion***

I chose not to reduce the dimension of the dataset for machine learning. The original dataset does not have many dimensions and all predictors are somewhat important for determining life expecatncy. Furthermore, the benefit of having a reduced dimensional dataset does not outweigh the cost of non-interpretable variables in this project.

## Store Data Frames

In [37]:
# list of dfs
scaled_dfs = [train_X_scaled,train_y,test_X_scaled,test_y]
# store dfs
%store scaled_dfs

Stored 'scaled_dfs' (list)
