In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

# Import helper functions from the project's src/functions.py; autoreload re-imports edited modules before each cell runs
%load_ext autoreload
%autoreload 2
import os
import sys
# Resolve the parent of the current working directory and put it on the
# import path (once) so that the `src` package can be imported below.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
    
import src.functions as funcs

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Build the working DataFrame via the project helper `create_df`
# (implemented in src/functions.py; its loading/cleaning steps are not shown here).
df = funcs.create_df()

In [9]:
# Preview the first rows to sanity-check the column names and value formats.
df.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_b,measles,...,polio,total_expenditure,diphtheria,hiv/aids,gdp,population,thinness__1-19_years,thinness_5-9_years,income_composition_of_resources,schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


Create a baseline model to predict `life_expectancy`. First, check how many values are missing in each column.

In [10]:
# Count missing values per column to see which candidate features need handling before modelling.
df.isna().sum()

country                              0
year                                 0
status                               0
life_expectancy                     10
adult_mortality                     10
infant_deaths                        0
alcohol                            194
percentage_expenditure               0
hepatitis_b                        553
measles                              0
bmi                                 34
under-five_deaths                    0
polio                               19
total_expenditure                  226
diphtheria                          19
hiv/aids                             0
gdp                                448
population                         652
thinness__1-19_years                34
thinness_5-9_years                  34
income_composition_of_resources    167
schooling                          163
dtype: int64

In [20]:
# Drop rows that lack the target or either of the predictors used below.
# Only these three columns are filtered, so despite its name the resulting
# frame can still contain NaNs in other columns.
df_no_missing = df.dropna(
    subset=[
        'life_expectancy',
        'schooling',
        'income_composition_of_resources',
    ]
)

In [21]:
# Re-check missing counts after filtering: the three columns used in `dropna`
# should now report 0; the other columns were not filtered.
df_no_missing.isna().sum()

country                              0
year                                 0
status                               0
life_expectancy                      0
adult_mortality                      0
infant_deaths                        0
alcohol                            184
percentage_expenditure               0
hepatitis_b                        509
measles                              0
bmi                                 32
under-five_deaths                    0
polio                               19
total_expenditure                  186
diphtheria                          19
hiv/aids                             0
gdp                                286
population                         484
thinness__1-19_years                32
thinness_5-9_years                  32
income_composition_of_resources      0
schooling                            0
dtype: int64

In [22]:
# Inspect row count, dtypes and per-column non-null counts of the filtered frame.
df_no_missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2768 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country                          2768 non-null   object 
 1   year                             2768 non-null   int64  
 2   status                           2768 non-null   object 
 3   life_expectancy                  2768 non-null   float64
 4   adult_mortality                  2768 non-null   float64
 5   infant_deaths                    2768 non-null   int64  
 6   alcohol                          2584 non-null   float64
 7   percentage_expenditure           2768 non-null   float64
 8   hepatitis_b                      2259 non-null   float64
 9   measles                          2768 non-null   int64  
 10  bmi                              2736 non-null   float64
 11  under-five_deaths                2768 non-null   int64  
 12  polio               

Baseline model: ordinary least squares (OLS) regression of `life_expectancy` on `schooling` alone

In [23]:
# Target is life expectancy; the baseline uses `schooling` as the only predictor.
# Double brackets keep X as a one-column DataFrame rather than a Series.
y = df_no_missing['life_expectancy']
X = df_no_missing[['schooling']]

In [24]:
# sm.OLS does not include an intercept by default, so add a constant column
# to the predictors before fitting.
baseline_design = sm.add_constant(X)
baseline_model = sm.OLS(y, baseline_design).fit()

# Display the regression summary table as the cell output.
baseline_model.summary()

0,1,2,3
Dep. Variable:,life_expectancy,R-squared:,0.565
Model:,OLS,Adj. R-squared:,0.565
Method:,Least Squares,F-statistic:,3599.0
Date:,"Wed, 05 Oct 2022",Prob (F-statistic):,0.0
Time:,15:00:08,Log-Likelihood:,-8964.3
No. Observations:,2768,AIC:,17930.0
Df Residuals:,2766,BIC:,17940.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,44.1089,0.437,100.992,0.000,43.252,44.965
schooling,2.1035,0.035,59.995,0.000,2.035,2.172

0,1,2,3
Omnibus:,283.391,Durbin-Watson:,0.267
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1122.013
Skew:,-0.445,Prob(JB):,2.2799999999999997e-244
Kurtosis:,5.989,Cond. No.,46.7


Model iteration: add `income_composition_of_resources` as a second predictor alongside `schooling`

In [25]:
# Same target as the baseline; add `income_composition_of_resources` as a second predictor.
# NOTE: this rebinds `X`, so re-running the baseline fit cell after this one
# would fit it on two predictors instead of one.
y = df_no_missing['life_expectancy']
X = df_no_missing[['schooling', 'income_composition_of_resources']]

In [26]:
# As with the baseline, add an explicit intercept column before fitting OLS.
model2_design = sm.add_constant(X)
model2 = sm.OLS(y, model2_design).fit()

# Display the regression summary table as the cell output.
model2.summary()

0,1,2,3
Dep. Variable:,life_expectancy,R-squared:,0.608
Model:,OLS,Adj. R-squared:,0.607
Method:,Least Squares,F-statistic:,2141.0
Date:,"Wed, 05 Oct 2022",Prob (F-statistic):,0.0
Time:,15:00:41,Log-Likelihood:,-8823.1
No. Observations:,2768,AIC:,17650.0
Df Residuals:,2765,BIC:,17670.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,43.7724,0.416,105.330,0.000,42.958,44.587
schooling,1.3377,0.056,24.083,0.000,1.229,1.447
income_composition_of_resources,15.1822,0.881,17.233,0.000,13.455,16.910

0,1,2,3
Omnibus:,257.627,Durbin-Watson:,0.311
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1572.26
Skew:,-0.192,Prob(JB):,0.0
Kurtosis:,6.672,Cond. No.,99.0
