In [20]:
# import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Data loading and observing

In [3]:
# Load the life-expectancy data from a CSV file located relative to the working directory.
dataset = pd.read_csv('life_expectancy.csv')

dataset.head() # preview the first rows to check the column names and value formats

Unnamed: 0,Country,Year,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,...,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Life expectancy
0,Afghanistan,2015,Developing,263.0,62,0.01,71.279624,65.0,1154,19.1,...,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1,65.0
1,Afghanistan,2014,Developing,271.0,64,0.01,73.523582,62.0,492,18.6,...,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0,59.9
2,Afghanistan,2013,Developing,268.0,66,0.01,73.219243,64.0,430,18.1,...,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9,59.9
3,Afghanistan,2012,Developing,272.0,69,0.01,78.184215,67.0,2787,17.6,...,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8,59.5
4,Afghanistan,2011,Developing,275.0,71,0.01,7.097109,68.0,3013,17.2,...,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5,59.2


In [4]:
dataset.describe() # summary statistics (count, mean, std, min, quartiles, max) for each numeric column

Unnamed: 0,Year,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,under-five deaths,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling,Life expectancy
count,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0,2938.0
mean,2007.51872,164.725664,30.303948,4.546875,738.251295,83.022124,2419.59224,38.381178,42.035739,82.617767,5.924098,82.393125,1.742103,6611.523863,10230850.0,4.821886,4.852144,0.630362,12.009837,69.234717
std,4.613841,124.086215,117.926501,3.921946,1987.914858,22.996984,11467.272489,19.935375,160.445548,23.367166,2.40077,23.655562,5.077785,13296.603449,54022420.0,4.397621,4.485854,0.20514,3.265139,9.509115
min,2000.0,1.0,0.0,0.01,0.0,1.0,0.0,1.0,0.0,3.0,0.37,2.0,0.1,1.68135,34.0,0.1,0.1,0.0,0.0,36.3
25%,2004.0,74.0,0.0,1.0925,4.685343,82.0,0.0,19.4,0.0,78.0,4.37,78.0,0.1,580.486996,418917.2,1.6,1.6,0.50425,10.3,63.2
50%,2008.0,144.0,3.0,3.755,64.912906,92.0,17.0,43.5,4.0,93.0,5.755,93.0,0.1,1766.947595,1386542.0,3.3,3.3,0.677,12.3,72.1
75%,2012.0,227.0,22.0,7.39,441.534144,96.0,360.25,56.1,28.0,97.0,7.33,97.0,0.8,4779.40519,4584371.0,7.1,7.2,0.772,14.1,75.6
max,2015.0,723.0,1800.0,17.87,19479.91161,99.0,212183.0,87.3,2500.0,99.0,17.6,99.0,50.6,119172.7418,1293859000.0,27.7,28.6,0.948,20.7,89.0


In [5]:
# Drop the country identifier so it is excluded from the features.
# errors='ignore' keeps this cell re-runnable: `dataset` is reassigned here, so a
# second execution would otherwise raise KeyError because 'Country' is already gone.
dataset = dataset.drop(columns=['Country'], errors='ignore')


# split dataset into labels and features
labels = dataset.iloc[:, -1] # all rows (:), last column only (-1): the value to predict

features = dataset.iloc[:, 0:-1] # all rows (:), every column except the last one

# Data Preprocessing

In [10]:
# One-hot encode the categorical columns so that every feature is numeric.
features = pd.get_dummies(data=features)

# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
features_train, features_test, labels_train, labels_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=20,
)

In [13]:
# Standardize the float64/int64 columns; columns of any other dtype are left as they are.
numerical_features = features.select_dtypes(include=['float64', 'int64'])
numerical_columns = numerical_features.columns

# The transformer scales the selected columns and passes the remaining ones through.
ct = ColumnTransformer(
    transformers=[("scale", StandardScaler(), numerical_columns)],
    remainder='passthrough',
)

# Fit the scaler on the training data only, then reuse those fitted statistics on the test data.
features_train_scaled = ct.fit_transform(features_train)
features_test_scaled = ct.transform(features_test)


# Building the model

In [25]:
# building the model
my_model = Sequential(name = "Country_Life_Expectancy_Prediction")

# Input layer sized to the number of feature columns. It is named `input_layer` rather
# than `input` so the Python builtin input() is not shadowed for the rest of the notebook.
input_layer = InputLayer(input_shape = (features.shape[1], ))
my_model.add(input_layer) # add input layer to model

my_model.add(Dense(64, activation = "relu")) # hidden layer: 64 units with ReLU activation

my_model.add(Dense(1)) # output layer: one neuron for the single predicted value

# Initializing the optimizer and compiling the model

In [26]:
# Adam optimizer with a learning rate of 0.01.
opt = Adam(learning_rate=0.01)

# Compile the model: mean squared error is the training loss, mean absolute error is tracked as a metric.
my_model.compile(
    optimizer=opt,
    loss='mse',
    metrics=['mae'],
)


# Fit and evaluate the model

In [28]:
# fit the model on the scaled training data
my_model.fit(features_train_scaled, labels_train, epochs = 40, batch_size = 1, verbose = 1)

# evaluate model on the scaled test data; the result is the loss (MSE) followed by the metric (MAE)
res_mse, res_mae = my_model.evaluate(features_test_scaled, labels_test, verbose = 0)

# summary() prints the layer table itself and returns None, so it is called directly;
# wrapping it in print() would emit a stray "None" line after the table.
my_model.summary()

print("MSE: ", res_mse) # print mean squared error (not the root of it)

print("MAE: ", res_mae) # print mean absolute error

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Model: "Country_Life_Expectancy_Prediction"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 64)                1408      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 65        
Total params: 1,473
Trainable params: 1,473
Non-trainable params: 0
_________________________________________________________________
None
MSE:  7.56873321