# MACHINE LEARING
A notebook to implement machine learning techniques.

# 1. Setup

In [4]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import pandas as pd
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

In [5]:
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "02_machine_learning"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [6]:
# Where to save the models
import joblib

PROJECT_ROOT_DIR = "."
CHAPTER_ID = "02_machine_learning"
MODELS_PATH = os.path.join(PROJECT_ROOT_DIR, "models", CHAPTER_ID)
os.makedirs(MODELS_PATH, exist_ok=True)

def save_model(model, model_id):
    '''To save model in the corresponding directory'''
    joblib.dump(model, os.path.join(MODELS_PATH, model_id + "." + "pkl"))

def load_model(model_id):
    '''To load model from the corresponding directory'''
    return joblib.load(os.path.join(MODELS_PATH, model_id + "." + "pkl"))

**Note:** I will jump directly to the main dishes (pipelines and metrics) since the data cleaning and EDA have been caried already.

# 2. Create a Test Set

## 2.1. Load the Data

In [74]:
# Load the dataset
df = pd.read_csv("./dataset/life_expectancy.csv")
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [77]:
# Rename some columns as their names contain trailing spaces
df.rename(columns={" BMI ":"BMI","Life expectancy ":"Life_Expectancy","Adult Mortality":"Adult_Mortality",
                   "infant deaths":"Infant_Deaths","percentage expenditure":"Percentage_Exp","Hepatitis B":"HepatitisB",
                  "Measles ":"Measles"," BMI ":"BMI","under-five deaths ":"Under_Five_Deaths","Diphtheria ":"Diphtheria",
                  " HIV/AIDS":"HIV/AIDS"," thinness  1-19 years":"thinness_1to19_years"," thinness 5-9 years":"thinness_5to9_years","Income composition of resources":"Income_Comp_Of_Resources",
                   "Total expenditure":"Tot_Exp"},inplace=True)
df.columns = [alias.lower() for alias in df.columns]
df.columns

Index(['country', 'year', 'status', 'life_expectancy', 'adult_mortality',
       'infant_deaths', 'alcohol', 'percentage_exp', 'hepatitisb', 'measles',
       'bmi', 'under_five_deaths', 'polio', 'tot_exp', 'diphtheria',
       'hiv/aids', 'gdp', 'population', 'thinness_1to19_years',
       'thinness_5to9_years', 'income_comp_of_resources', 'schooling'],
      dtype='object')

## 2.2. Create a Test Set

**Note:** The training set and test set will preserve as much as possible the distribution of features `hiv/aids`, `income_comp_of_resources`, `schooling` and `status` as they strongly correlare with `life_expectancy` [EDA].

In [78]:
# Check the number of null values in these features
df[["hiv/aids", "income_comp_of_resources", "schooling", "status"]].isnull().sum()

hiv/aids                      0
income_comp_of_resources    167
schooling                   163
status                        0
dtype: int64

In [79]:
# Save the original dataset
df_origin = df.copy()

In [80]:
# Categorize income composition of resources (HDI)
# source: https://en.wikipedia.org/wiki/Human_Development_Index
bins = pd.IntervalIndex.from_tuples([(-0.009, 0.549), (0.549, 0.699), (0.699, 0.799), (0.799, 1)])
df["income_comp_of_resources_cat"] = pd.cut(df["income_comp_of_resources"], bins=bins)
df["income_comp_of_resources_cat"].isnull().sum()

167

In [81]:
# Categorize hiv/aids (infant deaths per 1000)
# no general criterion found, so let's categorize based on box plot in EDA
bins = pd.IntervalIndex.from_tuples([(0.099, 5.15), (5.15, 20), (20, 50.6)])
df["hiv/aids_cat"] = pd.cut(df["hiv/aids"], bins=bins)
df["hiv/aids_cat"].isnull().sum()

0

In [82]:
# Categorize schooling (number of years of schooling)
# source: https://en.wikipedia.org/wiki/Educational_stage
bins = pd.IntervalIndex.from_tuples([(-1, 3), (3, 5), (5, 12), (12, 19), (19, 21)])
df["schooling_cat"] = pd.cut(df["schooling"], bins=bins)
df["schooling_cat"].isnull().sum()

163

In [83]:
# Drop the records with missing values in the two columns
df_dropped = df[df["income_comp_of_resources"].isnull() & df["schooling"].isnull()]
df.dropna(subset=["income_comp_of_resources", "schooling"], inplace=True)
df[["income_comp_of_resources", "schooling"]].isnull().sum()

income_comp_of_resources    0
schooling                   0
dtype: int64

In [84]:
# Show the number of instances dropped
df_dropped.shape[0]

163

**Note:** Since there are just 163 rows dropped (which will not be stratified), which is around 5.5% of data, there will be little harm to the distribution of stratum if I add them back after stratified sampling.

In [85]:
# Create a test set
from sklearn.model_selection import train_test_split

pre_train_set, pre_test_set = train_test_split(df, test_size=0.25, random_state=42,
                                       stratify=df[["hiv/aids_cat", "income_comp_of_resources_cat", "schooling_cat", "status"]])
test_set.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_exp,hepatitisb,measles,...,polio,tot_exp,diphtheria,hiv/aids,gdp,population,thinness_1to19_years,thinness_5to9_years,income_comp_of_resources,schooling
1753,Morocco,2010,Developing,72.8,11.0,20,0.56,183.659219,98.0,633,...,99.0,5.86,99.0,0.1,2834.2472,3249639.0,6.3,6.2,0.603,10.7
1169,Hungary,2000,Developed,71.7,193.0,1,12.22,75.362514,,1,...,99.0,7.6,99.0,0.1,4623.46712,121971.0,2.3,2.3,0.761,13.9
974,Gambia,2003,Developing,57.0,297.0,3,2.47,0.0,94.0,119,...,87.0,4.22,87.0,2.7,,,9.6,9.6,0.395,7.3
151,Azerbaijan,2008,Developing,73.0,141.0,6,1.18,206.261613,68.0,5,...,85.0,4.37,81.0,0.1,5574.6382,87634.0,2.8,2.9,0.719,11.6
1465,Lebanon,2008,Developing,74.5,98.0,1,1.67,69.139249,81.0,24,...,75.0,8.7,81.0,0.1,712.775759,411147.0,4.7,4.7,0.74,13.1


In [86]:
# Re-add the dropped rows
train_drop_set, test_drop_set = train_test_split(df_dropped, test_size=0.25, random_state=42)
train_set = pd.concat([pre_train_set, train_drop_set])
test_set = pd.concat([pre_test_set, test_drop_set])

# shuffle the sets
from sklearn.utils import shuffle
train_set = shuffle(train_set, random_state=42)
test_set = shuffle(test_set, random_state=42)

test_set.head() # oops! forget to drop categorized columns

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_exp,hepatitisb,measles,...,hiv/aids,gdp,population,thinness_1to19_years,thinness_5to9_years,income_comp_of_resources,schooling,income_comp_of_resources_cat,hiv/aids_cat,schooling_cat
1753,Morocco,2010,Developing,72.8,11.0,20,0.56,183.659219,98.0,633,...,0.1,2834.2472,3249639.0,6.3,6.2,0.603,10.7,"(0.549, 0.699]","(0.099, 5.15]","(5, 12]"
1169,Hungary,2000,Developed,71.7,193.0,1,12.22,75.362514,,1,...,0.1,4623.46712,121971.0,2.3,2.3,0.761,13.9,"(0.699, 0.799]","(0.099, 5.15]","(12, 19]"
974,Gambia,2003,Developing,57.0,297.0,3,2.47,0.0,94.0,119,...,2.7,,,9.6,9.6,0.395,7.3,"(-0.009, 0.549]","(0.099, 5.15]","(5, 12]"
151,Azerbaijan,2008,Developing,73.0,141.0,6,1.18,206.261613,68.0,5,...,0.1,5574.6382,87634.0,2.8,2.9,0.719,11.6,"(0.699, 0.799]","(0.099, 5.15]","(5, 12]"
1465,Lebanon,2008,Developing,74.5,98.0,1,1.67,69.139249,81.0,24,...,0.1,712.775759,411147.0,4.7,4.7,0.74,13.1,"(0.699, 0.799]","(0.099, 5.15]","(12, 19]"


In [87]:
# Drop categorized columns
train_set.drop(columns=["income_comp_of_resources_cat", "schooling_cat", "hiv/aids_cat"], inplace=True)
test_set.drop(columns=["income_comp_of_resources_cat", "schooling_cat", "hiv/aids_cat"], inplace=True)
test_set.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_exp,hepatitisb,measles,...,polio,tot_exp,diphtheria,hiv/aids,gdp,population,thinness_1to19_years,thinness_5to9_years,income_comp_of_resources,schooling
1753,Morocco,2010,Developing,72.8,11.0,20,0.56,183.659219,98.0,633,...,99.0,5.86,99.0,0.1,2834.2472,3249639.0,6.3,6.2,0.603,10.7
1169,Hungary,2000,Developed,71.7,193.0,1,12.22,75.362514,,1,...,99.0,7.6,99.0,0.1,4623.46712,121971.0,2.3,2.3,0.761,13.9
974,Gambia,2003,Developing,57.0,297.0,3,2.47,0.0,94.0,119,...,87.0,4.22,87.0,2.7,,,9.6,9.6,0.395,7.3
151,Azerbaijan,2008,Developing,73.0,141.0,6,1.18,206.261613,68.0,5,...,85.0,4.37,81.0,0.1,5574.6382,87634.0,2.8,2.9,0.719,11.6
1465,Lebanon,2008,Developing,74.5,98.0,1,1.67,69.139249,81.0,24,...,75.0,8.7,81.0,0.1,712.775759,411147.0,4.7,4.7,0.74,13.1


In [91]:
# Check whether the proportions of status on the whole data and test set are close
def status_proportion(df):
    return df["status"].value_counts() / len(df)

compare_props = pd.DataFrame({
    "Overall": status_proportion(df_origin),
    "Stratified": status_proportion(test_set)
}).sort_index()

compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
compare_props # good! :>

Unnamed: 0,Overall,Stratified,Strat. %error
Developed,0.174268,0.175749,0.849902
Developing,0.825732,0.824251,-0.179369


**Note:** Forget the `test_set` from now on! Bring it back till we find a good model.

# 3. Preprocessing