# MACHINE LEARING
A notebook to implement machine learning techniques.

# 1. Setup

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import pandas as pd
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

In [2]:
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "02_machine_learning"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [3]:
# Where to save the models
import joblib

PROJECT_ROOT_DIR = "."
CHAPTER_ID = "02_machine_learning"
MODELS_PATH = os.path.join(PROJECT_ROOT_DIR, "models", CHAPTER_ID)
os.makedirs(MODELS_PATH, exist_ok=True)

def save_model(model, model_id):
    '''To save model in the corresponding directory'''
    joblib.dump(model, os.path.join(MODELS_PATH, model_id + "." + "pkl"))

def load_model(model_id):
    '''To load model from the corresponding directory'''
    return joblib.load(os.path.join(MODELS_PATH, model_id + "." + "pkl"))

**Note:** I will jump directly to the main dishes (pipelines and metrics) since the data cleaning and EDA have been caried already.

# 2. Create a Test Set

## 2.1. Load the Data

In [4]:
# Load the dataset
df = pd.read_csv("./dataset/life_expectancy.csv")
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [5]:
# Rename some columns as their names contain trailing spaces
df.rename(columns={" BMI ":"BMI","Life expectancy ":"Life_Expectancy","Adult Mortality":"Adult_Mortality",
                   "infant deaths":"Infant_Deaths","percentage expenditure":"Percentage_Exp","Hepatitis B":"HepatitisB",
                  "Measles ":"Measles"," BMI ":"BMI","under-five deaths ":"Under_Five_Deaths","Diphtheria ":"Diphtheria",
                  " HIV/AIDS":"HIV/AIDS"," thinness  1-19 years":"thinness_1to19_years"," thinness 5-9 years":"thinness_5to9_years","Income composition of resources":"Income_Comp_Of_Resources",
                   "Total expenditure":"Tot_Exp"},inplace=True)
df.columns = [alias.lower() for alias in df.columns]
df.columns

Index(['country', 'year', 'status', 'life_expectancy', 'adult_mortality',
       'infant_deaths', 'alcohol', 'percentage_exp', 'hepatitisb', 'measles',
       'bmi', 'under_five_deaths', 'polio', 'tot_exp', 'diphtheria',
       'hiv/aids', 'gdp', 'population', 'thinness_1to19_years',
       'thinness_5to9_years', 'income_comp_of_resources', 'schooling'],
      dtype='object')

In [6]:
# Drop the instances without label (life_expectancy)
df.dropna(subset=["life_expectancy"], axis=0, inplace=True)
df.life_expectancy.isnull().sum()

0

## 2.2. Create a Test Set

**Note:** The training set and test set will preserve as much as possible the distribution of features `hiv/aids`, `income_comp_of_resources`, `schooling` and `status` as they strongly correlare with `life_expectancy` [EDA].

In [7]:
# Check the number of null values in these features
df[["hiv/aids", "income_comp_of_resources", "schooling", "status"]].isnull().sum()

hiv/aids                      0
income_comp_of_resources    160
schooling                   160
status                        0
dtype: int64

In [8]:
# Save the original dataset
df_origin = df.copy()

In [9]:
# Categorize income composition of resources (HDI)
# source: https://en.wikipedia.org/wiki/Human_Development_Index
bins = pd.IntervalIndex.from_tuples([(-0.009, 0.549), (0.549, 0.699), (0.699, 0.799), (0.799, 1)])
df["income_comp_of_resources_cat"] = pd.cut(df["income_comp_of_resources"], bins=bins)
df["income_comp_of_resources_cat"].isnull().sum()

160

In [10]:
# Categorize hiv/aids (infant deaths per 1000)
# no general criterion found, so let's categorize based on box plot in EDA
bins = pd.IntervalIndex.from_tuples([(0.099, 5.15), (5.15, 20), (20, 50.6)])
df["hiv/aids_cat"] = pd.cut(df["hiv/aids"], bins=bins)
df["hiv/aids_cat"].isnull().sum()

0

In [11]:
# Categorize schooling (number of years of schooling)
# source: https://en.wikipedia.org/wiki/Educational_stage
bins = pd.IntervalIndex.from_tuples([(-1, 3), (3, 5), (5, 12), (12, 19), (19, 21)])
df["schooling_cat"] = pd.cut(df["schooling"], bins=bins)
df["schooling_cat"].isnull().sum()

160

In [12]:
# Drop the records with missing values in the two columns
df_dropped = df[df["income_comp_of_resources"].isnull() | df["schooling"].isnull()]
df.dropna(subset=["income_comp_of_resources", "schooling"], inplace=True)
df[["income_comp_of_resources", "schooling"]].isnull().sum()

income_comp_of_resources    0
schooling                   0
dtype: int64

In [13]:
# Show the number of instances dropped
df_dropped.shape[0]

160

**Note:** Since there are just 160 rows dropped (which will not be stratified), which is around 5.5% of data, there will be little harm to the distribution of stratum if I add them back after stratified sampling.

In [14]:
# Create a test set
from sklearn.model_selection import train_test_split

pre_train_set, pre_test_set = train_test_split(df, test_size=0.25, random_state=42,
                                       stratify=df[["hiv/aids_cat", "income_comp_of_resources_cat", "schooling_cat", "status"]])
pre_test_set.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_exp,hepatitisb,measles,...,hiv/aids,gdp,population,thinness_1to19_years,thinness_5to9_years,income_comp_of_resources,schooling,income_comp_of_resources_cat,hiv/aids_cat,schooling_cat
2456,Sri Lanka,2000,Developing,71.5,175.0,5,1.45,60.490981,,16527,...,0.1,875.412178,18655.0,15.3,15.5,0.677,12.4,"(0.549, 0.699]","(0.099, 5.15]","(12, 19]"
486,Cameroon,2009,Developing,54.8,373.0,54,5.89,9.042541,8.0,251,...,6.3,123.19538,19432541.0,6.3,6.3,0.473,9.2,"(-0.009, 0.549]","(5.15, 20.0]","(5, 12]"
1518,Libya,2003,Developing,71.3,144.0,3,0.01,295.116651,96.0,0,...,0.1,4676.96753,,5.6,5.4,0.74,16.0,"(0.699, 0.799]","(0.099, 5.15]","(12, 19]"
1952,Pakistan,2005,Developing,62.9,2.0,364,0.04,30.593208,7.0,2981,...,0.1,711.469946,15399667.0,21.2,21.7,0.487,6.1,"(-0.009, 0.549]","(0.099, 5.15]","(5, 12]"
65,Antigua and Barbuda,2014,Developing,76.2,131.0,0,8.56,2422.999774,99.0,0,...,0.2,12888.29667,,3.3,3.3,0.782,13.9,"(0.699, 0.799]","(0.099, 5.15]","(12, 19]"


In [15]:
# Re-add the dropped rows
train_drop_set, test_drop_set = train_test_split(df_dropped, test_size=0.25, random_state=42)
train_set = pd.concat([pre_train_set, train_drop_set])
test_set = pd.concat([pre_test_set, test_drop_set])

# shuffle the sets
from sklearn.utils import shuffle
train_set = shuffle(train_set, random_state=42)
test_set = shuffle(test_set, random_state=42)

test_set.head() # oops! forget to drop categorized columns

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_exp,hepatitisb,measles,...,hiv/aids,gdp,population,thinness_1to19_years,thinness_5to9_years,income_comp_of_resources,schooling,income_comp_of_resources_cat,hiv/aids_cat,schooling_cat
2868,Venezuela (Bolivarian Republic of),2005,Developing,73.6,158.0,9,7.92,0.0,88.0,0,...,0.1,,,1.7,1.6,0.7,11.8,"(0.699, 0.799]","(0.099, 5.15]","(5, 12]"
1146,Honduras,2007,Developing,73.0,16.0,5,3.16,222.482334,93.0,0,...,0.7,1592.572182,777972.0,2.4,2.3,0.59,10.9,"(0.549, 0.699]","(0.099, 5.15]","(5, 12]"
2754,United Arab Emirates,2007,Developing,75.6,87.0,1,1.69,3759.457226,92.0,0,...,0.1,42672.61323,,5.1,4.9,0.826,12.9,"(0.799, 1.0]","(0.099, 5.15]","(12, 19]"
2431,Spain,2009,Developed,81.6,66.0,2,9.99,5047.254058,96.0,41,...,0.1,32333.4661,46362946.0,0.6,0.5,0.858,16.3,"(0.799, 1.0]","(0.099, 5.15]","(12, 19]"
2165,Rwanda,2001,Developing,48.6,438.0,33,5.72,0.388254,,896,...,8.1,21.569654,832946.0,7.4,7.5,0.332,7.1,"(-0.009, 0.549]","(5.15, 20.0]","(5, 12]"


In [16]:
# Drop categorized columns
train_set.drop(columns=["income_comp_of_resources_cat", "schooling_cat", "hiv/aids_cat"], inplace=True)
test_set.drop(columns=["income_comp_of_resources_cat", "schooling_cat", "hiv/aids_cat"], inplace=True)
test_set.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_exp,hepatitisb,measles,...,polio,tot_exp,diphtheria,hiv/aids,gdp,population,thinness_1to19_years,thinness_5to9_years,income_comp_of_resources,schooling
2868,Venezuela (Bolivarian Republic of),2005,Developing,73.6,158.0,9,7.92,0.0,88.0,0,...,8.0,4.69,87.0,0.1,,,1.7,1.6,0.7,11.8
1146,Honduras,2007,Developing,73.0,16.0,5,3.16,222.482334,93.0,0,...,94.0,7.89,94.0,0.7,1592.572182,777972.0,2.4,2.3,0.59,10.9
2754,United Arab Emirates,2007,Developing,75.6,87.0,1,1.69,3759.457226,92.0,0,...,94.0,2.57,92.0,0.1,42672.61323,,5.1,4.9,0.826,12.9
2431,Spain,2009,Developed,81.6,66.0,2,9.99,5047.254058,96.0,41,...,96.0,9.52,96.0,0.1,32333.4661,46362946.0,0.6,0.5,0.858,16.3
2165,Rwanda,2001,Developing,48.6,438.0,33,5.72,0.388254,,896,...,76.0,4.38,77.0,8.1,21.569654,832946.0,7.4,7.5,0.332,7.1


In [17]:
# Check whether the proportions of status on the whole data and test set are close
def status_proportion(df):
    return df["status"].value_counts() / len(df)

compare_props = pd.DataFrame({
    "Overall": status_proportion(df_origin),
    "Stratified": status_proportion(test_set)
}).sort_index()

compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
compare_props # good! :>

Unnamed: 0,Overall,Stratified,Strat. %error
Developed,0.174863,0.173497,-0.78125
Developing,0.825137,0.826503,0.165563


**Note:** Forget the `test_set` from now on! Bring it back till we find a good model.

# 3. Preprocessing

**Note:** Two categorical columns, `country`, `year` and `status`, have indices which are 0, 1 and 2. This is for the later work with `numpy` if needed.

**Suggestion:** May need to customize some transformers!

## 3.1. Imputer

In [18]:
# Build a transformer class - interpolation imputer (by country)
from sklearn.base import BaseEstimator, TransformerMixin

class GroupInterpImputer(BaseEstimator, TransformerMixin):
    '''Transformer that treats NaN of a group using linear interpolation method
    '''
    
    def __init__(self, strategy="country"):
        self.strategy = strategy
    
    def fit(self, X): 
        return self # nothing to do
    
    # NOTICE: SET ON THE TOP OF PIPELINE
    def transform(self, X): # assume the input is a pd.DataFrame
        df = X.copy()
        num_cols = df.drop(["status", "country"], axis=1).columns.tolist()
        for country in df.country.unique().tolist():
            df.loc[df[self.strategy]==country, num_cols] = df.loc[df[self.strategy]==country, num_cols].sort_values(by=["year"]).interpolate(axis=1)
        return df

In [19]:
# Test the customized imputer
interp_imputer = GroupInterpImputer()
experiment_1 = train_set.copy()
interp_imputer.transform(experiment_df).isnull().sum() # pd.DataFrame

country                     0
year                        0
status                      0
life_expectancy             0
adult_mortality             0
infant_deaths               0
alcohol                     0
percentage_exp              0
hepatitisb                  0
measles                     0
bmi                         0
under_five_deaths           0
polio                       0
tot_exp                     0
diphtheria                  0
hiv/aids                    0
gdp                         0
population                  0
thinness_1to19_years        0
thinness_5to9_years         0
income_comp_of_resources    0
schooling                   0
dtype: int64

**Note:** The pipeline flow is:

imputer -> feature engineer (optional) -> scaler/one-hot -> outlier detector -> clusterer (optional) -> model

## 3.2. Feature Engineering

**Note:** This section can be a large workload. However, right now just transform `country` to some relevant features or drop it.

### 3.2.1. Country to Continent

In [20]:
# Load the dataset
country_mapping = pd.read_csv("./dataset/country_mapping.csv")
country_mapping.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [26]:
# Inspect the dataset
country_mapping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   name                      249 non-null    object 
 1   alpha-2                   248 non-null    object 
 2   alpha-3                   249 non-null    object 
 3   country-code              249 non-null    int64  
 4   iso_3166-2                249 non-null    object 
 5   region                    248 non-null    object 
 6   sub-region                248 non-null    object 
 7   intermediate-region       107 non-null    object 
 8   region-code               248 non-null    float64
 9   sub-region-code           248 non-null    float64
 10  intermediate-region-code  107 non-null    float64
dtypes: float64(3), int64(1), object(7)
memory usage: 21.5+ KB


In [25]:
# Show the regions included
country_mapping.region.unique()

array(['Asia', 'Europe', 'Africa', 'Oceania', 'Americas', nan],
      dtype=object)

In [32]:
# Check the null in region
country_mapping[~country_mapping["region"].isin(['Asia', 'Europe', 'Africa', 'Oceania', 'Americas'])]

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
8,Antarctica,AQ,ATA,10,ISO 3166-2:AQ,,,,,,


**Note:** Antarctica is a continent itself.

In [34]:
# Fill the region Antarctica
country_mapping.loc[~country_mapping["region"].isin(['Asia', 'Europe', 'Africa', 'Oceania', 'Americas']), "region"] = "Antarctica"

In [35]:
# Show the regions included
country_mapping.region.unique()

array(['Asia', 'Europe', 'Africa', 'Oceania', 'Americas', 'Antarctica'],
      dtype=object)

**Note:** We will transform the `country` into `continent`, which reduces the number of features of one-hot vector and also avoid the cases mentioned in EDA notebook.

In [37]:
# Change name of the column name
country_mapping.rename(columns={"name": "country"}, inplace=True)
country_mapping.head()

Unnamed: 0,country,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [44]:
# Transform country to continent
country2continent = dict()
for country in country_mapping['country']:
    country2continent[country] = country_mapping[country_mapping['country']==country]['region'].values[0]

update = {'Republic of Korea' : 'Asia', 'Republic of Moldova' : 'Europe' ,
          'Democratic Republic of the Congo' : 'Africa', 'Czechia' : 'Europe', 
          'United Republic of Tanzania' : 'Africa', 'The former Yugoslav republic of Macedonia' : 'Europe', 
          "Democratic People's Republic of Korea" : 'Asia'}

country2continent.update(update)

experiment_2 = train_set.copy()
experiment_2['continent'] = experiment_2['country'].copy().replace(country2continent)
experiment_2.continent.isna().sum()

0

In [45]:
# Inspect the result
experiment_2.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_exp,hepatitisb,measles,...,tot_exp,diphtheria,hiv/aids,gdp,population,thinness_1to19_years,thinness_5to9_years,income_comp_of_resources,schooling,continent
2484,Suriname,2004,Developing,68.3,214.0,0,4.55,4.232577,,0,...,6.88,85.0,0.9,36.48773,49363.0,3.5,3.5,0.0,11.1,Americas
2241,Saudi Arabia,2007,Developing,73.2,99.0,9,0.1,1307.89002,96.0,4648,...,3.49,96.0,0.1,16472.1665,,7.2,7.3,0.773,12.7,Asia
1870,Nicaragua,2006,Developing,73.0,17.0,3,3.69,21.411235,88.0,0,...,6.33,88.0,0.3,124.992617,545211.0,2.0,1.9,0.597,11.0,Americas
2478,Suriname,2010,Developing,73.0,19.0,0,5.26,99.080954,86.0,0,...,5.81,96.0,0.5,833.31332,52613.0,3.5,3.4,0.7,12.3,Americas
2932,Zimbabwe,2005,Developing,44.6,717.0,28,4.14,8.717409,65.0,420,...,6.44,68.0,30.3,444.76575,129432.0,9.0,9.0,0.406,9.3,Africa
