# MACHINE LEARNING
A notebook to implement machine learning techniques.

# 1. Setup

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import pandas as pd
import numpy as np
import os
import copy
import json

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

In [43]:
# Scikit-Learn modules in use

# prepare test set
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import PolynomialFeatures
from sklearn.impute import SimpleImputer

# train and select models
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.linear_model import ElasticNet
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

# cross-validate model
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint, uniform

**Note:** I will jump directly to the main dishes (pipelines and metrics) since the data cleaning and EDA have already been carried out.

# 2. Create a Test Set

## 2.1. Load the Data

In [158]:
# Load the dataset
df = pd.read_csv("./dataset/life_expectancy.csv")
df.head()

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [159]:
# Rename the columns whose raw headers carry stray spaces or awkward names.
# Every raw header is listed exactly once: the previous dict literal repeated
# " BMI " and "percentage expenditure", and for a repeated key only the last
# entry takes effect, so the effective mapping is unchanged.
column_aliases = {
    "Life expectancy ": "Life_Expectancy",
    "Adult Mortality": "Adult_Mortality",
    "infant deaths": "Infant_Deaths",
    "percentage expenditure": "percentage_expenditure",
    "Hepatitis B": "Hepatitis_B",
    "Measles ": "Measles",
    " BMI ": "BMI",
    "under-five deaths ": "Under_Five_Deaths",
    "Diphtheria ": "Diphtheria",
    " HIV/AIDS": "HIV/AIDS",
    " thinness  1-19 years": "thinness_10_19_years",
    " thinness 5-9 years": "thinness_5_9_years",
    "Income composition of resources": "income_composition_of_resources",
    "Total expenditure": "Total_expenditure",
}
# Re-assign instead of mutating in place so the cell reads as a pure transformation
df = df.rename(columns=column_aliases)
# Lower-case every header, whether it was renamed above or not
df.columns = [alias.lower() for alias in df.columns]
df.columns

Index(['country', 'year', 'status', 'life_expectancy', 'adult_mortality',
       'infant_deaths', 'alcohol', 'percentage_expenditure', 'hepatitis_b',
       'measles', 'bmi', 'under_five_deaths', 'polio', 'total_expenditure',
       'diphtheria', 'hiv/aids', 'gdp', 'population', 'thinness_10_19_years',
       'thinness_5_9_years', 'income_composition_of_resources', 'schooling'],
      dtype='object')

In [160]:
# Rows without a target value (life_expectancy) are removed from `df` in place;
# the trailing expression confirms that no missing label is left
df.dropna(subset=["life_expectancy"], inplace=True)
df["life_expectancy"].isnull().sum()

0

In [161]:
# Inspect the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2928 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   country                          2928 non-null   object 
 1   year                             2928 non-null   int64  
 2   status                           2928 non-null   object 
 3   life_expectancy                  2928 non-null   float64
 4   adult_mortality                  2928 non-null   float64
 5   infant_deaths                    2928 non-null   int64  
 6   alcohol                          2735 non-null   float64
 7   percentage_expenditure           2928 non-null   float64
 8   hepatitis_b                      2375 non-null   float64
 9   measles                          2928 non-null   int64  
 10  bmi                              2896 non-null   float64
 11  under_five_deaths                2928 non-null   int64  
 12  polio               

## 2.2. Create a Test Set

In [134]:
# Define the test size
test_size = 0.25

### 2.2.1. Random Sampling

In [109]:
# Create a Test Set
#train_set, test_set = train_test_split(df, test_size=test_size, random_state=42)

### 2.2.2. Stratified Sampling (Official)

**Note:** The training set and test set will preserve, as far as possible, the distribution of the features `hiv/aids`, `income_composition_of_resources`, `schooling` and `status`, as they correlate strongly with `life_expectancy` [EDA].

In [135]:
# Check the number of null values in these features
df[["hiv/aids", "income_composition_of_resources", "schooling", "status"]].isnull().sum()

hiv/aids                             0
income_composition_of_resources    160
schooling                          160
status                               0
dtype: int64

In [136]:
# Save the original dataset: the following cells add helper `*_cat` columns to
# `df` and drop rows from it in place, so keep an untouched copy of the labelled data
df_origin = df.copy()

In [137]:
# Bucket income composition of resources (HDI) into the bands listed at
# https://en.wikipedia.org/wiki/Human_Development_Index
hdi_bands = [(-0.009, 0.549), (0.549, 0.699), (0.699, 0.799), (0.799, 1)]
bins = pd.IntervalIndex.from_tuples(hdi_bands)
df["income_composition_of_resources_cat"] = pd.cut(df["income_composition_of_resources"], bins=bins)
# Rows whose source value is missing stay uncategorized; count them
df["income_composition_of_resources_cat"].isnull().sum()

160

In [138]:
# Bucket hiv/aids (infant deaths per 1000)
# No general criterion was found, so the cut points follow the box plot from the EDA
hiv_bands = [(0.099, 5.15), (5.15, 20), (20, 50.6)]
bins = pd.IntervalIndex.from_tuples(hiv_bands)
df["hiv/aids_cat"] = pd.cut(df["hiv/aids"], bins=bins)
# Number of rows that fall outside every band (or are missing)
df["hiv/aids_cat"].isnull().sum()

0

In [139]:
# Bucket schooling (number of years of schooling) by educational stage
# source: https://en.wikipedia.org/wiki/Educational_stage
schooling_bands = [(-1, 3), (3, 5), (5, 12), (12, 19), (19, 21)]
bins = pd.IntervalIndex.from_tuples(schooling_bands)
df["schooling_cat"] = pd.cut(df["schooling"], bins=bins)
# Rows whose source value is missing stay uncategorized; count them
df["schooling_cat"].isnull().sum()

160

In [140]:
# Set aside the records that miss either stratification feature, then remove
# them from `df` so that the stratified split only sees complete strata
strat_features = ["income_composition_of_resources", "schooling"]
lacks_strat_feature = df[strat_features].isnull().any(axis=1)
df_dropped = df[lacks_strat_feature]
df.dropna(subset=strat_features, inplace=True)
df[strat_features].isnull().sum()

income_composition_of_resources    0
schooling                          0
dtype: int64

In [141]:
# Show the number of instances dropped
df_dropped.shape[0]

160

**Note:** Only 160 rows (around 5.5% of the data) are dropped and therefore not stratified, so adding them back after the stratified sampling will do little harm to the distribution of the strata.

In [142]:
# Create a test set, stratifying on the combination of the four key features
strata = df[["hiv/aids_cat", "income_composition_of_resources_cat", "schooling_cat", "status"]]
pre_train_set, pre_test_set = train_test_split(
    df,
    test_size=test_size,
    random_state=42,
    stratify=strata,
)
pre_test_set.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_b,measles,...,hiv/aids,gdp,population,thinness_10_19_years,thinness_5_9_years,income_composition_of_resources,schooling,income_composition_of_resources_cat,hiv/aids_cat,schooling_cat
2456,Sri Lanka,2000,Developing,71.5,175.0,5,1.45,60.490981,,16527,...,0.1,875.412178,18655.0,15.3,15.5,0.677,12.4,"(0.549, 0.699]","(0.099, 5.15]","(12, 19]"
486,Cameroon,2009,Developing,54.8,373.0,54,5.89,9.042541,8.0,251,...,6.3,123.19538,19432541.0,6.3,6.3,0.473,9.2,"(-0.009, 0.549]","(5.15, 20.0]","(5, 12]"
1518,Libya,2003,Developing,71.3,144.0,3,0.01,295.116651,96.0,0,...,0.1,4676.96753,,5.6,5.4,0.74,16.0,"(0.699, 0.799]","(0.099, 5.15]","(12, 19]"
1952,Pakistan,2005,Developing,62.9,2.0,364,0.04,30.593208,7.0,2981,...,0.1,711.469946,15399667.0,21.2,21.7,0.487,6.1,"(-0.009, 0.549]","(0.099, 5.15]","(5, 12]"
65,Antigua and Barbuda,2014,Developing,76.2,131.0,0,8.56,2422.999774,99.0,0,...,0.2,12888.29667,,3.3,3.3,0.782,13.9,"(0.699, 0.799]","(0.099, 5.15]","(12, 19]"


In [143]:
# Split the set-aside rows with the same ratio (plain random sampling) ...
train_drop_set, test_drop_set = train_test_split(df_dropped, test_size=test_size, random_state=42)

# ... append them to the stratified parts and shuffle each resulting set
train_set = shuffle(pd.concat([pre_train_set, train_drop_set]), random_state=42)
test_set = shuffle(pd.concat([pre_test_set, test_drop_set]), random_state=42)

test_set.head() # oops! forget to drop categorized columns

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_b,measles,...,hiv/aids,gdp,population,thinness_10_19_years,thinness_5_9_years,income_composition_of_resources,schooling,income_composition_of_resources_cat,hiv/aids_cat,schooling_cat
2868,Venezuela (Bolivarian Republic of),2005,Developing,73.6,158.0,9,7.92,0.0,88.0,0,...,0.1,,,1.7,1.6,0.7,11.8,"(0.699, 0.799]","(0.099, 5.15]","(5, 12]"
1146,Honduras,2007,Developing,73.0,16.0,5,3.16,222.482334,93.0,0,...,0.7,1592.572182,777972.0,2.4,2.3,0.59,10.9,"(0.549, 0.699]","(0.099, 5.15]","(5, 12]"
2754,United Arab Emirates,2007,Developing,75.6,87.0,1,1.69,3759.457226,92.0,0,...,0.1,42672.61323,,5.1,4.9,0.826,12.9,"(0.799, 1.0]","(0.099, 5.15]","(12, 19]"
2431,Spain,2009,Developed,81.6,66.0,2,9.99,5047.254058,96.0,41,...,0.1,32333.4661,46362946.0,0.6,0.5,0.858,16.3,"(0.799, 1.0]","(0.099, 5.15]","(12, 19]"
2165,Rwanda,2001,Developing,48.6,438.0,33,5.72,0.388254,,896,...,8.1,21.569654,832946.0,7.4,7.5,0.332,7.1,"(-0.009, 0.549]","(5.15, 20.0]","(5, 12]"


In [144]:
# The helper category columns were only needed for stratification; remove them
# from both sets in place
helper_columns = ["income_composition_of_resources_cat", "schooling_cat", "hiv/aids_cat"]
for subset in (train_set, test_set):
    subset.drop(columns=helper_columns, inplace=True)
test_set.head()

Unnamed: 0,country,year,status,life_expectancy,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_b,measles,...,polio,total_expenditure,diphtheria,hiv/aids,gdp,population,thinness_10_19_years,thinness_5_9_years,income_composition_of_resources,schooling
2868,Venezuela (Bolivarian Republic of),2005,Developing,73.6,158.0,9,7.92,0.0,88.0,0,...,8.0,4.69,87.0,0.1,,,1.7,1.6,0.7,11.8
1146,Honduras,2007,Developing,73.0,16.0,5,3.16,222.482334,93.0,0,...,94.0,7.89,94.0,0.7,1592.572182,777972.0,2.4,2.3,0.59,10.9
2754,United Arab Emirates,2007,Developing,75.6,87.0,1,1.69,3759.457226,92.0,0,...,94.0,2.57,92.0,0.1,42672.61323,,5.1,4.9,0.826,12.9
2431,Spain,2009,Developed,81.6,66.0,2,9.99,5047.254058,96.0,41,...,96.0,9.52,96.0,0.1,32333.4661,46362946.0,0.6,0.5,0.858,16.3
2165,Rwanda,2001,Developing,48.6,438.0,33,5.72,0.388254,,896,...,76.0,4.38,77.0,8.1,21.569654,832946.0,7.4,7.5,0.332,7.1


In [145]:
# Check whether the proportions of status on the whole data and test set are close
def status_proportion(df):
    """Return, for each `status` value, its count divided by the number of rows of `df`."""
    counts = df["status"].value_counts()
    n_rows = len(df)
    return counts / n_rows

# Put the status shares of the full labelled data and of the test set side by side
overall_share = status_proportion(df_origin)
test_share = status_proportion(test_set)
compare_props = pd.DataFrame({"Overall": overall_share, "Stratified": test_share}).sort_index()

# Relative deviation of the test-set share from the overall share, in percent
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
compare_props # good! :>

Unnamed: 0,Overall,Stratified,Strat. %error
Developed,0.174863,0.173497,-0.78125
Developing,0.825137,0.826503,0.165563


# 3. Preprocessing

**Note:** The three columns `country`, `year` and `status` are at column positions 0, 1 and 2. This is for the later work with `numpy` if needed.

In [146]:
# Separate features and labels: X / y come from the training set,
# X_bar / y_bar from the test set
label = "life_expectancy"

X, y = train_set.drop(columns=[label]), train_set[label].copy()
X_bar, y_bar = test_set.drop(columns=[label]), test_set[label].copy()

In [148]:
# Separate numeric columns and categorical columns
cat_attrbs = ["status", "country"]
attrbs = X.columns.tolist()
# Everything that is not categorical is treated as numeric
num_attrbs = X.columns.drop(cat_attrbs).tolist()

**Suggestion:** May need to customize some transformers!

## 3.1. Imputer

**Note:** There are two ways to impute the missing values. The first method is the one currently used in the full pipeline.

### 3.1.1. Interpolation Imputer

In [149]:
# Build a transformer class - interpolation imputer (by country)
class GroupInterpImputer(BaseEstimator, TransformerMixin):
    '''Transformer that fills NaN in numeric columns by interpolating over time
    within each group.

    For every group (by default every country) the rows are ordered by
    ``order_by`` and each numeric column is linearly interpolated down the
    rows, i.e. between the observations of the same feature. Gaps before the
    first or after the last observation of a group are filled as well
    (``limit_direction="both"``). Values that are still missing afterwards,
    because a group has no observation at all for a column, fall back to the
    column median.

    Parameters
    ----------
    groupby : str, default "country"
        Column whose values define the groups.
    order_by : str, default "year"
        Column used to order the rows of a group before interpolating.

    Attributes
    ----------
    fallback_ : pd.Series
        Median of each numeric column seen in ``fit``.

    Notes
    -----
    The previous implementation interpolated with ``axis=1``, i.e. across the
    neighbouring feature columns of a single row, which made the sort by year
    ineffective and mixed unrelated quantities.
    '''

    def __init__(self, groupby="country", order_by="year"):
        self.groupby = groupby
        self.order_by = order_by

    def _numeric_columns(self, X):
        '''Return the names of the numeric columns of X, grouping column excluded.'''
        return [col for col in X.select_dtypes(include="number").columns
                if col != self.groupby]

    def fit(self, X, y=None):
        '''Learn the column medians used as a last-resort fill value.'''
        self.fallback_ = X[self._numeric_columns(X)].median()
        return self

    def transform(self, X): # pd.DataFrame
        '''Return a copy of X with the numeric NaN values imputed; X itself is untouched.'''
        df = X.copy()
        num_attrbs = self._numeric_columns(df)

        # Iterate over the untouched input; results are written into the copy.
        # Rows are matched back through their index labels, as before.
        for _, group in X.groupby(self.groupby, sort=False):
            ordered = group.sort_values(by=[self.order_by])[num_attrbs]
            df.loc[ordered.index, num_attrbs] = ordered.interpolate(axis=0, limit_direction="both")

        # transform() may be called without fit() (the transformer used to be
        # stateless); in that case derive the medians from the data at hand
        fallback = getattr(self, "fallback_", None)
        if fallback is None:
            fallback = df[num_attrbs].median()
        df[num_attrbs] = df[num_attrbs].fillna(fallback)

        return df

In [150]:
# Try the imputer on a copy of the training set and count what is still missing
imputer_tester = copy.deepcopy(train_set)
interp_imputer = GroupInterpImputer()
imputed_tester = interp_imputer.transform(imputer_tester) # pd.DataFrame
imputed_tester.isnull().sum()

country                            0
year                               0
status                             0
life_expectancy                    0
adult_mortality                    0
infant_deaths                      0
alcohol                            0
percentage_expenditure             0
hepatitis_b                        0
measles                            0
bmi                                0
under_five_deaths                  0
polio                              0
total_expenditure                  0
diphtheria                         0
hiv/aids                           0
gdp                                0
population                         0
thinness_10_19_years               0
thinness_5_9_years                 0
income_composition_of_resources    0
schooling                          0
dtype: int64

### 3.1.2. Simple Imputers by an Attribute (Optional)

In [151]:
class AttrbBasedImputer(BaseEstimator, TransformerMixin):
    '''Impute missing numeric values with a statistic computed per group.

    ``fit`` learns, for every value of ``attrib``, the ``strategy`` statistic
    of each numeric column (through ``SimpleImputer``). ``transform`` fills
    only the missing cells of a group with that group's statistics. Cells that
    are still missing afterwards (the group was not seen in ``fit``, or it had
    no observation for the column) are filled with the statistic of the whole
    fitting data.

    Parameters
    ----------
    attrib : str, default "country"
        Column whose values define the groups.
    strategy : str, default "mean"
        Strategy passed on to ``SimpleImputer``.

    Attributes
    ----------
    num_attrbs_ : list of str
        Numeric columns handled by the imputer.
    statistics_ : dict
        Maps a group value to the array of its per-column statistics, ordered
        like ``num_attrbs_``.
    global_statistics_ : ndarray
        Per-column statistics of the whole fitting data.

    Notes
    -----
    The previous ``transform`` assigned the statistics to every row of a
    group, overwriting the observed values as well, and raised ``KeyError``
    for groups that were absent during ``fit``.
    '''

    def __init__(self, attrib="country", strategy="mean"):
        self.attrib = attrib
        self.strategy = strategy

    def _numeric_columns(self, X):
        '''Return the names of the numeric columns of X, grouping column excluded.'''
        return [col for col in X.select_dtypes(include="number").columns
                if col != self.attrib]

    def _fit_statistics(self, frame):
        '''Return the per-column statistics SimpleImputer learns on `frame`.'''
        imp = SimpleImputer(strategy=self.strategy)
        imp.fit(frame)
        return imp.statistics_

    def fit(self, X, y=None):
        self.num_attrbs_ = self._numeric_columns(X)
        self.global_statistics_ = self._fit_statistics(X[self.num_attrbs_])

        self.statistics_ = {}
        # dropna(): a missing group key matches no row and cannot be fitted
        for group in X[self.attrib].dropna().unique().tolist():
            in_group = X[self.attrib] == group
            self.statistics_[group] = self._fit_statistics(X.loc[in_group, self.num_attrbs_])

        return self

    def transform(self, X):
        df = X.copy()
        num_attrbs = self.num_attrbs_

        for group in df[self.attrib].dropna().unique().tolist():
            if group not in self.statistics_:
                continue # unseen group: handled by the global fallback below
            in_group = df[self.attrib] == group
            # Statistics that are themselves NaN carry no information; skip them
            fill_values = pd.Series(self.statistics_[group], index=num_attrbs).dropna()
            # fillna() touches the missing cells only, observed values are kept
            df.loc[in_group, num_attrbs] = df.loc[in_group, num_attrbs].fillna(fill_values)

        global_values = pd.Series(self.global_statistics_, index=num_attrbs).dropna()
        df[num_attrbs] = df[num_attrbs].fillna(global_values)

        return df

**Note:** The pipeline flow is:

imputer -> feature engineer (optional) -> scaler/one-hot -> outlier detector -> clusterer (optional) -> model

## 3.2. Feature Engineering

**Note:** This section can be a large workload. However, right now just transform `country` to some relevant features or drop it.

### 3.2.1. Country to Continent

In [152]:
# Load the country -> continent mapping from its JSON file for later usage.
# Forward slashes are used as for every other dataset path in this notebook:
# the former ".\dataset\..." literal contained the invalid escape sequences
# "\d" and "\c" and only resolved on Windows.
with open("./dataset/continent.json", "r") as f:
    country_continent = json.load(f)

country_continent

{'Afghanistan': 'Asia',
 'Albania': 'Europe',
 'Algeria': 'Africa',
 'Angola': 'Africa',
 'Antigua and Barbuda': 'Americas',
 'Argentina': 'Americas',
 'Armenia': 'Asia',
 'Australia': 'Oceania',
 'Austria': 'Europe',
 'Azerbaijan': 'Asia',
 'Bahamas': 'Americas',
 'Bahrain': 'Asia',
 'Bangladesh': 'Asia',
 'Barbados': 'Americas',
 'Belarus': 'Europe',
 'Belgium': 'Europe',
 'Belize': 'Americas',
 'Benin': 'Africa',
 'Bhutan': 'Asia',
 'Bolivia (Plurinational State of)': 'Americas',
 'Bosnia and Herzegovina': 'Europe',
 'Botswana': 'Africa',
 'Brazil': 'Americas',
 'Brunei Darussalam': 'Asia',
 'Bulgaria': 'Europe',
 'Burkina Faso': 'Africa',
 'Burundi': 'Africa',
 "Côte d'Ivoire": 'Africa',
 'Cabo Verde': 'Africa',
 'Cambodia': 'Asia',
 'Cameroon': 'Africa',
 'Canada': 'Americas',
 'Central African Republic': 'Africa',
 'Chad': 'Africa',
 'Chile': 'Americas',
 'China': 'Asia',
 'Colombia': 'Americas',
 'Comoros': 'Africa',
 'Congo': 'Africa',
 'Costa Rica': 'Americas',
 'Croatia': 'Eu

In [153]:
# Build a transformer - convert country to continent
class ContinentConverter(BaseEstimator, TransformerMixin):
    '''Transformer that adds a `continent` column derived from `country`.

    The `country` column itself is kept. Countries that are absent from the
    mapping get the continent "Unknown".

    Parameters
    ----------
    continent : dict, default country_continent
        Maps a country name to its continent.
    '''

    def __init__(self, continent=country_continent):
        self.continent = continent

    def fit(self, X, y=None):
        return self # nothing to learn

    def transform(self, X): # pd.DataFrame
        '''Return a copy of X with the additional `continent` column.'''
        df = X.copy()
        lookup = self.continent
        # One pass over the column instead of a per-row `.loc` round trip; this
        # also works with a non-unique index and creates the column even for
        # an empty frame
        df["continent"] = df["country"].map(lambda name: lookup.get(name, "Unknown"))
        return df

In [154]:
# Chain the imputer and the continent converter and run them on the training features
_pipeline1 = Pipeline(steps=[
    ("intern_imputer", GroupInterpImputer()),
    ("continent_converter", ContinentConverter()),
])
X_transformed = _pipeline1.fit_transform(X)
X_transformed.isnull().sum() # no null

country                            0
year                               0
status                             0
adult_mortality                    0
infant_deaths                      0
alcohol                            0
percentage_expenditure             0
hepatitis_b                        0
measles                            0
bmi                                0
under_five_deaths                  0
polio                              0
total_expenditure                  0
diphtheria                         0
hiv/aids                           0
gdp                                0
population                         0
thinness_10_19_years               0
thinness_5_9_years                 0
income_composition_of_resources    0
schooling                          0
continent                          0
dtype: int64

## 3.3. Text Handling

In [155]:
# Get dummies of those categorical attributes
pd.get_dummies(X, columns=["status"]).head()

Unnamed: 0,country,year,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_b,measles,bmi,under_five_deaths,...,diphtheria,hiv/aids,gdp,population,thinness_10_19_years,thinness_5_9_years,income_composition_of_resources,schooling,status_Developed,status_Developing
2484,Suriname,2004,214.0,0,4.55,4.232577,,0,5.4,0,...,85.0,0.9,36.48773,49363.0,3.5,3.5,0.0,11.1,0,1
2241,Saudi Arabia,2007,99.0,9,0.1,1307.89002,96.0,4648,61.6,10,...,96.0,0.1,16472.1665,,7.2,7.3,0.773,12.7,0,1
1870,Nicaragua,2006,17.0,3,3.69,21.411235,88.0,0,47.1,4,...,88.0,0.3,124.992617,545211.0,2.0,1.9,0.597,11.0,0,1
2478,Suriname,2010,19.0,0,5.26,99.080954,86.0,0,54.8,0,...,96.0,0.5,833.31332,52613.0,3.5,3.4,0.7,12.3,0,1
2932,Zimbabwe,2005,717.0,28,4.14,8.717409,65.0,420,27.5,43,...,68.0,30.3,444.76575,129432.0,9.0,9.0,0.406,9.3,0,1


## 3.4. Full Pipeline

In [156]:
# Pre-processing pipeline for training: impute first, then derive the continent
preprocessor = Pipeline(steps=[
    ("intern_imputer", GroupInterpImputer()),
    ("continent_converter", ContinentConverter()),
])

# Pre-processing pipeline for testing: same steps, separate instances
test_preprocessor = Pipeline(steps=[
    ("intern_imputer", GroupInterpImputer()),
    ("continent_converter", ContinentConverter()),
])

# Remember to get dummies!

In [162]:
# Get preprocessed whole data.
# Start from `df_origin`, the copy of the labelled data saved in section 2.2.2
# before `df` received the helper `*_cat` columns and lost rows in place, so
# this cell no longer needs section 2.1 to be re-run by hand.
df = preprocessor.fit_transform(df_origin)
df = pd.get_dummies(df, columns=["status", "continent"])

df.shape

(2928, 28)

In [163]:
# Training matrix: fit the preprocessor on the training features, then one-hot
# encode the two remaining categorical columns
X_train = pd.get_dummies(
    preprocessor.fit_transform(X),
    columns=["status", "continent"],
)
y_train = y

X_train.shape

(2196, 27)

In [164]:
# Get preprocessed data for testing.
# Only transform here: the preprocessor was fitted on the training features in
# the previous cell and must not be re-fitted on the test set.
X_test = preprocessor.transform(X_bar)
X_test = pd.get_dummies(X_test, columns=["status", "continent"])
# One-hot encoding each set separately can yield different columns; align the
# test matrix with the training matrix (a category missing from the test set
# becomes an all-zero column)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
y_test = y_bar

X_test.shape

(732, 27)

In [165]:
# Check if there is null in training set
X_train.isnull().sum()

country                            0
year                               0
adult_mortality                    0
infant_deaths                      0
alcohol                            0
percentage_expenditure             0
hepatitis_b                        0
measles                            0
bmi                                0
under_five_deaths                  0
polio                              0
total_expenditure                  0
diphtheria                         0
hiv/aids                           0
gdp                                0
population                         0
thinness_10_19_years               0
thinness_5_9_years                 0
income_composition_of_resources    0
schooling                          0
status_Developed                   0
status_Developing                  0
continent_Africa                   0
continent_Americas                 0
continent_Asia                     0
continent_Europe                   0
continent_Oceania                  0
d

In [166]:
# Check if there is null in test set
X_test.isnull().sum()

country                            0
year                               0
adult_mortality                    0
infant_deaths                      0
alcohol                            0
percentage_expenditure             0
hepatitis_b                        0
measles                            0
bmi                                0
under_five_deaths                  0
polio                              0
total_expenditure                  0
diphtheria                         0
hiv/aids                           0
gdp                                0
population                         0
thinness_10_19_years               0
thinness_5_9_years                 0
income_composition_of_resources    0
schooling                          0
status_Developed                   0
status_Developing                  0
continent_Africa                   0
continent_Americas                 0
continent_Asia                     0
continent_Europe                   0
continent_Oceania                  0
d

# 4. Save the Data

In [167]:
# Save the preprocessed data for machine learning phase:
# the whole data set first, then the feature matrices, then the labels
exports = [
    (df, './dataset/life_expectancy_prep.csv'),
    (X_train, './dataset/X_train.csv'),
    (X_test, './dataset/X_test.csv'),
    (y_train, './dataset/y_train.csv'),
    (y_test, './dataset/y_test.csv'),
]
for table, destination in exports:
    table.to_csv(destination, encoding='utf-8-sig', index=False)