# MACHINE LEARING
A notebook to implement machine learning techniques.

# 1. Setup

In [15]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import pandas as pd
import numpy as np
import os
import copy
import json

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

In [16]:
# Scikit-Learn modules in use

# prepare test set
from sklearn.model_selection import train_test_split

# preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# train and select models
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

from sklearn.cluster import KMeans
from sklearn.linear_model import ElasticNet
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

# metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error 

# cross-valiate model
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint, uniform

**Note:** I will jump directly to the main dishes (pipelines and metrics) since the data cleaning and EDA have been caried already.

# 2. Load the Data

In [17]:
# Load the preprocessed data
X_train = pd.read_csv('../dataset/X_train.csv').drop(columns=['country'])
y_train = pd.read_csv('../dataset/y_train.csv')

X_test = pd.read_csv('../dataset/X_test.csv').drop(columns=['country'])
y_test = pd.read_csv('../dataset/y_test.csv')

In [18]:
# Inspect the training set
X_train.head()

Unnamed: 0,year,adult_mortality,infant_deaths,alcohol,percentage_expenditure,hepatitis_b,measles,bmi,under_five_deaths,polio,...,thinness_5_9_years,income_composition_of_resources,schooling,status_Developed,status_Developing,continent_Africa,continent_Americas,continent_Asia,continent_Europe,continent_Oceania
0,2004,214.0,0,4.55,4.232577,2.116288,0,5.4,0,84.0,...,3.5,0.0,11.1,0,1,0,1,0,0,0
1,2007,99.0,9,0.1,1307.89002,96.0,4648,61.6,10,96.0,...,7.3,0.773,12.7,0,1,0,0,1,0,0
2,2006,17.0,3,3.69,21.411235,88.0,0,47.1,4,88.0,...,1.9,0.597,11.0,0,1,0,1,0,0,0
3,2010,19.0,0,5.26,99.080954,86.0,0,54.8,0,96.0,...,3.4,0.7,12.3,0,1,0,1,0,0,0
4,2005,717.0,28,4.14,8.717409,65.0,420,27.5,43,69.0,...,9.0,0.406,9.3,0,1,1,0,0,0,0


In [19]:
# Columns of the data
X_train.columns

Index(['year', 'adult_mortality', 'infant_deaths', 'alcohol',
       'percentage_expenditure', 'hepatitis_b', 'measles', 'bmi',
       'under_five_deaths', 'polio', 'total_expenditure', 'diphtheria',
       'hiv/aids', 'gdp', 'population', 'thinness_10_19_years',
       'thinness_5_9_years', 'income_composition_of_resources', 'schooling',
       'status_Developed', 'status_Developing', 'continent_Africa',
       'continent_Americas', 'continent_Asia', 'continent_Europe',
       'continent_Oceania'],
      dtype='object')

# 3. Preprocessing

**Note:** The preprocessing (continent converter & dummy attributes) has been held. However, since the method includes clustering (especially by K-Means) & regularized linear regression, scaling will be a good practice.

In [20]:
# Scale the data
num_attrbs = ['year', 'adult_mortality', 'infant_deaths', 'alcohol',
       'percentage_expenditure', 'hepatitis_b', 'measles', 'bmi',
       'under_five_deaths', 'polio', 'total_expenditure', 'diphtheria',
       'hiv/aids', 'gdp', 'population', 'thinness_10_19_years',
       'thinness_5_9_years', 'income_composition_of_resources', 'schooling']

scaler = ColumnTransformer([
    ("standard_scaler", StandardScaler(), num_attrbs),
], remainder="passthrough")

In [21]:
# Full pipeline
full_pipeline = Pipeline([
    ("stad_scaler", scaler)
])

X_prep = full_pipeline.fit_transform(X_train)
X_prep

array([[-0.76476795,  0.40158706, -0.25556559, ...,  0.        ,
         0.        ,  0.        ],
       [-0.11180122, -0.53186527, -0.18124577, ...,  1.        ,
         0.        ,  0.        ],
       [-0.3294568 , -1.19745737, -0.23079232, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.4117878 , -0.1016655 , -0.18950353, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.5411655 , -1.03511784,  0.18209561, ...,  0.        ,
         0.        ,  0.        ],
       [-0.76476795, -0.71043876, -0.23905008, ...,  0.        ,
         0.        ,  0.        ]])

In [22]:
# Preprocess the variable of interest
y_prep = y_train # nothing to do

# 4. Select and Train Models 

**Note:** In the first approach, we only use linear-relevant models such as SVM, Linear Regression and so on.

In [23]:
# Show RMSE of a model
def root_mean_squared_error(y, y_hat):
    mse = mean_squared_error(y, y_hat)
    rmse = np.sqrt(mse)
    return rmse

In [24]:
# Show the score of cross validation
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

## 4.1. Linear Regression

In [11]:
# Train a linear regressor
lin_reg = LinearRegression()
lin_reg.fit(X_prep, y_prep)

LinearRegression()

In [12]:
# Evaluate the training
root_mean_squared_error(y_prep, lin_reg.predict(X_prep))

4.025442305433962

In [13]:
# K-fold cross validation on the model
scores = np.sqrt(-cross_val_score(lin_reg, X_prep, y_prep, scoring="neg_mean_squared_error", cv=10))
display_scores(scores)

Scores: [4.10500811 3.81847688 3.4580502  4.40698856 3.98350413 4.29550662
 3.70116329 4.23798102 4.32449396 4.27712182]
Mean: 4.060829459402693
Standard deviation: 0.29697673964948745


## 4.2. Clustered Linear Regression

In [13]:
# Build a model of clustered Linear Regressions on clusters of K-Means
class ClusteredLinearRegression(BaseEstimator, TransformerMixin):
    
    def __init__(self, 
                 clusterer=KMeans(n_clusters=4, random_state=42), 
                 estimator=ElasticNet(alpha=0), 
                 classifier=None):
        
        self.clusterer = clusterer
        self.estimator = estimator
        self.classifier = classifier # if the clusterer cannot predict for the new instances
       
    def fit(self, X, y=None):
        # cluster the data
        self.clusterer.fit(X)
        
        # fit the data of each cluster using a elastic net
        lin_regs = {}
        clusters = np.unique(self.clusterer.labels_)
        for cluster in clusters:
            lin_reg = clone(self.estimator)
            lin_reg.fit(X[self.clusterer.labels_ == cluster], y[self.clusterer.labels_ == cluster])
            lin_regs[cluster] = lin_reg
            
        self.lin_regs_ = lin_regs
        
        # train classifier to classify the new instances (if needed)
        if self.classifier != None:
            self.classifier.fit(X, self.clusterer.labels_)
        
        return self.lin_regs_
    
    def transform(self, X):
        return self.clusterer.transform(X)
    
    def predict(self, X):
        y_hat = np.zeros(X.shape[0])
        
        # predict the cluster of new instances
        if self.classifier != None:
            clusters_pred = self.classifier.predict(X)
        else:
            clusters_pred = self.clusterer.predict(X)
        
        # predict the labels of new instances based on clusters
        for cluster in self.lin_regs_.keys():
            if X[clusters_pred == cluster].shape[0] > 0:
                y_hat_cluster = self.lin_regs_[cluster].predict(X[clusters_pred == cluster])
                y_hat[clusters_pred == cluster] = y_hat_cluster
        
        return y_hat

### 4.2.1. KMeans and Elastic Net

**Note:** The linear-based strategy is to use KMeans as the clusterer and ElasticNet (LR) as the estimator.

In [22]:
# Train a clusterd linear regressor
lin_regs = ClusteredLinearRegression(clusterer=KMeans(n_clusters=4, random_state=42), 
                                     estimator=ElasticNet())
lin_regs.fit(X_prep, y_prep)

{0: ElasticNet(), 1: ElasticNet(), 2: ElasticNet(), 3: ElasticNet()}

In [23]:
# Evaluate the training
root_mean_squared_error(y_prep, lin_regs.predict(X_prep))

4.938312264604737

In [24]:
# K-fold cross validation on the model
scores = np.sqrt(-cross_val_score(lin_regs, X_prep, y_prep, scoring="neg_mean_squared_error", 
                                  cv=10, error_score="raise"))
display_scores(scores)

Scores: [5.02244247 4.80598226 4.1638663  4.86888403 5.24965875 5.37413944
 4.67774444 5.36423063 5.06702584 5.20334974]
Mean: 4.979732389160712
Standard deviation: 0.351427907749583


### 4.2.2. KMeans, Elastic Net and KNN

In [25]:
# Train a clustered linear regressor (with KNN)
kmeans_lins_knn = ClusteredLinearRegression(clusterer=KMeans(n_clusters=4, random_state=42), 
                                           estimator=ElasticNet(), 
                                           classifier=KNeighborsClassifier())
kmeans_lins_knn.fit(X_prep, y_prep)

{0: ElasticNet(), 1: ElasticNet(), 2: ElasticNet(), 3: ElasticNet()}

In [26]:
# Evaluate the training
root_mean_squared_error(y_prep, kmeans_lins_knn.predict(X_prep))

4.91821472527952

In [27]:
# K-fold cross validation on the model
scores = np.sqrt(-cross_val_score(kmeans_lins_knn, X_prep, y_prep, scoring="neg_mean_squared_error", 
                                  cv=10, error_score="raise"))
display_scores(scores)

Scores: [4.9879717  4.73217517 4.26984356 4.7768414  5.23016244 5.43161385
 4.62100648 5.44751899 5.53747539 5.10687049]
Mean: 5.014147947620382
Standard deviation: 0.39195370364471704


# 5. Fine-tune Hyperparameters

**Note:** Randomized Search is performed first to narrow down the search space, then Grid Search will be performed on that narrowed space. The result param could be optimal or not, but it is expected to construct a better model based on that param.

## 5.1. Clustered Linear Regression

### 5.1.1. KMeans and Elastic Net

#### Randomized Search

In [29]:
# Perform randomized search
param_distribs = {
    "clusterer__n_clusters": randint(low=1, high=50),
    "estimator__alpha": uniform(loc=1e-4, scale=1),
    "estimator__l1_ratio": uniform(loc=0.01, scale=0.09)
}

lins_rndsearch = RandomizedSearchCV(ClusteredLinearRegression(), param_distributions=param_distribs,
                                     n_iter=50, cv=10, scoring='neg_mean_squared_error', 
                                     error_score="raise", random_state=42)
lins_rndsearch.fit(X_prep, y_prep)

RandomizedSearchCV(cv=10, error_score='raise',
                   estimator=ClusteredLinearRegression(), n_iter=50,
                   param_distributions={'clusterer__n_clusters': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BD1CE2F400>,
                                        'estimator__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BD1CE3D460>,
                                        'estimator__l1_ratio': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BD178B7790>},
                   random_state=42, scoring='neg_mean_squared_error')

In [37]:
# Evaluate score
cvres = lins_rndsearch.cv_results_
min_loss = np.inf
opt_param = None
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    if min_loss > np.sqrt(-mean_score):
        min_loss = np.sqrt(-mean_score)
        opt_param = params
print("Best score:", min_loss)
print("Best param:", opt_param)

Best score: 3.224143382222099
Best param: {'clusterer__n_clusters': 42, 'estimator__alpha': 0.04676566321361543, 'estimator__l1_ratio': 0.09763799669573131}


In [40]:
# Show the best estimator
lins_rndsearch.best_estimator_

ClusteredLinearRegression(clusterer=KMeans(n_clusters=42, random_state=42),
                          estimator=ElasticNet(alpha=0.04676566321361543,
                                               l1_ratio=0.09763799669573131))

### 5.1.2. KMeans, ElasticNet and KNN

In [32]:
# Perform randomized search
param_distribs = {
    "clusterer__n_clusters": randint(low=1, high=50),
    "estimator__alpha": uniform(loc=1e-4, scale=1),
    "estimator__l1_ratio": uniform(loc=0.01, scale=0.09),
    "classifier__n_neighbors": randint(low=3, high=15),
    "classifier__weights": ["distance"]
}

kmeans_lins_knn_rndsearch = RandomizedSearchCV(ClusteredLinearRegression(classifier=KNeighborsClassifier()), 
                                     param_distributions=param_distribs,
                                     n_iter=50, cv=10, scoring='neg_mean_squared_error', 
                                     error_score="raise", random_state=42)
kmeans_lins_knn_rndsearch.fit(X_prep, y_prep)

RandomizedSearchCV(cv=10, error_score='raise',
                   estimator=ClusteredLinearRegression(classifier=KNeighborsClassifier()),
                   n_iter=50,
                   param_distributions={'classifier__n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BD1CDB8EB0>,
                                        'classifier__weights': ['distance'],
                                        'clusterer__n_clusters': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BD1CE5F9D0>,
                                        'estimator__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BD1789C7F0>,
                                        'estimator__l1_ratio': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BD1CE3D880>},
                   random_state=42, scoring='neg_mean_squared_error')

In [38]:
# Evaluate score
cvres = kmeans_lins_knn_rndsearch.cv_results_
min_loss = np.inf
opt_param = None
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    if min_loss > np.sqrt(-mean_score):
        min_loss = np.sqrt(-mean_score)
        opt_param = params
print("Best score:", min_loss)
print("Best param:", opt_param)

Best score: 3.2030714717532476
Best param: {'classifier__n_neighbors': 13, 'classifier__weights': 'distance', 'clusterer__n_clusters': 42, 'estimator__alpha': 0.04676566321361543, 'estimator__l1_ratio': 0.09763799669573131}


In [39]:
# Show the best estimator
kmeans_lins_knn_rndsearch.best_estimator_

ClusteredLinearRegression(classifier=KNeighborsClassifier(n_neighbors=13,
                                                          weights='distance'),
                          clusterer=KMeans(n_clusters=42, random_state=42),
                          estimator=ElasticNet(alpha=0.04676566321361543,
                                               l1_ratio=0.09763799669573131))

# 6. Evaluate the System on the Test Set

In [25]:
# Recall test set
X_test_prep = full_pipeline.fit_transform(X_test)
y_test_prep = y_test["life_expectancy"].copy()

In [26]:
# Display score
def evaluate(y, y_hat):
    print("RMSE: ", root_mean_squared_error(y, y_hat))
    print("MAE: ", mean_absolute_error(y, y_hat))
    print("R2: ", r2_score(y, y_hat))

## 6.1. Clustered Linear Regression

### 6.1.1. Kmeans and Elastic Net

In [27]:
# Train a defaut
lin_regs = ClusteredLinearRegression(clusterer=KMeans(), 
                                     estimator=ElasticNet())
lin_regs.fit(X_prep, y_prep)
evaluate(y_prep, lin_regs.predict(X_prep))

RMSE:  5.004032507392436
MAE:  3.735802185694426
R2:  0.722892157774097


In [23]:
# Retrain the pure clustered regressor
final_kmeans_lins = ClusteredLinearRegression(clusterer=KMeans(n_clusters=42, random_state=42),
                                              estimator=ElasticNet(alpha=0.04676566321361543, l1_ratio=0.09763799669573131))
final_kmeans_lins.fit(X_prep, y_prep)
evaluate(y_prep, final_kmeans_lins.predict(X_prep))

RMSE:  2.6083772032732737
MAE:  1.9278512753076302
R2:  0.9247079080781789


In [24]:
# Evaluate the model on test set
y_test_pred = final_kmeans_lins.predict(X_test_prep)
evaluate(y_test_prep, y_test_pred)

RMSE:  3.2228855004965395
MAE:  2.309741297732761
R2:  0.8865192307843883


In [25]:
# Compute 95% confidence interval for generalization error (rmse only)
from scipy import stats
confidence = 0.95

squared_errors = (y_test_pred - y_test_prep) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors))) # t-distribution 

array([2.87131977, 3.53970404])

### 6.1.2. Kmeans, Elastic Net and KNN

In [30]:
# Train a defaut
kmeans_lins_knn = ClusteredLinearRegression(clusterer=KMeans(), 
                                           estimator=ElasticNet(), 
                                           classifier=KNeighborsClassifier())
kmeans_lins_knn.fit(X_prep, y_prep)
evaluate(y_prep, kmeans_lins_knn.predict(X_prep))

RMSE:  4.997367945250951
MAE:  3.7143473626913175
R2:  0.7236297919163911


In [26]:
# Retrain the pure clustered regressor
final_kmeans_lins_knn = ClusteredLinearRegression(classifier=KNeighborsClassifier(n_neighbors=13, weights='distance'),
                                                  clusterer=KMeans(n_clusters=42, random_state=42),
                                                  estimator=ElasticNet(alpha=0.04676566321361543, l1_ratio=0.09763799669573131))
final_kmeans_lins_knn.fit(X_prep, y_prep)
evaluate(y_prep, final_kmeans_lins_knn.predict(X_prep))

RMSE:  2.6083772032732737
MAE:  1.9278512753076302
R2:  0.9247079080781789


In [27]:
# Evaluate the model on test set
y_test_pred = final_kmeans_lins_knn.predict(X_test_prep)
evaluate(y_test_prep, y_test_pred)

RMSE:  3.1814351000713206
MAE:  2.289209747555816
R2:  0.8894194732863653


In [28]:
# Compute 95% confidence interval for generalization error (rmse only)
from scipy import stats
confidence = 0.95

squared_errors = (y_test_pred - y_test_prep) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors))) # t-distribution

array([2.85096809, 3.48066654])