In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every figure drawn after this point.
sns.set()

# [Auto-mpg](https://archive.ics.uci.edu/ml/datasets/auto+mpg) Dataset

In [17]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as MSE

In [2]:
# Load the Auto-MPG data from a CSV in the notebook's working directory
# and preview the first rows.
auto = pd.read_csv('./auto.csv')
auto.head()

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0
2,36.1,91.0,60,1800,16.4,Asia,10.0
3,18.5,250.0,98,3525,19.0,US,15.0
4,34.3,97.0,78,2188,15.8,Europe,10.0


In [4]:
# Target: the `mpg` column.
y = auto['mpg']

# Predictors: every other column, with the categorical `origin` expanded into
# one indicator column per category (no level is dropped).
X = pd.get_dummies(auto.drop(columns='mpg'), columns=['origin'], drop_first=False)

In [5]:
X.shape

(392, 8)

In [6]:
y.shape

(392,)

In [8]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

## Decision Tree Regressor

In [9]:
# Shallow regression tree: at most 4 levels deep and, because the value is a
# float, every leaf must keep at least 26% of the training samples.
dt = DecisionTreeRegressor(
    max_depth=4,
    min_samples_leaf=0.26,
    random_state=1,
)

In [16]:
# 10-fold cross-validation on the training split, using all available cores.
# The scorer reports *negated* MSE, so the leading minus restores plain MSE.
mse_cv_scores = -cross_val_score(
    estimator=dt,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)

# RMSE is taken as the square root of the mean per-fold MSE.
rmse_cv = np.sqrt(mse_cv_scores.mean())
print(f'CV RMSE: {rmse_cv: .2f}')

CV RMSE:  5.14


In [18]:
# Training error: fit on the training split and predict that same data.
# (`fit` returns the fitted estimator, so the calls can be chained.)
y_pred_train = dt.fit(X_train, y_train).predict(X_train)

train_mse = MSE(y_train, y_pred_train)
rmse_train = np.sqrt(train_mse)
print(f'Train RMSE: {rmse_train: .2f}')

Train RMSE:  5.15


Baseline RMSE is 5.1. Both the train and CV RMSE are greater than the baseline, which suggests that the model is probably underfitting (high bias).

#  [Indian Liver Patient Dataset](https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset))

In [117]:
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

In [98]:
# Load the Indian Liver Patient data from a CSV in the notebook's working
# directory and preview the first rows.
liver = pd.read_csv('./liver.csv')
liver.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [99]:
liver.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [100]:
# Fraction of missing values per column.
# `isna().mean()` averages the boolean mask column by column. The previous
# `np.sum(df) / len(df)` form relied on the `axis=None` DataFrame reduction,
# which pandas has deprecated and plans to change to return a single scalar.
# Rule of thumb used here: if < 5% of values are missing, dropping those rows is ok.
liver.isna().mean()

Age                           0.000000
Gender                        0.000000
Total_Bilirubin               0.000000
Direct_Bilirubin              0.000000
Alkaline_Phosphotase          0.000000
Alamine_Aminotransferase      0.000000
Aspartate_Aminotransferase    0.000000
Total_Protiens                0.000000
Albumin                       0.000000
Albumin_and_Globulin_Ratio    0.006861
Dataset                       0.000000
dtype: float64

In [101]:
# Discard rows containing any missing value, then renumber the index 0..n-1 so
# that later index-aligned assignments against freshly built frames line up.
liver = liver.dropna().reset_index(drop=True)

In [102]:
liver.shape

(579, 11)

In [103]:
# Work on a copy so the cleaned `liver` frame stays available (the target
# column is read from it later), and list the column names for reference.
liver_ = liver.copy()
liver_.columns.values

array(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset'], dtype=object)

In [104]:
# Encode gender as a 0/1 flag before the text column is removed.
gender = liver_['Gender'].map({'Female': 0, 'Male': 1})

# Standardise the remaining numeric predictors (the `Gender` text column and
# the `Dataset` target are excluded). `liver_` becomes a plain NumPy array here.
# NOTE(review): `scale` uses the mean/std of *all* rows and runs before
# `train_test_split`, so test-set statistics leak into the training features.
# Consider fitting a `StandardScaler` on the training split only.
liver_ = scale(liver_.drop(columns=['Gender', 'Dataset']))

In [106]:
# Labels for the standardised array, in the same order as the numeric columns
# that were passed to `scale`.
cols = [
    'Age_std',
    'Total_Bilirubin_std',
    'Direct_Bilirubin_std',
    'Alkaline_Phosphotase_std',
    'Alamine_Aminotransferase_std',
    'Aspartate_Aminotransferase_std',
    'Total_Proteins_std',
    'Albumin_std',
    'Albumin_and_Globulin_Ratio_std',
]

# Wrap the array in a DataFrame and append the 0/1 gender flag as the last
# column; the flag is matched to rows through the shared 0..n-1 index.
liver_prepped = pd.DataFrame(liver_, columns=cols).assign(
    is_male=gender.astype('int')
)
liver_prepped.head()

Unnamed: 0,Age_std,Total_Bilirubin_std,Direct_Bilirubin_std,Alkaline_Phosphotase_std,Alamine_Aminotransferase_std,Aspartate_Aminotransferase_std,Total_Proteins_std,Albumin_std,Albumin_and_Globulin_Ratio_std,is_male
0,1.247403,-0.42032,-0.495414,-0.42887,-0.355832,-0.319111,0.293722,0.203446,-0.14739,0
1,1.062306,1.218936,1.423518,1.675083,-0.093573,-0.035962,0.939655,0.077462,-0.648461,1
2,1.062306,0.640375,0.926017,0.816243,-0.115428,-0.146459,0.478274,0.203446,-0.178707,1
3,0.815511,-0.372106,-0.388807,-0.449416,-0.36676,-0.312205,0.293722,0.329431,0.16578,1
4,1.679294,0.093956,0.179766,-0.395996,-0.295731,-0.177537,0.755102,-0.930414,-1.713237,1


In [107]:
# Features: the standardised predictors plus the gender flag.
X = liver_prepped.copy()
y = liver['Dataset'] # this column indicates whether a patient has liver disease or not

In [108]:
X.shape

(579, 10)

In [109]:
y.shape

(579,)

In [110]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified; consider `stratify=y` if the
# classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [111]:
X_train.shape

(405, 10)

In [112]:
X_test.shape

(174, 10)

## Ensemble Learning

In [115]:
# Three different base learners for the ensemble.
# NOTE: `dt` is rebound here; it previously named the regression tree used
# in the Auto-MPG section.
lr = LogisticRegression(random_state=1)
knn = KNN(n_neighbors=27)
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=1)

In [116]:
# (display name, estimator) pairs, the format `VotingClassifier` expects for
# its `estimators` argument.
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
]

In [119]:
# Fit each base learner on the training split and report its accuracy on the
# held-out test split.
for clf_name, clf in classifiers:
    # `fit` returns the fitted estimator, so prediction can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f'{clf_name} : {acc: .3f}')

Logistic Regression :  0.759
K Nearest Neighbours :  0.701
Classification Tree :  0.730


Will there be better performance with a `VotingClassifier`?

In [120]:
# initiate a voting classifier
# No `voting` argument is passed, so scikit-learn's default (hard, i.e.
# majority-rule, voting over the predicted labels) applies.
vc = VotingClassifier(estimators=classifiers)

In [121]:
# Fit the ensemble on the training split; as the last expression of the cell,
# the returned estimator's repr is displayed.
vc.fit(X_train, y_train)

VotingClassifier(estimators=[('Logistic Regression',
                              LogisticRegression(random_state=1)),
                             ('K Nearest Neighbours',
                              KNeighborsClassifier(n_neighbors=27)),
                             ('Classification Tree',
                              DecisionTreeClassifier(min_samples_leaf=0.13,
                                                     random_state=1))])

In [122]:
# Score the ensemble on the held-out test split.
y_pred = vc.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Voting Classifier: {acc: .3f}')

Voting Classifier:  0.770
