# Predict Liver Disease

* Using an ensemble of Logistic Regression, kNN and Decision Tree models to predict whether a person has liver disease

Dataset https://archive.ics.uci.edu/ml/datasets/ILPD+(Indian+Liver+Patient+Dataset)

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
import os

In [2]:
# Import models
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.neighbors import KNeighborsClassifier as KNN

# Train test split
from sklearn.model_selection import train_test_split

# Accuracy measures
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix as cfsn

# Voting Classifier
from sklearn.ensemble import VotingClassifier

# Shared random seed, passed as random_state to the estimators instantiated below
seed = 1

In [3]:
# Read the raw patient records from the CSV file in the working directory
df_org = pd.read_csv(filepath_or_buffer="Indian Liver Patient Dataset.csv")

# Show the first five records as a quick sanity check
df_org.head(5)

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos,is_patient
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [5]:
# Number of missing values in each column
pd.isnull(df_org).sum()

age                 0
gender              0
tot_bilirubin       0
direct_bilirubin    0
tot_proteins        0
albumin             0
ag_ratio            0
sgpt                0
sgot                0
alkphos             4
is_patient          0
dtype: int64

In [6]:
# Distinct values present in the alkphos column (missing values show up as nan)
df_org["alkphos"].unique()

array([ 0.9 ,  0.74,  0.89,  1.  ,  0.4 ,  1.3 ,  1.1 ,  1.2 ,  0.8 ,
        0.6 ,  0.87,  0.7 ,  0.92,  0.55,  0.5 ,  1.85,  0.95,  1.4 ,
        1.18,  0.61,  1.34,  1.39,  1.6 ,  1.58,  1.25,  0.78,  0.76,
        1.55,  0.71,  0.62,  0.67,  0.75,  1.16,  1.5 ,  1.66,  0.96,
        1.38,  0.52,  0.47,  0.93,  0.48,  0.58,  0.69,  1.27,  1.12,
        1.06,  0.53,  1.03,  0.68,   nan,  1.9 ,  1.7 ,  1.8 ,  0.3 ,
        0.97,  0.35,  1.51,  0.64,  0.45,  1.36,  0.88,  1.09,  1.11,
        1.72,  2.8 ,  0.46,  0.39,  1.02,  2.5 ,  0.37])

In [8]:
# (rows, columns) of the frame as loaded, before any rows are removed
df_org.shape

(583, 11)

In [30]:
# Remove the rows whose alkphos value is missing.
# dropna targets exactly the NaN entries; the earlier `alkphos > 0` filter only
# removed them as a side effect of NaN comparisons being False and would also
# have silently discarded any zero or negative readings.
df_org = df_org.dropna(subset=["alkphos"])

# (rows, columns) after the missing rows have been dropped
df_org.shape

(579, 11)

In [31]:
# Separate the features from the target by column name rather than by
# hard-coded position, so the split stays correct if columns are added,
# removed or reordered in the CSV.
X = df_org.drop("is_patient", axis=1)   # every column except the label
y = df_org["is_patient"]                # label column

In [32]:
# Preview the first rows of the feature frame X
X.head()

Unnamed: 0,age,gender,tot_bilirubin,direct_bilirubin,tot_proteins,albumin,ag_ratio,sgpt,sgot,alkphos
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4


In [33]:
# Frequency of each gender label in the feature frame
X["gender"].value_counts()

Male      439
Female    140
Name: gender, dtype: int64

In [34]:
# Encode gender as a binary indicator: Female -> 1, Male -> 0.
# The nested dict restricts the replacement to exact labels in the gender
# column; the previous frame-wide regex replace scanned every column and
# matched substrings (the pattern 'Male' also matches inside 'Female').
# Re-running this cell is harmless because no string labels remain afterwards.
X = X.replace({"gender": {"Female": 1, "Male": 0}})

In [35]:
# Frequency of each value in the gender column after encoding
X["gender"].value_counts()

0    439
1    140
Name: gender, dtype: int64

In [36]:
# Preview the first entries of the target series y
y.head()

0    1
1    1
2    1
3    1
4    1
Name: is_patient, dtype: int64

In [15]:
# Base learners that are later combined into the ensemble.

# Logistic regression, seeded for reproducibility
lr = LR(random_state=seed)

# k-nearest neighbours voting over the 27 closest training points
knn = KNN(n_neighbors=27)

# Decision tree; a float min_samples_leaf is interpreted by scikit-learn as a
# fraction of the training samples, so every leaf must hold at least 13% of them
dt = DT(min_samples_leaf=0.13, random_state=seed)

# The (name, estimator) list is defined once, in the "Define a list with all
# models" cell; the duplicate that used to live here was overwritten before use.

In [37]:
# Hold out 30% of the rows for testing; reuse the notebook-wide seed (instead of
# a separate literal) so all sources of randomness are controlled in one place.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed
)

In [18]:
# Define a list with all models

# (display name, estimator) pairs, in the order they are evaluated
classifiers = [
    ("LogReg", lr),
    ("kNN", knn),
    ("Decision Tree", dt),
]

In [39]:
# Train each base model on the training split, then report its accuracy and
# confusion matrix on the held-out test split
for clf_name, clf in classifiers:
    # fit() returns the fitted estimator, so the prediction can be chained on
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    test_accuracy = accuracy_score(y_test, y_pred)
    cnf_matrix = cfsn(y_test, y_pred)

    print('{:s} : {:.3f}'.format(clf_name, test_accuracy))
    print(cnf_matrix)
    

LogReg : 0.747
[[120   9]
 [ 35  10]]
kNN : 0.684
[[107  22]
 [ 33  12]]
Decision Tree : 0.730
[[108  21]
 [ 26  19]]


In [41]:
# Combine the base models into a voting ensemble and train it in one step
# (fit() returns the fitted ensemble)
vc = VotingClassifier(estimators=classifiers).fit(X_train, y_train)

# Ensemble predictions for the held-out rows
y_pred = vc.predict(X_test)

# Score the ensemble against the true test labels
cnf_matrix = cfsn(y_test, y_pred)
ensemble_accuracy = accuracy_score(y_test, y_pred)

print("Voting Classifier: ", ensemble_accuracy)
print(cnf_matrix)

Voting Classifier:  0.718390804598
[[114  15]
 [ 34  11]]


### Obs

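* We have not tuned the Logistic Regression model; its input variables should first be transformed towards a normal distribution (e.g. scaled/standardized)
* The ensemble collects the predictions of all 3 models and returns the class with the most votes (majority voting)
* For some observations 2 of the 3 models predict incorrectly, so the majority vote is wrong as well, which gives the ensemble a lower score than the best single model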

# Bagging

In [42]:
# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier

# Bag 50 copies of the decision tree defined above.
# The base model is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` (deprecated in scikit-learn 1.2, removed in
# 1.4); the positional form works on both old and new releases.
bc = BaggingClassifier(dt, n_estimators=50, random_state=seed)

In [43]:
# Train the bagging ensemble and predict the held-out rows in one chain
# (fit() returns the fitted ensemble)
y_pred = bc.fit(X_train, y_train).predict(X_test)

# Accuracy on the test split
acc_test = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))

Test set accuracy of bc: 0.73


## Testing the model on Train data prediction

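* We compare the accuracy on the train set with the accuracy on the test set to check for bias vs. variance
* Train and test accuracy are nearly equal (0.72 vs 0.73): there is little sign of overfitting (variance), so the remaining error is shared by both sets and points mainly to bias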

In [44]:
# Predict the labels of the TRAINING rows with the fitted bagging ensemble
y_pred_tr = bc.predict(X_train)

# Accuracy on the training split. It is stored under its own name so the
# test-set score in acc_test is not overwritten, and the printed label now
# states which split the number refers to.
acc_train = accuracy_score(y_train, y_pred_tr)
print('Train set accuracy of bc: {:.2f}'.format(acc_train))

Test set accuracy of bc: 0.72
