In [1]:
#import libraries 
import numpy as np
import pandas as pd
#import libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns
#libraries for preprocessing data
from sklearn.preprocessing import LabelEncoder
#libraries for machine learning
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, precision_recall_curve

#### Submitted got accuracy score 80.487805 pts on New Test Data unseen prediction values.

In [2]:
# reading the csv file

liver_data = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/liver_patient_data/indian_liver_patient_dataset.csv')

In [3]:
liver_data.columns

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Liver_Problem'],
      dtype='object')

#### Dataset & Data Description

This dataset was downloaded from the UCI ML Repository:

Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

**Our goal is to build a logistic machine learning model that predicts whether a patient is healthy (non liver patient) or ill (liver patient) based on some clinical and demographic features (or input variables)**

##### Data Description:

This data set contains liver patient records and non liver patient records collected from North East of Andhra Pradesh, India. The "Liver_Problem" column is the target variable used to divide groups into liver patient ( Liver_Problem == 1) or not ( Liver_Problem == 2).

    * Liver_Problem == 1, implies the individual is a liver patient
    * Liver_Problem == 2, implies the individual is not a liver patient
   
The column names are as follows:

        * Age of the patient
        * Gender of the patient
        * Total Bilirubin
        * Direct Bilirubin
        * Alkaline Phosphotase
        * Alamine Aminotransferase
        * Aspartate Aminotransferase
        * Total Protiens
        * Albumin
        * Albumin and Globulin Ratio
        * Liver_Problem

In [4]:
liver_data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [5]:
liver_data.tail()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
495,32,Male,0.7,0.2,165,31,29,6.1,3.0,0.96,2
496,24,Male,1.0,0.2,189,52,31,8.0,4.8,1.5,1
497,67,Male,2.2,1.1,198,42,39,7.2,3.0,0.7,1
498,68,Male,1.8,0.5,151,18,22,6.5,4.0,1.6,1
499,55,Male,3.6,1.6,349,40,70,7.2,2.9,0.6,1


In [6]:
liver_data.shape

(500, 11)

In [7]:
liver_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 11 columns):
Age                           500 non-null int64
Gender                        500 non-null object
Total_Bilirubin               500 non-null float64
Direct_Bilirubin              500 non-null float64
Alkaline_Phosphotase          500 non-null int64
Alamine_Aminotransferase      500 non-null int64
Aspartate_Aminotransferase    500 non-null int64
Total_Protiens                500 non-null float64
Albumin                       500 non-null float64
Albumin_and_Globulin_Ratio    496 non-null float64
Liver_Problem                 500 non-null int64
dtypes: float64(5), int64(5), object(1)
memory usage: 43.1+ KB


In [8]:
liver_data.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Liver_Problem                 0
dtype: int64

In [9]:
liver_data.describe()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,496.0,500.0
mean,44.586,2.6038,1.1172,296.372,82.736,108.82,6.4448,3.1688,0.960907,1.3
std,16.5334,5.120238,2.066709,257.461676,194.366775,307.093557,1.08902,0.799741,0.294289,0.458717
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,32.75,0.8,0.2,175.0,23.0,24.75,5.7,2.6,0.795,1.0
50%,45.0,0.9,0.3,205.5,33.5,40.0,6.5,3.1,1.0,1.0
75%,58.0,2.2,1.0,298.0,59.25,79.25,7.2,3.8,1.1,2.0
max,85.0,75.0,14.2,2110.0,2000.0,4929.0,9.6,5.5,1.9,2.0


In [10]:
liver_data.corr()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
Age,1.0,0.057387,0.066546,0.087363,-0.095215,-0.027219,-0.21949,-0.274713,-0.202507,-0.150588
Total_Bilirubin,0.057387,1.0,0.781711,0.268049,0.273827,0.248864,-0.056948,-0.188618,-0.24189,-0.183162
Direct_Bilirubin,0.066546,0.781711,1.0,0.345133,0.336719,0.305176,-0.052779,-0.204776,-0.271533,-0.223604
Alkaline_Phosphotase,0.087363,0.268049,0.345133,1.0,0.123885,0.168421,-0.034374,-0.177984,-0.280934,-0.192232
Alamine_Aminotransferase,-0.095215,0.273827,0.336719,0.123885,1.0,0.793521,-0.038924,-0.025469,0.006784,-0.163415
Aspartate_Aminotransferase,-0.027219,0.248864,0.305176,0.168421,0.793521,1.0,-0.029802,-0.071393,-0.062038,-0.143285
Total_Protiens,-0.21949,-0.056948,-0.052779,-0.034374,-0.038924,-0.029802,1.0,0.832103,0.297391,0.029205
Albumin,-0.274713,-0.188618,-0.204776,-0.177984,-0.025469,-0.071393,0.832103,1.0,0.740581,0.140282
Albumin_and_Globulin_Ratio,-0.202507,-0.24189,-0.271533,-0.280934,0.006784,-0.062038,0.297391,0.740581,1.0,0.179792
Liver_Problem,-0.150588,-0.183162,-0.223604,-0.192232,-0.163415,-0.143285,0.029205,0.140282,0.179792,1.0


In [11]:
liver_data['Liver_Problem'].value_counts()

1    350
2    150
Name: Liver_Problem, dtype: int64

In [12]:
liver_data[liver_data['Albumin_and_Globulin_Ratio'].isnull()]

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
209,45,Female,0.9,0.3,189,23,33,6.6,3.9,,1
241,51,Male,0.8,0.2,230,24,46,6.5,3.1,,1
253,35,Female,0.6,0.2,180,12,15,5.2,2.7,,2
312,27,Male,1.3,0.6,106,25,54,8.5,4.8,,2


In [13]:
x = liver_data.groupby(['Liver_Problem'])['Albumin_and_Globulin_Ratio'].mean()
x

Liver_Problem
1    0.926437
2    1.041959
Name: Albumin_and_Globulin_Ratio, dtype: float64

In [14]:
x[1]

0.926436781609196

In [15]:
x[2]

1.0419594594594592

In [16]:
liver_data.loc[(liver_data['Liver_Problem'] == 1) & pd.isnull(liver_data['Albumin_and_Globulin_Ratio']), ['Albumin_and_Globulin_Ratio']] = x[1]

In [17]:
liver_data.loc[(liver_data['Liver_Problem'] == 2) & pd.isnull(liver_data['Albumin_and_Globulin_Ratio']), ['Albumin_and_Globulin_Ratio']] = x[2]

In [18]:
liver_data['Albumin_and_Globulin_Ratio'].isnull().sum()

0

In [19]:
le = LabelEncoder()
liver_data['Gender'] = le.fit_transform(liver_data['Gender'])
liver_data.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Liver_Problem
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


#### Separating input and target variables

In [20]:
X = liver_data.drop('Liver_Problem', axis = 1)
y = liver_data['Liver_Problem']

#### Splitting into train and test set

In [21]:
#Splitting into train and test set in 90:10 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 21)

#### Building Logistic Regression model

In [22]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [23]:
y_pred = lr.predict(X_test)

In [24]:
liver_data.columns

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Liver_Problem'],
      dtype='object')

In [25]:
np.round(lr.coef_, 2) > 0

array([[False,  True, False, False, False, False, False, False,  True,
        False]])

In [26]:
print("accuracy score: ",  accuracy_score(y_test, y_pred))

accuracy score:  0.8


In [27]:
print("Train data f1 score: ", f1_score(y_train,lr.predict(X_train)))
print("Test data f1 score: ", f1_score(y_test,y_pred))

Train data f1 score:  0.8099415204678363
Test data f1 score:  0.8863636363636364


In [28]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()     # ravel() is used to convert a 2D array to 1D array. The output by confusion matrix is a 2D array.
print("True Positive", tp)
print("True Negative", tn)
print("False Positive", fp)
print("False Negative", fn)

True Positive 1
True Negative 39
False Positive 2
False Negative 8


In [29]:
# Recall = TP / (TP + FN)
# Precision = TP / (TP + FP)
print("recall score: ", recall_score(y_test,y_pred))
print("precision score: ", precision_score(y_test,y_pred))

recall score:  0.9512195121951219
precision score:  0.8297872340425532


### Load New Test Data

In [30]:
test_new = pd.read_csv('https://raw.githubusercontent.com/dphi-official/Datasets/master/liver_patient_data/indian_liver_patient_new_testdataset.csv')

In [31]:
test_new.columns

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio'],
      dtype='object')

In [32]:
test_new.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,36,Male,2.8,1.5,305,28,76,5.9,2.5,0.7
1,42,Male,0.8,0.2,127,29,30,4.9,2.7,1.2
2,53,Male,19.8,10.4,238,39,221,8.1,2.5,0.4
3,32,Male,30.5,17.1,218,39,79,5.5,2.7,0.9
4,32,Male,32.6,14.1,219,95,235,5.8,3.1,1.1


In [33]:
test_new.shape

(82, 10)

In [34]:
test_new['Gender'] = le.fit_transform(test_new['Gender'])
test_new.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio
0,36,1,2.8,1.5,305,28,76,5.9,2.5,0.7
1,42,1,0.8,0.2,127,29,30,4.9,2.7,1.2
2,53,1,19.8,10.4,238,39,221,8.1,2.5,0.4
3,32,1,30.5,17.1,218,39,79,5.5,2.7,0.9
4,32,1,32.6,14.1,219,95,235,5.8,3.1,1.1


In [35]:
newtest_pred = lr.predict(test_new)

In [36]:
res = pd.DataFrame(newtest_pred)
res.index = test_new.index # its important for comparison
res.columns = ['Liver_Problem']
res.to_csv("'Liver_Problem'_results_9010AllColumns.csv")      # the csv file will be saved locally on the same location where this notebook is located.