In [1]:
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFECV

In [2]:
liverdata = pd.read_csv('C:\\projects\\liverdata\\indian_liver_patient.csv')
liverdata['Gender'] = liverdata['Gender'].replace({'Male': 1, 'Female': 0})
#liverdata = liverdata.dropna()
liverdata.head()

  liverdata['Gender'] = liverdata['Gender'].replace({'Male': 1, 'Female': 0})


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,0,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,1,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,1,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,1,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,1,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
liverdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    int64  
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(6)
memory usage: 50.2 KB


In [4]:
liverdata.groupby('Dataset').size()

Dataset
1    416
2    167
dtype: int64

In [5]:
liverdata.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

<h2>Step 2: Feature Engineering </h2>

In [7]:
feature_names = liverdata.columns[:10]
feature_names

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio'],
      dtype='object')

In [8]:
X = liverdata[feature_names]
Y = liverdata.Dataset

<h2> Step 3: Data standardization </h2>

<h4>Standardize features by removing the mean and scaling to unit variance.</h4>

In [10]:
# Features chosen based on RFECV result
# best_features = ['Age', 'Gender', 'Direct_Bilirubin','Alkaline_Phosphotase', 'Alamine_Aminotransferase', 'Total_Protiens','Albumin_and_Globulin_Ratio']
best_features = ['Age','Gender', 'Direct_Bilirubin','Alkaline_Phosphotase', 'Alamine_Aminotransferase', 'Albumin']

X_mod = StandardScaler().fit_transform(X[best_features])
y_mod = Y

In [11]:
# Split your data into training and testing (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X_mod,
    y_mod,
    random_state=42,
    test_size=0.20
)

Principal component analysis (PCA)¶
The main goal of a PCA analysis is to identify patterns in data. PCA aims to detect the correlation between variables. If a strong correlation between variables exists, the attempt to reduce the dimensionality only makes sense.

In [13]:
pca = PCA(n_components=2)

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print('PCA sum: {:.2f}%'.format(sum(pca.explained_variance_ratio_) * 100))

[0.26460569 0.20469215]
PCA sum: 46.93%


In [14]:
logreg = LogisticRegression()

# Train the model using the training sets and check score
logreg.fit(X_train, y_train)

# Predict Output
log_predicted= logreg.predict(X_test)

logreg_score = round(logreg.score(X_train, y_train) * 100, 2)
logreg_score_test = round(logreg.score(X_test, y_test) * 100, 2)

# Equation coefficient and Intercept
print('Logistic Regression Training Score: \n', logreg_score)
print('Logistic Regression Test Score: \n', logreg_score_test)

print('Accuracy: \n', accuracy_score(y_test,log_predicted))
print('Confusion Matrix: \n', confusion_matrix(y_test,log_predicted))
print('Classification Report: \n', classification_report(y_test,log_predicted))

Logistic Regression Training Score: 
 68.88
Logistic Regression Test Score: 
 70.09
Accuracy: 
 0.7008547008547008
Confusion Matrix: 
 [[78  9]
 [26  4]]
Classification Report: 
               precision    recall  f1-score   support

           1       0.75      0.90      0.82        87
           2       0.31      0.13      0.19        30

    accuracy                           0.70       117
   macro avg       0.53      0.51      0.50       117
weighted avg       0.64      0.70      0.66       117



<H2> Random Forest</h2>

In [16]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
# Predict Output
rf_predicted = random_forest.predict(X_test)

random_forest_score = round(random_forest.score(X_train, y_train) * 100, 2)
random_forest_score_test = round(random_forest.score(X_test, y_test) * 100, 2)
print('Random Forest Score: \n', random_forest_score)
print('Random Forest Test Score: \n', random_forest_score_test)
print('Accuracy: \n', accuracy_score(y_test,rf_predicted))
print(confusion_matrix(y_test,rf_predicted))
print(classification_report(y_test,rf_predicted))

Random Forest Score: 
 100.0
Random Forest Test Score: 
 78.63
Accuracy: 
 0.7863247863247863
[[78  9]
 [16 14]]
              precision    recall  f1-score   support

           1       0.83      0.90      0.86        87
           2       0.61      0.47      0.53        30

    accuracy                           0.79       117
   macro avg       0.72      0.68      0.70       117
weighted avg       0.77      0.79      0.78       117



In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
best_rf = grid_search.best_estimator_
best_rf_predicted = best_rf.predict(X_test)

print('Best Random Forest Test Score: \n', round(best_rf.score(X_test, y_test) * 100, 2))
print('Accuracy: \n', accuracy_score(y_test, best_rf_predicted))
print(confusion_matrix(y_test, best_rf_predicted))
print(classification_report(y_test, best_rf_predicted))


Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters found:  {'max_depth': 30, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best Random Forest Test Score: 
 76.07
Accuracy: 
 0.7606837606837606
[[76 11]
 [17 13]]
              precision    recall  f1-score   support

           1       0.82      0.87      0.84        87
           2       0.54      0.43      0.48        30

    accuracy                           0.76       117
   macro avg       0.68      0.65      0.66       117
weighted avg       0.75      0.76      0.75       117



 <h2> Convolutional Neural Network (CNN) model</h2>

In [19]:
import keras

In [43]:
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Ensure y_train and y_test are properly one-hot encoded
num_classes = 10  # Update this to your actual number of classes
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

# Define the model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))  # Output layer should match the number of classes

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test accuracy: ', accuracy)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None, 3, 2, 2, 2, 2, 2, 2, 10), output.shape=(None, 10)