In [2]:
# Import libraries
import pandas as pd
import numpy as np

# Read the CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
DATA_PATH = 'liver_data.csv'
df = pd.read_csv(DATA_PATH)
df.head(5)


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
# Clean the raw frame: rename the label column, binarise it, encode Gender,
# and drop incomplete rows.
#
# The steps build a new frame instead of mutating in place, and every step is
# a no-op on data that is already clean, so this cell can be re-run safely.
# (Previously a second run pushed the already-encoded 0/1 Gender values through
# `.map({'Male': 1, 'Female': 0})`, which silently turned the column into NaN.)

# Rename target column (no-op if it is already called 'Target')
df = df.rename(columns={'Dataset': 'Target'})

# Encode Gender column: Male → 1, Female → 0 — only while it still holds labels
gender = df['Gender']
if not pd.api.types.is_numeric_dtype(gender):
    gender = gender.map({'Male': 1, 'Female': 0})

df = (
    df.assign(
        # Convert 2 → 0 for binary classification
        Target=df['Target'].replace(2, 0),
        Gender=gender,
    )
    # Drop rows with missing values. Done after the encoding so that a gender
    # label outside {'Male', 'Female'} (which `.map` turns into NaN) is
    # removed here instead of leaking NaN into the feature matrix.
    .dropna()
    # `.map` yields floats whenever it produced a NaN; restore the integer dtype.
    .astype({'Gender': 'int64'})
)

# Check data
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 579 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         579 non-null    int64  
 1   Gender                      579 non-null    int64  
 2   Total_Bilirubin             579 non-null    float64
 3   Direct_Bilirubin            579 non-null    float64
 4   Alkaline_Phosphotase        579 non-null    int64  
 5   Alamine_Aminotransferase    579 non-null    int64  
 6   Aspartate_Aminotransferase  579 non-null    int64  
 7   Total_Protiens              579 non-null    float64
 8   Albumin                     579 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Target                      579 non-null    int64  
dtypes: float64(5), int64(6)
memory usage: 54.3 KB


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import pickle

# Separate features and labels
X = df.drop('Target', axis=1)
y = df['Target']

# Split into training and testing sets BEFORE any fitting, so the held-out rows
# cannot influence preprocessing. The row assignment depends only on the number
# of samples and random_state, so it is the same partition as splitting the
# scaled array.
# NOTE(review): the classes are imbalanced (the `y.value_counts()` cell reports
# 414 vs 165); consider `stratify=y` here — not applied because it would change
# the partition and therefore every reported metric.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Normalize features: learn min/max from the training rows only, then apply
# the same transform to the test rows (test values may fall slightly outside
# [0, 1], which is expected).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaler; kept so the `X_scaled`
# name stays defined for any code that still refers to it.
X_scaled = scaler.transform(X)

# Save scaler; the context manager guarantees the file is flushed and closed.
with open("normalizer.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Build and fit the decision tree in one step (fit() returns the estimator);
# the fixed seed keeps the result repeatable.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Label the held-out rows
y_pred_dt = dt_model.predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)
print("Decision Tree Accuracy:", dt_accuracy)
print(dt_report)


Decision Tree Accuracy: 0.5862068965517241
              precision    recall  f1-score   support

           0       0.41      0.28      0.33        43
           1       0.64      0.77      0.70        73

    accuracy                           0.59       116
   macro avg       0.53      0.52      0.52       116
weighted avg       0.56      0.59      0.56       116



In [6]:
from sklearn.ensemble import RandomForestClassifier

# Build and fit a 100-tree random forest in one step (fit() returns the
# estimator); the fixed seed keeps the result repeatable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train)

# Label the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Overall accuracy first, then the per-class precision / recall / F1 table
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print(rf_report)


Random Forest Accuracy: 0.6637931034482759
              precision    recall  f1-score   support

           0       0.59      0.30      0.40        43
           1       0.68      0.88      0.77        73

    accuracy                           0.66       116
   macro avg       0.64      0.59      0.58       116
weighted avg       0.65      0.66      0.63       116



In [7]:
# If not installed: pip install xgboost
from xgboost import XGBClassifier

# Initialize and train.
# `use_label_encoder` is no longer passed: the installed XGBoost ignores it and
# prints 'Parameters: { "use_label_encoder" } are not used.' on every fit.
# The seed is fixed to match the other two models in this notebook.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate: overall accuracy, then per-class precision / recall / F1
print("XGBoost Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.646551724137931
              precision    recall  f1-score   support

           0       0.54      0.30      0.39        43
           1       0.67      0.85      0.75        73

    accuracy                           0.65       116
   macro avg       0.61      0.58      0.57       116
weighted avg       0.62      0.65      0.62       116



In [8]:
# Summarise the held-out accuracy of every model in one place
print("🔎 Model Comparison:")

# Each label carries its own padding and trailing colon; the predictions come
# from the three model cells above.
for label, predictions in (
    ("Decision Tree Accuracy   :", y_pred_dt),
    ("Random Forest Accuracy   :", y_pred_rf),
    ("XGBoost Accuracy          :", y_pred_xgb),
):
    print(label, accuracy_score(y_test, predictions))


🔎 Model Comparison:
Decision Tree Accuracy   : 0.5862068965517241
Random Forest Accuracy   : 0.6637931034482759
XGBoost Accuracy          : 0.646551724137931


In [9]:
import pickle

# Save model. A context manager is used so the file is flushed and closed
# deterministically; a bare open(...) passed to pickle.dump is never closed
# explicitly.
# NOTE(review): the file name suggests 68% accuracy, but the Random Forest cell
# reports ~0.66. The name is kept because other code may load it by this path.
with open("rf_acc_68.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)

# Save scaler (already done earlier, but repeat if needed)
with open("normalizer.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

print("✅ Model and scaler saved!")


✅ Model and scaler saved!


In [10]:
# Class balance of the label column across the whole cleaned frame
# (`y` is df['Target'], taken before the train/test split).
y.value_counts()


Target
1    414
0    165
Name: count, dtype: int64

In [11]:
# NOTE(review): this only binds a plain string to a notebook-level name. No
# cell visible in this notebook reads `class_weight` afterwards, so the models
# trained above are unaffected by it. Presumably it was meant to be passed as
# the `class_weight=` keyword of a classifier to counter the class imbalance —
# confirm, then either wire it into the estimator and retrain, or remove it.
class_weight='balanced'


In [12]:
import pickle
# Re-save the random forest. No cell between the earlier save and this one
# re-fits `rf_model`, so this writes the same estimator again.
# The context manager closes the file deterministically instead of leaving the
# handle from a bare open(...) to the garbage collector.
with open("rf_acc_68.pkl", "wb") as model_file:
    pickle.dump(rf_model, model_file)
