In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("rabieelkharoua/predict-liver-disease-1700-records-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/rabieelkharoua/predict-liver-disease-1700-records-dataset?dataset_version_number=1...


100%|██████████| 69.6k/69.6k [00:00<00:00, 365kB/s]

Extracting files...
Path to dataset files: C:\Users\Office\.cache\kagglehub\datasets\rabieelkharoua\predict-liver-disease-1700-records-dataset\versions\1





In [3]:
df = pd.read_csv(path + "/Liver_disease_data.csv")

# Check for missing values
print(df.isnull().sum())


Age                   0
Gender                0
BMI                   0
AlcoholConsumption    0
Smoking               0
GeneticRisk           0
PhysicalActivity      0
Diabetes              0
Hypertension          0
LiverFunctionTest     0
Diagnosis             0
dtype: int64


In [4]:
# Check for duplicates
df = df.drop_duplicates()

In [5]:
skewed_features = ['Diabetes', 'Hypertension', 'Smoking']
# Print the actual column names in your DataFrame
print(df.columns)

# Check if the columns in skewed_features exist in the DataFrame
for col in skewed_features:
    if col not in df.columns:
        print(f"Column '{col}' not found in DataFrame")
        # Handle the missing column: either skip, rename, or create a new one

# Apply log1p to normalize skewed features
for col in skewed_features:
    if col in df.columns: # Only process if the column exists
        df[col] = np.log1p(df[col])

# Outlier removal using IQR method
Q1 = df[skewed_features].quantile(0.25)
Q3 = df[skewed_features].quantile(0.75)
IQR = Q3 - Q1

# Filter the DataFrame to remove outliers
mask = ~((df[skewed_features] < (Q1 - 1.5 * IQR)) | (df[skewed_features] > (Q3 + 1.5 * IQR))).any(axis=1)
df = df[mask]

Index(['Age', 'Gender', 'BMI', 'AlcoholConsumption', 'Smoking', 'GeneticRisk',
       'PhysicalActivity', 'Diabetes', 'Hypertension', 'LiverFunctionTest',
       'Diagnosis'],
      dtype='object')


In [6]:
X = df.drop("Diagnosis", axis=1)
y = df["Diagnosis"]

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [7]:
from sklearn.ensemble import ExtraTreesClassifier

# Feature importance with ExtraTrees
model = ExtraTreesClassifier()
model.fit(X_train, y_train)

# Sort by importance
feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_importances)

# Remove least important features
important_features = feature_importances[feature_importances > 0.01].index  # Keep important ones
X_train_selected = X_train[important_features]
X_test_selected = X_test[important_features]


AlcoholConsumption    0.237373
LiverFunctionTest     0.216812
BMI                   0.129906
Age                   0.124268
PhysicalActivity      0.111110
GeneticRisk           0.078231
Gender                0.053539
Smoking               0.048762
Diabetes              0.000000
Hypertension          0.000000
dtype: float64


In [8]:
important_features

Index(['AlcoholConsumption', 'LiverFunctionTest', 'BMI', 'Age',
       'PhysicalActivity', 'GeneticRisk', 'Gender', 'Smoking'],
      dtype='object')

In [8]:
smote = SMOTE(random_state=42)
X_bal, y_bal = smote.fit_resample(X_train_selected, y_train)

In [9]:
xgb = XGBClassifier(use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    colsample_bytree=0.6,
    gamma=0.2,
    learning_rate=0.1,
    max_depth=3,
    n_estimators=100,
    reg_alpha=0.1,
    reg_lambda=2,
    subsample=1,
    min_child_weight=5,
                    )
xgb.fit(X_bal, y_bal)
y_pred_xgb = xgb.predict(X_test_selected)

print("XGBoost:\n", classification_report(y_test, y_pred_xgb))

XGBoost:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96       127
           1       0.97      0.95      0.96       122

    accuracy                           0.96       249
   macro avg       0.96      0.96      0.96       249
weighted avg       0.96      0.96      0.96       249



Parameters: { "use_label_encoder" } are not used.



In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import StratifiedKFold
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0.5, 1.0, 2.0],
    'min_child_weight': [1, 3, 5]
}

# Create the XGBoost model
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

# Create a custom scorer (F1 score is often good for imbalanced datasets)
scorer = make_scorer(f1_score, average='weighted')

# Set up the grid search
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=scorer,
    cv=StratifiedKFold(3),  # Using stratified KFold for imbalanced data
    n_jobs=-1,  # Use all available cores
    verbose=2  # Show progress
)

# Run the grid search
print("Starting grid search...")
grid_search.fit(X_bal, y_bal)

# Print the best parameters and score
print("\nBest parameters found:")
print(grid_search.best_params_)

Starting grid search...
Fitting 3 folds for each of 19683 candidates, totalling 59049 fits

Best parameters found:
{'colsample_bytree': 0.6, 'gamma': 0.2, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': 2.0, 'subsample': 1.0}


Parameters: { "use_label_encoder" } are not used.



In [10]:
# prompt: I need to save this model by pikle in drive in folder called"GP" and create a folder called "Diabetes" and save it in it

import pickle
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define the path to save the model
model_folder_path = '/content/drive/My Drive/GP/Liver disease'
model_file_path = os.path.join(model_folder_path, 'new_xgboost_model.pkl')

# Create the directories if they don't exist
os.makedirs(model_folder_path, exist_ok=True)

# Save the trained model using pickle
with open(model_file_path, 'wb') as file:
    pickle.dump(xgb, file)

print(f"Model saved successfully to {model_file_path}")

Mounted at /content/drive
Model saved successfully to /content/drive/My Drive/GP/Liver disease/new_xgboost_model.pkl
