In [2]:
# Import necessary libraries
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder



In [3]:
# Step 1: Load the dataset
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../Data/liver_cirrhosis.csv')

# Confirm the load and list the available columns (banner, then the column index).
print("Dataset loaded successfully. Columns are:", df.columns, sep="\n")


Dataset loaded successfully. Columns are:
Index(['N_Days', 'Status', 'Drug', 'Age', 'Sex', 'Ascites', 'Hepatomegaly',
       'Spiders', 'Edema', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
       'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin',
       'Stage'],
      dtype='object')


In [4]:
# Convert 'Age' from days to years if applicable
# The conversion overwrites the column in place, so running this cell a second
# time would divide already-converted ages by 365 again. Guard on the value
# range: only convert while the column still holds values too large to be
# ages in years.
# NOTE(review): assumes 'Age' contains no missing values; `.astype(int)`
# raises on NaN — confirm against the dataset.
DAYS_PER_YEAR = 365
MAX_PLAUSIBLE_AGE_YEARS = 150
if 'Age' in df.columns and df['Age'].max() > MAX_PLAUSIBLE_AGE_YEARS:
    # Truncate towards zero to get whole completed years.
    df['Age'] = (df['Age'] / DAYS_PER_YEAR).astype(int)


In [5]:
# Step 2: Separate the prediction target from the input features
y = df['Stage']                  # Target variable
X = df.drop(columns=['Stage'])   # Features: every column except the target


In [6]:
# Step 3: Define the types of features
# Classify each column by asking "is this dtype numeric?" instead of comparing
# against the object dtype. Text columns are not always stored as object
# (e.g. pandas' dedicated string dtype, or `category`), and such columns must
# go to the encoder rather than the scaler.
# Numeric features
numeric_features = [
    col for col in X.columns if pd.api.types.is_numeric_dtype(X[col])
]
# Categorical features: everything that is not numeric
categorical_features = [
    col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col])
]

print("\nNumeric Features:", numeric_features)
print("Categorical Features:", categorical_features)



Numeric Features: ['N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']
Categorical Features: ['Status', 'Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema']


In [7]:
# Step 4: Preprocessing
# Categorical columns are one-hot encoded; handle_unknown='ignore' keeps
# transform from raising on categories that were not present during fit.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
# Numeric columns are standardized.
numeric_transformer = StandardScaler()


In [8]:

# Route each group of columns to its transformer.
# Each entry is (name, transformer, columns the transformer is applied to).
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),          # scaling for numeric columns
    ("cat", categorical_transformer, categorical_features),  # one-hot encoding for categorical columns
])

In [9]:

# Step 5: Create the pipeline
# Chaining preprocessing and the estimator means the transformers are fitted
# only on whatever data is passed to `pipeline.fit`.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Fixed random_state so repeated runs build the same forest.
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [10]:
# Step 6: Split the dataset into training and testing sets
# 25% of the rows are held out; stratifying on y keeps the class proportions
# the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)


In [11]:

# Step 7: Train the model using the pipeline
# Fits the preprocessing transformers and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)


In [12]:
# Step 8: Make predictions
# The held-out rows go through the same fitted preprocessing before classification.
y_pred = pipeline.predict(X=X_test)


In [13]:
# Step 9: Evaluate the model
def evaluate_model(true, predicted):
    """Print the confusion matrix, accuracy and classification report.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Predicted labels, aligned element-wise with ``true``.

    Returns
    -------
    None
        The metrics are written to standard output only.
    """
    matrix = confusion_matrix(true, predicted)
    accuracy = accuracy_score(true, predicted)
    report = classification_report(true, predicted)

    print("Confusion Matrix:\n", matrix)
    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:\n", report)

# Report performance on the held-out test split
evaluate_model(true=y_test, predicted=y_pred)

Confusion Matrix:
 [[1940   89   37]
 [  63 2011   36]
 [  22   47 2005]]
Accuracy: 0.95

Classification Report:
               precision    recall  f1-score   support

           1       0.96      0.94      0.95      2066
           2       0.94      0.95      0.94      2110
           3       0.96      0.97      0.97      2074

    accuracy                           0.95      6250
   macro avg       0.95      0.95      0.95      6250
weighted avg       0.95      0.95      0.95      6250



In [14]:
# Save the model
# The whole fitted pipeline (preprocessor + classifier) is persisted as one
# artifact, so scaling and encoding travel with the model.
# Gotcha: the 'Age' days-to-years conversion is applied to `df` before the
# pipeline, so it is NOT part of the saved artifact; code that loads this
# file must apply the same conversion before calling predict.
# `joblib` is imported with the other libraries in the import cell.
model_filename = "liver_cirrhosis_pipeline_model.pkl"
joblib.dump(pipeline, model_filename)
print(f"\nTrained model saved as '{model_filename}'.")



Trained model saved as 'liver_cirrhosis_pipeline_model.pkl'.
