In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from scipy import stats

# Load dataset
df = pd.read_csv(r'E:\DSBDAL\DSBDALExam DataSets\DSBDALExam DataSets\Hepatitis\hepatitis.csv')

# Data cleaning
df.replace(['?', ' ', 'NA'], np.nan, inplace=True)
df.dropna(inplace=True)
numeric_cols = df.select_dtypes(include=[np.number]).columns
df = df[(df[numeric_cols] >= 0).all(axis=1)]

# Error correcting (Outlier removal)
z_scores = stats.zscore(df.select_dtypes(include=[np.number]))
abs_z_scores = np.abs(z_scores)
df = df[(abs_z_scores < 3).all(axis=1)]

# Convert boolean columns to integers
bool_cols = df.select_dtypes(include=['bool']).columns
df[bool_cols] = df[bool_cols].astype(int)

# Data transformation
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])

# Encode target variable and save class names
target_encoder = LabelEncoder()
df['class'] = target_encoder.fit_transform(df['class'])
class_names = target_encoder.classes_

# Split features and target
X = df.drop('class', axis=1)
y = df['class']

# Scale numeric features
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Naïve Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_pred = nb.predict(X_test)

# Print results
print("Logistic Regression:")
print(f"Accuracy: {accuracy_score(y_test, lr_pred):.2f}")
print(classification_report(y_test, lr_pred, target_names=class_names))

print("\nNaïve Bayes:")
print(f"Accuracy: {accuracy_score(y_test, nb_pred):.2f}")
print(classification_report(y_test, nb_pred, target_names=class_names))

Logistic Regression:
Accuracy: 0.87
              precision    recall  f1-score   support

         die       1.00      0.25      0.40         4
        live       0.86      1.00      0.93        19

    accuracy                           0.87        23
   macro avg       0.93      0.62      0.66        23
weighted avg       0.89      0.87      0.84        23


Naïve Bayes:
Accuracy: 0.91
              precision    recall  f1-score   support

         die       0.67      1.00      0.80         4
        live       1.00      0.89      0.94        19

    accuracy                           0.91        23
   macro avg       0.83      0.95      0.87        23
weighted avg       0.94      0.91      0.92        23

