In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load data
df = pd.read_csv('/Users/akshay/Desktop/dsbda_practical/DSBDALExam DataSets/DSBDALExam DataSets/Hepatitis/hepatitis.csv', header=None)

# Assign column names
df.columns = [
    "Class", "AGE", "SEX", "STEROID", "ANTIVIRALS", "FATIGUE", "MALAISE",
    "ANOREXIA", "LIVER_BIG", "LIVER_FIRM", "SPLEEN_PALPABLE", "SPIDERS",
    "ASCITES", "VARICES", "BILIRUBIN", "ALK_PHOSPHATE", "SGOT", "ALBUMIN",
    "PROTIME", "HISTOLOGY"
]

# Replace "?" with NaN
df.replace("?", np.nan, inplace=True)

# Convert all columns to numeric where possible
df = df.apply(pd.to_numeric, errors='coerce')

# q. Data Cleaning: remove negative values and handle missing values
df[df < 0] = np.nan  # Remove negative values
imputer = SimpleImputer(strategy='mean')
df_cleaned = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# r. Error correcting - Outlier detection using IQR
def remove_outliers_iqr(dataframe, cols):
    for col in cols:
        Q1 = dataframe[col].quantile(0.25)
        Q3 = dataframe[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        dataframe = dataframe[(dataframe[col] >= lower) & (dataframe[col] <= upper)]
    return dataframe

outlier_cols = ["AGE", "BILIRUBIN", "ALK_PHOSPHATE", "SGOT", "ALBUMIN", "PROTIME"]
df_no_outliers = remove_outliers_iqr(df_cleaned, outlier_cols)

# s. Data transformation: Standard scaling
features = df_no_outliers.drop("Class", axis=1)
target = df_no_outliers["Class"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, target, test_size=0.3, random_state=42)

# t. Build Models

# Logistic Regression
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)
log_acc = accuracy_score(y_test, y_pred_log)

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, y_pred_nb)

# Print accuracy comparison
print("\nLogistic Regression Accuracy:", log_acc)
print("Classification Report:\n", classification_report(y_test, y_pred_log))

print("\nNaive Bayes Accuracy:", nb_acc)
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

# Final conclusion
if log_acc > nb_acc:
    print("\n✅ Logistic Regression performed better.")
elif nb_acc > log_acc:
    print("\n✅ Naive Bayes performed better.")
else:
    print("\n✅ Both models performed equally well.")


Logistic Regression Accuracy: 0.9130434782608695
Classification Report:
               precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         2
         2.0       0.91      1.00      0.95        21

    accuracy                           0.91        23
   macro avg       0.46      0.50      0.48        23
weighted avg       0.83      0.91      0.87        23


Naive Bayes Accuracy: 0.6086956521739131
Classification Report:
               precision    recall  f1-score   support

         1.0       0.11      0.50      0.18         2
         2.0       0.93      0.62      0.74        21

    accuracy                           0.61        23
   macro avg       0.52      0.56      0.46        23
weighted avg       0.86      0.61      0.69        23


✅ Logistic Regression performed better.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
