In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Load data
url = "https://raw.githubusercontent.com/thieu1995/csv-files/main/data/pandas/hepatitis.csv"
df = pd.read_csv(url)

# Step 2: Prepare X, y
# The target is separated first so the feature column lists below can never
# contain it.
X = df.drop(columns=['class'])
y = df['class']

num_cols = X.select_dtypes(include=[np.number]).columns
cat_cols = X.select_dtypes(exclude=[np.number]).columns

# Train-test split (stratified on the target, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Build pipeline
# Imputation lives INSIDE the pipeline so that the fill values (mean /
# most frequent) are learned from the training rows only. Fitting the
# imputers on the full DataFrame before splitting would leak test-set
# statistics into training and bias the evaluation.
# - numeric columns:     mean imputation -> StandardScaler
# - categorical columns: most-frequent imputation -> OneHotEncoder
numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='mean')),
    ("scaler", StandardScaler()),
])

categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(drop='first')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", categorical_steps, cat_cols),
    ],
    # Always return a dense array so the NaN check below can use np.isnan.
    sparse_threshold=0,
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# Step 4: Train model (fits imputers, scaler, encoder and classifier on
# the training split only)
pipeline.fit(X_train, y_train)

# Sanity check: count NaNs left in the preprocessed train and test features.
fitted_preprocessor = pipeline.named_steps["preprocessor"]
remaining_nan = sum(
    int(np.isnan(fitted_preprocessor.transform(part)).sum())
    for part in (X_train, X_test)
)
print("Số lượng NaN sau khi xử lý:", remaining_nan)

# Step 5: Predict & Evaluate
y_pred = pipeline.predict(X_test)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nAccuracy:", accuracy_score(y_test, y_pred))



Số lượng NaN sau khi xử lý: 0

Confusion Matrix:
[[ 3  3]
 [ 2 23]]

Classification Report:
              precision    recall  f1-score   support

         die       0.60      0.50      0.55         6
        live       0.88      0.92      0.90        25

    accuracy                           0.84        31
   macro avg       0.74      0.71      0.72        31
weighted avg       0.83      0.84      0.83        31


Accuracy: 0.8387096774193549
