## 1. Import thư viện và nạp dữ liệu vào notebook

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Read the mushroom dataset from the working directory, then preview the
# first five rows as the cell's rich output.
df = pd.read_csv(filepath_or_buffer="mushrooms.csv")
df.head(n=5)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


## 2. Kiểm tra dữ liệu (kích thước, phân bố nhãn, giá trị thiếu)

In [3]:
# Name of the label column; later cells split features/target using it.
target_col = "class"

# Sanity overview before encoding: dataset size, class balance, NaN counts.
print("Shape:", df.shape)
print("\nTarget counts:\n", df[target_col].value_counts())
print("\nMissing values:\n", df.isnull().sum())

# isnull() only detects real NaN cells, so a file that encodes missing data
# with a placeholder string reports zero missing values above.
# NOTE(review): the UCI Mushroom documentation says missing `stalk-root`
# values are coded as "?" - confirm this CSV keeps that convention. Columns
# listed here contain that placeholder; an empty result means none were found.
placeholder_counts = (df == "?").sum()
print("\n'?' placeholder counts:\n", placeholder_counts[placeholder_counts > 0])


Shape: (8124, 23)

Target counts:
 class
e    4208
p    3916
Name: count, dtype: int64

Missing values:
 class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64


## 3. Mã hóa dữ liệu

In [4]:
# Separate the label column from the feature columns.
y = df[target_col]
X = df.drop(target_col, axis=1)

# Target: map each class label to an integer code; `le.classes_` keeps the
# original labels for reporting later.
le = LabelEncoder().fit(y)
y_enc = le.transform(y)

# Features: map every categorical value to an integer code per column,
# stored as int64.
oe = OrdinalEncoder(dtype=np.int64).fit(X)
X_enc = oe.transform(X)

print("Encoded shape:", X_enc.shape)


Encoded shape: (8124, 22)


## 4. Chia tập train/test và xây dựng mô hình

In [5]:
# Hold out 20% of the rows for testing; stratify on the encoded label so both
# splits keep the same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# CategoricalNB sizes its per-feature probability tables from the largest
# category code seen during fit. A category that only occurs in the test split
# would then index past the table and make predict() fail. The encoder was
# fitted on the full feature table, so its per-column category counts are
# passed as `min_categories` to reserve a (smoothed) slot for every category.
n_categories = [len(categories) for categories in oe.categories_]
model = CategoricalNB(min_categories=n_categories)
model.fit(X_train, y_train)

print("Model trained.")


Model trained.


## 5. Đánh giá mô hình

In [6]:
# Predict on the held-out split and compute every metric up front.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred, target_names=le.classes_)
conf_matrix = confusion_matrix(y_test, y_pred)

# Overall accuracy followed by per-class precision / recall / F1.
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(report_text)

# Rows are true classes, columns are predicted classes, both in the order of
# le.classes_.
print("Confusion Matrix:\n")
print(conf_matrix)


Accuracy: 0.9458461538461539

Classification Report:

              precision    recall  f1-score   support

           e       0.91      0.99      0.95       842
           p       0.99      0.90      0.94       783

    accuracy                           0.95      1625
   macro avg       0.95      0.94      0.95      1625
weighted avg       0.95      0.95      0.95      1625

Confusion Matrix:

[[835   7]
 [ 81 702]]


## 6. Cross-Validation để đánh giá độ ổn định

In [7]:
# 5-fold stratified CV with shuffling and a fixed seed so the folds are
# reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Each fold refits the estimator on a subset of rows. If a rare category is
# missing from a fold's training part, CategoricalNB's tables would be too
# small for the validation part and scoring would fail. Reuse the
# hyper-parameters of `model`, but reserve one slot per category known to the
# encoder via `min_categories`.
cv_params = {
    **model.get_params(),
    "min_categories": [len(categories) for categories in oe.categories_],
}
cv_model = CategoricalNB(**cv_params)
scores = cross_val_score(cv_model, X_enc, y_enc, cv=cv, scoring='accuracy')

print("Cross-Validation scores:", scores)
print("Mean CV:", scores.mean())
# The spread across folds is the actual stability signal this section is after.
print("Std CV:", scores.std())


Cross-Validation scores: [0.95015385 0.95876923 0.95569231 0.94092308 0.95812808]
Mean CV: 0.9527333080712392
