## 樸素貝氏分類

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 
# sklearn.feature_extraction.text 提供文字特徵抽取工具

from sklearn.naive_bayes import MultinomialNB
# sklearn.naive_bayes 提供多項式樸素貝氏分類器

In [None]:
# Spam examples (these receive label 1 below).
spam_emails = [
    "Get a free gift card now!",
    "Limited time offer: Claim your prize!",
    "You have won a free iPhone!",
]

# Ham (legitimate mail) examples (these receive label 0 below).
ham_emails = [
    "Meeting rescheduled for tomorrow",
    "Can we discuss the report later?",
    "Thank you for your prompt reply",
]

# Build the corpus with spam first and ham second, plus a label list that is
# aligned with it position by position: 1 = spam, 0 = ham.
emails = [*spam_emails, *ham_emails]
labels = [1 for _ in spam_emails] + [0 for _ in ham_emails]


In [None]:
# Convert the email texts into a numeric count matrix with CountVectorizer;
# the vocabulary is learned from `emails` in the same call.
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(emails)

# Train a Multinomial Naive Bayes model on the count matrix.
# fit() returns the fitted estimator, so it can be chained onto the constructor.
classifier = MultinomialNB().fit(features, labels)

# Show the converted numeric matrix in dense form (one row per email).
# Debugging aids: print(features) shows the un-densified matrix, and
# print(classifier.predict(features)) shows predictions on the training emails.
print(features.toarray())

[[0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 1]]


In [4]:

# Try the trained classifier on an unseen message.
print("測試分類器與結果")
test_email = ["Claim your free gift now"]
# Reuse the already-fitted vectorizer so the columns match the training matrix.
test_features = vectorizer.transform(test_email)

# predict() yields one label per input row; a truthy label (1) is reported as
# spam and a falsy one (0) as not spam.
predicted_label = classifier.predict(test_features)[0]
verdict = '垃圾郵件' if predicted_label else '非垃圾郵件'
print(f"郵件: '{test_email[0]}' 分類結果 : {verdict}")


測試分類器與結果
郵件: 'Claim your free gift now' 分類結果 : 垃圾郵件


In [5]:

# Second check: a message that shares words with the ham examples.
test_email = ["Can we discuss your decision tomorrow"]
# Reuse the already-fitted vectorizer so the columns match the training matrix.
test_features = vectorizer.transform(test_email)

# predict() yields one label per input row; a truthy label (1) is reported as
# spam and a falsy one (0) as not spam.
predicted_label = classifier.predict(test_features)[0]
verdict = '垃圾郵件' if predicted_label else '非垃圾郵件'
print(f"郵件: '{test_email[0]}' 分類結果 : {verdict}")



郵件: 'Can we discuss your decision tomorrow' 分類結果 : 非垃圾郵件
