In [4]:
import numpy as np
import pandas as pd
import random

In [None]:
class Naive_Bayes:
    def __init__(self, data_set):
        self.ds = data_set
        # Tính trung bình và phương sai của từng đặc trưng theo từng lớp
        self.ds_means = self.ds.groupby('letter').mean()
        self.ds_variances = self.ds.groupby('letter').var()
        self.class_probabilities = self.get_class_probabilities(self.ds)

    def get_class_probabilities(self, data_set):
        class_sizes = data_set.groupby('letter').size()
        ds_total = data_set.shape[0]
        probs = {}
        for label, count in class_sizes.items():
            probs[label] = count / ds_total
        return probs

    def get_probability_density(self, x, mean, variance):
        pd = 1 / (np.sqrt(2 * np.pi * variance)) * np.exp((-(x - mean)**2) / (2 * variance))
        return pd

    def predict(self, x):
        feature_class_probabilities = {}
        for group, class_prob in self.class_probabilities.items():
            feature_class_probabilities[group] = class_prob
            for i in range(len(x)):
                mean = self.ds_means.loc[group][i]
                var = self.ds_variances.loc[group][i]
                feature_class_probabilities[group] *= self.get_probability_density(x[i], mean, var)
        return max(feature_class_probabilities, key=feature_class_probabilities.get)

    def test(self, test_data):
        correct = 0
        total = 0
        for row in test_data.itertuples(index=False):
            feature_set = row[1:]  # bỏ cột đầu tiên là nhãn
            true_label = row[0]
            prediction = self.predict(feature_set)
            if prediction == true_label:
                correct += 1
            else:
                print(feature_set, "prediction=", prediction, "correct=", true_label)
            total += 1
        accuracy = correct / total
        print("Accuracy =", accuracy)

In [6]:
# DATA
data_names = [
    'letter', 'x-box', 'y-box', 'width', 'height', 'onpix', 'x-bar',
    'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy',
    'y-ege', 'yegvx'
]
df = pd.read_csv("letter-recognition.data", names=data_names)
data_set = df.values.tolist()

# Trộn dữ liệu
df = df.sample(frac=1).reset_index(drop=True)

# Tách dữ liệu thành tập huấn luyện và kiểm tra
split_index = int(0.8 * len(df))
train_data = df[:split_index]
test_data = df[split_index:]

# Huấn luyện và đánh giá mô hình
nb = Naive_Bayes(train_data)
nb.test(test_data)

  mean = self.ds_means.loc[group][i]
  var = self.ds_variances.loc[group][i]


(2, 5, 4, 3, 2, 7, 8, 2, 8, 11, 7, 9, 2, 9, 4, 8) prediction= X correct= E
(3, 4, 5, 6, 4, 9, 12, 3, 2, 8, 8, 7, 3, 10, 6, 7) prediction= V correct= H
(7, 13, 6, 7, 4, 7, 8, 2, 7, 11, 6, 7, 3, 8, 5, 5) prediction= X correct= T
(3, 5, 3, 4, 2, 8, 7, 7, 5, 8, 5, 7, 2, 9, 9, 8) prediction= B correct= S
(5, 9, 6, 5, 3, 10, 3, 4, 6, 12, 4, 10, 3, 8, 7, 10) prediction= J correct= Z
(5, 10, 4, 5, 3, 6, 7, 6, 3, 9, 7, 9, 5, 10, 5, 8) prediction= G correct= O
(4, 5, 5, 5, 6, 7, 7, 5, 4, 7, 7, 7, 5, 8, 10, 10) prediction= B correct= S
(5, 6, 6, 8, 9, 9, 8, 6, 3, 7, 7, 8, 6, 10, 7, 4) prediction= W correct= Y
(5, 8, 5, 6, 4, 7, 9, 7, 4, 10, 7, 6, 4, 9, 4, 8) prediction= P correct= O
(10, 14, 8, 8, 4, 8, 10, 6, 5, 6, 10, 5, 6, 13, 4, 7) prediction= Y correct= V
(5, 6, 7, 4, 3, 5, 9, 3, 5, 11, 9, 9, 5, 8, 1, 7) prediction= U correct= N
(6, 10, 8, 7, 8, 8, 8, 7, 5, 7, 5, 8, 5, 9, 7, 12) prediction= B correct= R
(6, 10, 8, 8, 9, 8, 7, 7, 5, 6, 5, 8, 10, 8, 10, 12) prediction= B correct= M
(5, 9, 6, 7