In [1]:
import numpy as np
from collections import Counter

In [2]:
class NaiveBayes(object):
    """Naive Bayes text classifier over binary word-presence features.

    Each document is reduced to the set of vocabulary words it contains.
    For every class the model stores a Laplace-smoothed prior and, per
    vocabulary word, a Laplace-smoothed estimate of the probability that a
    document of that class contains the word.  Prediction scores a document
    using only the vocabulary words that are present in it.
    """

    def __init__(self, alpha=1):
        """Create an unfitted classifier.

        Args:
            alpha: Laplace (additive) smoothing constant; must be >= 0.

        Raises:
            ValueError: if ``alpha`` is negative.
        """
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha
        self.word_list = []       # vocabulary, in first-seen order
        self.p_yk = {}            # class -> smoothed prior P(y = k)
        self.p_xy = {}            # class -> array of P(word present | y = k)
        self._word_index = {}     # word -> position in word_list (O(1) lookup)

    def fit(self, data, label):
        """Estimate priors and per-word likelihoods from labelled documents.

        Any state left over from a previous call is discarded first, so the
        model always reflects only the most recent training set.

        Args:
            data: sequence of documents, each an iterable of hashable tokens.
            label: sequence of class labels, one per document.

        Raises:
            ValueError: if ``data`` and ``label`` differ in length.
        """
        if len(data) != len(label):
            raise ValueError("data and label must have the same length")

        # dict.fromkeys de-duplicates while keeping first-seen order, which
        # makes the vocabulary layout reproducible from run to run.
        self.word_list = list(
            dict.fromkeys(word for document in data for word in document)
        )
        self._word_index = {word: i for i, word in enumerate(self.word_list)}
        self.p_yk = {}
        self.p_xy = {}

        wordList_len = len(self.word_list)
        class_counts = Counter(label)
        label_len = len(label)
        n_classes = len(class_counts)

        for k, v in class_counts.items():
            # Smoothed prior: (N_k + alpha) / (N + K * alpha).
            self.p_yk[k] = (v + self.alpha) / (label_len + n_classes * self.alpha)
            # Seed every per-word document count with the smoothing term.
            self.p_xy[k] = np.full(wordList_len, float(self.alpha))

        # Accumulate, per class, how many documents contain each word.
        for document, k in zip(data, label):
            self.p_xy[k] += self.__word2Vec(document, wordList_len)

        # A word is either present or absent (two outcomes), so the smoothed
        # denominator is N_k + 2 * alpha.
        for k, counts in self.p_xy.items():
            counts /= class_counts[k] + 2 * self.alpha

    def predict(self, input_word):
        """Return the most probable class label for one document.

        Scores are accumulated in log space so that long documents do not
        underflow to zero.  Tokens outside the training vocabulary are
        ignored.  Ties are resolved in favour of the class seen first during
        ``fit``.

        Args:
            input_word: iterable of tokens making up the document.

        Returns:
            The class label with the highest score.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if not self.p_xy:
            raise ValueError("model has not been fitted")

        present = self.__word2Vec(input_word, len(self.word_list)).astype(bool)
        res_dict = {}
        # With alpha == 0 a likelihood can be exactly 0; log(0) = -inf is the
        # intended score there, so silence numpy's divide warning.
        with np.errstate(divide='ignore'):
            for k, likelihood in self.p_xy.items():
                res_dict[k] = np.log(self.p_yk[k]) + np.log(likelihood[present]).sum()
        return max(res_dict, key=res_dict.get)

    def __word2Vec(self, input_word, wordList_len):
        """Encode a document as a 0/1 presence vector over the vocabulary.

        Args:
            input_word: iterable of tokens.
            wordList_len: length of the returned vector (vocabulary size).

        Returns:
            A float numpy array of length ``wordList_len`` holding 1 at the
            position of every vocabulary word found in ``input_word``.
        """
        return_vec = np.zeros(wordList_len)
        for word in input_word:
            idx = self._word_index.get(word)
            if idx is not None:
                return_vec[idx] = 1
        return return_vec


In [3]:
# Toy training corpus: one tokenised post per row.  Stop-word filtering and
# other preprocessing steps are left out of this example.
data = [
    ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
    ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
    ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
    ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
    ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
    ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ['shit', 'fuck'],
]
# One class per post, aligned with `data`; 1 marks abusive language.
label = [0, 1, 0, 1, 0, 1, 1]

# Two samples to classify once the model is trained.
test_positiveData = ['love', 'my', 'dalmation']
test_negtiveData = ['he', 'is', 'stupid', 'garbage']

nb = NaiveBayes()
nb.fit(data, label)

# Turn each predicted class id into the display string used below.
res0, res1 = (
    '0(positive)' if nb.predict(sample) == 0 else '1(negtive)'
    for sample in (test_positiveData, test_negtiveData)
)

print(test_positiveData, 'classified as:', res0)
print(test_negtiveData, 'classified as:', res1)

['love', 'my', 'dalmation'] classified as: 0(positive)
['he', 'is', 'stupid', 'garbage'] classified as: 1(negtive)
