In [163]:
import os 
import numpy as np

def loadDataSet():
    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him','my'],
                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = ['a','b','a','b','a','b']    # 1 is abusive, 0 not
    return postingList,classVec

In [164]:
class NaiveBayes:
    def __init__(self):
        self.vocabulary = []     # 存储词汇表
        self.vocabulary_length = 0
        self.tf = 0
        self.idf = 0
        self.tdm = {}             # 存储 P(x|yi) 
        self.labels = []
        self.doc_length = 0
        self.Pcategory = {}
        
    def fit(self,DataSet,labels):
        self.doc_length = len(DataSet)
        vocabulary_set = set()
        [vocabulary_set.add(word) for doc in DataSet for word in doc]
        self.vocabulary = list(vocabulary_set)
        print(self.vocabulary)
        self.vocabulary_length = len(self.vocabulary)
        self.labels = labels
        self.CalcPcategory()
        self.CalcWordFreq(DataSet)
        self.Build_tdm()
    
    def CalcPcategory(self):
        category_set = set(self.labels)
        for label in category_set:
            self.Pcategory[label] = self.labels.count(label)/len(self.labels)
    
    def CalcWordFreq(self,DataSet):
        self.tf = np.zeros([self.doc_length,self.vocabulary_length])
        self.idf = np.zeros([1,self.vocabulary_length])
        
        for i in range(self.doc_length):
            for j in range(self.vocabulary_length):
                self.tf[i,j] = (DataSet[i].count(self.vocabulary[j]) + 0.1)/(len(DataSet[i]) + 0.1)
            for word in set(DataSet[i]):
                self.idf[0,self.vocabulary.index(word)] += 1
                
        self.idf = np.log(float(self.doc_length)/self.idf)
        self.tf = np.multiply(self.tf,self.idf)  # 矩阵与向量的点乘 tf x idf
    
    def Build_tdm(self):
        sumtf ={}
        for label in self.Pcategory:
            self.tdm[label] = np.zeros([1,self.vocabulary_length])
            sumtf[label] = np.zeros([1,1])
        for i in range(self.doc_length):
            self.tdm[self.labels[i]] += self.tf[i] # 将同一类别的词向量空间值加总
            # 统计每个分类的总值--是个标量
            sumtf[self.labels[i]]= np.sum(self.tdm[self.labels[i]])
        for label in self.tdm:
            self.tdm[label] /= sumtf[label]
        
    def predict(self,TestSet):
        test_tf = np.zeros([len(TestSet),self.vocabulary_length])
        for i in range(len(TestSet)):
            for j in range(self.vocabulary_length):
                test_tf[i,j] = TestSet[i].count(self.vocabulary[j])
       
        predclass = [] # 初始化类别名称
        for i in range(len(test_tf)):
            predvalue = -float('inf') # 初始化类别概率 
            predclass_temp = ''
            for keyclass in self.Pcategory.keys():
                # P(x|yi) P(yi)
                temp = np.sum(test_tf[i]*np.log(self.tdm[keyclass]) + np.log(self.Pcategory[keyclass])) # 变量tdm，计算最大分类值
                if temp > predvalue:
                    predvalue = temp
                    predclass_temp = keyclass
            predclass.append(predclass_temp)
        return predclass

In [165]:
dataset, labels = loadDataSet()
print(dataset)
print(labels)

[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him', 'my'], ['stop', 'posting', 'stupid', 'worthless', 'garbage'], ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
['a', 'b', 'a', 'b', 'a', 'b']


In [166]:
nb = NaiveBayes()
nb.fit(dataset,labels)

['stop', 'please', 'so', 'buying', 'steak', 'dog', 'garbage', 'problems', 'how', 'flea', 'ate', 'has', 'help', 'him', 'park', 'not', 'take', 'worthless', 'dalmation', 'my', 'cute', 'licks', 'food', 'posting', 'is', 'maybe', 'mr', 'I', 'to', 'stupid', 'love', 'quit']


In [167]:
a = nb.predict(dataset)
print(a)

['a', 'b', 'a', 'b', 'a', 'b']
