In [28]:
"""
@ Author:         21140678-姜楠（小组成员：姜楠(21140678)、王勃栋(21140701)、李甜(21140684)）
@ Create Date:    2021-12-01
@ Requirements:
1, Implmenting the naïve bayes classifier without using sk-learn.
2, Implementation need support both discrete and continuous features, and Gaussian, Multinomial, and Bernoulli models required.
3, Providing prediction probability is required 
4, classifier evaluation required.
5, Compare your implementation with Sk-learn regressor.
"""
import pandas as pd
import numpy as np
import math


# 定义gaussion model
class Gaussion:
    def separate_by_classes(self, X, y):
#         分类区分
        self.classes = np.unique(y)
        classes_index = {}
        subdatasets = {}
        cls, counts = np.unique(y, return_counts=True)
        self.class_freq = dict(zip(cls, counts))
#         print(self.class_freq)
        for class_type in self.classes:
            classes_index[class_type] = np.argwhere(y==class_type)
            subdatasets[class_type] = X[classes_index[class_type], :]
            self.class_freq[class_type] = self.class_freq[class_type]/sum(list(self.class_freq.values()))
        return subdatasets
    
    def fit(self, X, y):
        separated_X = self.separate_by_classes(X, y)
        self.means = {}
        self.std = {}
        for class_type in self.classes:
#             计算mean  标准偏离
            self.means[class_type] = np.mean(separated_X[class_type], axis=0)[0]
            self.std[class_type] = np.std(separated_X[class_type], axis=0)[0]
            
    def calculate_probability(self, x, mean, stdev):
        exponent = math.exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
    
    def predict_proba(self, X):
        self.class_prob = {cls:math.log(self.class_freq[cls], math.e) for cls in self.classes}
        for cls in self.classes:
            for i in range(len(self.means)):
#                 print(X[i])
                self.class_prob[cls]+=math.log(self.calculate_probability(X[i], self.means[cls][i], self.std[cls][i]), math.e)
        self.class_prob = {cls: math.e**self.class_prob[cls] for cls in self.class_prob}
        return self.class_prob
    
    def predict(self, X):
        pred = []
        for x in X:
            pred_class = None
            max_prob = 0
            for cls, prob in self.predict_proba(x).items():
                if prob>max_prob:
                    max_prob = prob
                    pred_class = cls
            pred.append(pred_class)
        return pred

# 定义Multinomial model
class Multinomial:
    def __init__(self, k=0.5):
        self.k = k
        self.cat0_count = 0
        self.cat1_count = 0
        self.total_count = self.cat0_count + self.cat1_count
        self.cat_0_prior = 0
        self.cat_1_prior = 0
        self.cat_0_prior, self.cat_1_prior
        self.word_probs = []
        self.vocab = []

    def tokenize(self, document):
        doc = document.lower()
        tokens = ""
        for char in doc:
            tokens += char
        return tokens.split()

    def count_words(self, X, y):
        counts = {}
        for document, category in zip(X, y):
            for token in self.tokenize(document):
                if token not in counts:
                    counts[token] = [0,0]
                counts[token][category] += 1
        return counts

    def prior_prob(self, counts):

        cat0_word_count = cat1_word_count = 0
        for word, (cat0_count, cat1_count) in counts.items():
            cat0_word_count += cat0_count
            cat1_word_count += cat1_count

        self.cat0_count = cat0_word_count
        self.cat1_count = cat1_word_count
        self.total_count = self.cat0_count + self.cat1_count

        cat_0_prior = cat0_word_count / self.total_count
        cat_1_prior = cat1_word_count / self.total_count
        return cat_0_prior, cat_1_prior

    def word_probabilities(self, counts):
       
        self.vocab = [word for word, (cat0, cat1) in counts.items()]
        return [(word,
        (cat0 + self.k) / (self.cat0_count + 2 * self.k),
        (cat1 + self.k) / (self.cat1_count + 2 * self.k))
        for word, (cat0, cat1) in counts.items()]

    def fit(self, X, y):
        counts = self.count_words(X, y)
        self.cat_0_prior, self.cat_1_prior = self.prior_prob(counts)
        self.word_probs = self.word_probabilities(counts)

    def predict(self, test_corpus):
        
        y_pred = []
        for document in test_corpus:
            log_prob_cat0 = log_prob_cat1 = 0.0
            tokens = self.tokenize(document)
            
            for word, prob_cat0, prob_cat1 in self.word_probs:
                if word in tokens:
                    log_prob_cat0 += np.log(prob_cat0)
                    log_prob_cat1 += np.log(prob_cat1)
            cat_0_pred = self.cat_0_prior * np.exp(log_prob_cat0)
            cat_1_pred = self.cat_1_prior * np.exp(log_prob_cat1)
            if cat_0_pred >= cat_1_pred:
                y_pred.append(0)
            else:
                y_pred.append(1)
        return y_pred

import re
from math import log
import glob
from collections import Counter

# 定义Bernoulli model
class Bernoulli:

    def __init__(self):
        self._log_priors = None
        self._cond_probs = None
        self.features = None
    
    def fit(self, docs, labels):
       
        label_counts = Counter(labels)
        N = float(sum(label_counts.values()))
        self._log_priors = {k: log(v/N) for k, v in label_counts.items()}
#         print("#####docs",docs)
        # 抽取 features 
        X = [set(self.get_features(d)) for d in docs]
        # get all features
        self.features = set([f for features in X for f in features])

        #log( P(X|Y) )
        #n1 + 1 / (n1 + n2 + 2)
        self._cond_probs = {l: {f: 0. for f in self.features} for l in self._log_priors}

        
        for x, l in zip(X, labels):
            for f in x:
                self._cond_probs[l][f] += 1.

        # 计算 log probs
        for l in self._cond_probs:
            N = label_counts[l]
            self._cond_probs[l] = {f: (v + 1.) / (N + 2.) for f, v in self._cond_probs[l].items()}

    def predict(self, docs):
       
        X = [set(self.get_features(d)) for d in docs]
        y=[]
        for x in X:
            pred_class = None
            max_ = float("-inf")

            # Perform MAP estimation
            for l in self._log_priors:
                log_sum = self._log_priors[l]
                for f in self.features:
                    prob = self._cond_probs[l][f]
                    log_sum += log(prob if f in x else 1. - prob)
                if log_sum > max_:
                    max_ = log_sum
                    pred_class = l
            y.append(pred_class)
        return y

    #       转换特征
    def get_features(self,text):
        
        return set([w.lower() for w in text.split(" ")])
    

In [29]:
#数据集准备
from sklearn import datasets 
from sklearn.model_selection import train_test_split

# gaussian 使用sklearn iris数据集
X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
# X_train

In [30]:
# 使用sklearn GaussianNB模型预测
from sklearn.naive_bayes import GaussianNB,BernoulliNB ,MultinomialNB

#GaussianNB
skg=GaussianNB()
skg.fit(X_train,y_train)
skg_pre = skg.predict(X_test)


In [31]:
# 使用编写的 GaussianNB模型预测
g=Gaussion()
g.fit(X_train,y_train)
g_pre = g.predict(X_test)


In [32]:
# multiminol 使用 邮件数据集
from sklearn.feature_extraction.text import CountVectorizer

spam = pd.read_csv("spam.csv")
dummies = pd.get_dummies(spam.label)
spam = pd.concat([spam,dummies],axis="columns")
spam = spam.drop(["label","ham"],axis="columns")
print(spam.groupby("spam").describe())

# print(spam)
m_X_train, m_X_test, m_y_train, m_y_test = train_test_split(spam["text"], spam["spam"], test_size=0.5, random_state=0)
# v=CountVectorizer(analyzer='word',ngram_range=(2,2))
v=CountVectorizer(stop_words='english')

m_X_train_T=v.fit_transform(m_X_train.values)
m_X_test_T=v.transform(m_X_test.values)

# print(m_X_train_T.toarray()[:1])
# v.get_feature_names()


      text                                                               
     count unique                                                top freq
spam                                                                     
0     4825   4516                             Sorry, I'll call later   30
1      747    653  Please call our customer service representativ...    4


In [33]:
# MultinomialNB

skm = MultinomialNB()
skm.fit(m_X_train_T, m_y_train)
skm_pre = skm.predict(m_X_test_T);

# skm.score(m_X_train_T,m_y_test)
# skm_pre.reshape(1,3)
# m_X_test_T.toarray()

In [34]:
#使用自编Multinomial
m=Multinomial()
m.fit(m_X_train, m_y_train)
m_pre = m.predict(m_X_test)
# m_pre


In [35]:
# bernoulli数据集，使用multiminal转换
v=CountVectorizer(stop_words='english', binary=True)

b_X_train_T=v.fit_transform(m_X_train.values).toarray()
b_X_test_T=v.transform(m_X_test.values).toarray()
b_y_train=m_y_train
# print(b_X_train_T)

In [36]:
# skleran 实现，此处使用multiminal数据集
skb = BernoulliNB()
skb.fit(m_X_train_T, b_y_train)
skb_pre = skb.predict(b_X_test_T)


In [39]:
#自编bernuoulli
alpha = 1
b = Bernoulli()
b.fit(m_X_train,m_y_train)
# b.predict(m_X_test)
b_pre = b.predict(m_X_test)


In [40]:
from sklearn import metrics
def evaluate(typename,actual, pred):
    m_accuracy_score = metrics.accuracy_score(actual,pred)
    print(typename,' accuracy_score:{0:.3f}'.format(m_accuracy_score))

# 对比分析
print("sk GaussianNB predict:",skg_pre[:50])
print("自编 Gaussion predict:",t[:50])
evaluate("sk GaussianNB",y_train,skg_pre)
evaluate("自编 GaussianNB",y_train,g_pre)


print('\n')
print("sk MultinomialNB predict:",skm_pre[:50])
print("自编 Multinomial predict:",m_pre[:50])
evaluate("sk MultinomialNB",m_y_train,skm_pre)
evaluate("自编 MultinomialNB",m_y_train,m_pre)

print('\n')
print("sk BernoulliNB predict:",skb_pre[:50])
print("自编 Bernoulli predict:",b_pre[:50])
evaluate("sk BernoulliNB",m_y_train,skb_pre)
evaluate("自编 BernoulliNB",m_y_train,b_pre)


sk GaussianNB predict: [2 1 0 2 0 2 0 1 1 1 1 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 1 1 1 2 0 2 0 0 1 2 2 1 2]
自编 Gaussion predict: [1 1 0 2 0 2 0 2 2 1 2 1 1 1 1 0 1 1 0 0 1 1 0 0 1 0 0 1 1 0 2 1 0 2 2 1 0
 1 1 1 2 0 2 0 0 1 2 2 1 2]
sk GaussianNB  accuracy_score:0.320
自编 GaussianNB  accuracy_score:0.333


sk MultinomialNB predict: [0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 0 0 0 0 0 1 0]
自编 Multinomial predict: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
sk MultinomialNB  accuracy_score:0.772
自编 MultinomialNB  accuracy_score:0.763


sk BernoulliNB predict: [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0]
自编 Bernoulli predict: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0