In [1]:
import numpy as np
import numbers
from sklearn.datasets import load_iris

In [2]:
class NavieBayes:
    """Naive Bayes classifier supporting discrete and continuous features.

    Numeric features are modelled with one Gaussian per (class, feature);
    string / boolean features are modelled with Laplace-smoothed per-class
    frequencies. Prediction picks the class with the highest log-posterior
    (log prior + sum of per-feature log-likelihoods).

    The misspelled public names (``NavieBayes``, ``dic_continous`` and the
    ``'continous'`` type tag) are kept on purpose so existing callers keep
    working.
    """

    # Lower bound applied to every fitted standard deviation so that a
    # constant feature (or a class with a single sample) cannot trigger a
    # division by zero in the Gaussian density.
    MIN_STD = 1e-9

    def __init__(self):
        self.X = None             # training features given to fit()
        self.y = None             # training labels given to fit()
        self.size = None          # number of training samples
        self.classes = set()      # distinct labels seen by fit()
        self.class_counts = {}    # label -> number of training samples
        self.dic_discrete = {}    # label -> feature index -> list of observed values
        self.dic_continous = {}   # label -> feature index -> (mean, std)
        self.dic_categories = {}  # feature index -> set of observed discrete values

    def fit(self, X, y):
        """Estimate per-class feature statistics from the training data.

        Args:
            X: numpy array of samples; each row is iterated feature by feature.
            y: numpy array of labels, same length as ``X``.

        Raises:
            AssertionError: if ``X`` / ``y`` are not numpy arrays, differ in
                length, or contain a feature of an unsupported type.
        """
        assert isinstance(X, np.ndarray), f"X should be a numpy array, not {type(X)}"
        assert isinstance(y, np.ndarray), f"y should be a numpy array, not {type(y)}"
        assert len(X) == len(y), "X and y should have same data size"

        self.X = X
        self.y = y
        self.size = len(X)
        self.classes = set(y)

        # Start from a clean slate: without this a second fit() call would mix
        # old statistics into the new ones (and crash on the stored tuples).
        self.class_counts = {}
        self.dic_discrete = {}
        self.dic_continous = {}
        self.dic_categories = {}

        # label -> feature index -> raw numeric samples (reduced to stats below)
        samples = {}
        for row, cls in zip(X, y):
            self.class_counts[cls] = self.class_counts.get(cls, 0) + 1
            for feature_idx, feature in enumerate(row):
                if self.get_type(feature) == 'discrete':
                    self.dic_discrete.setdefault(cls, {}) \
                        .setdefault(feature_idx, []).append(feature)
                    self.dic_categories.setdefault(feature_idx, set()).add(feature)
                else:
                    samples.setdefault(cls, {}) \
                        .setdefault(feature_idx, []).append(feature)

        # Mean and sample standard deviation per (class, feature). The
        # denominator is the number of samples of *that class* minus one,
        # not the size of the whole data set.
        for cls, per_feature in samples.items():
            stats = self.dic_continous.setdefault(cls, {})
            for feature_idx, values in per_feature.items():
                miu = np.mean(values)
                std = np.std(values, ddof=1) if len(values) > 1 else 0.0
                stats[feature_idx] = (miu, max(std, self.MIN_STD))

    def get_type(self, feature):
        """Return ``'discrete'`` for str/bool values and ``'continous'`` for numbers.

        Raises:
            AssertionError: if the value is neither a string, a bool nor a number.
        """
        # bool must be tested before Number: bool is a subclass of int and
        # would otherwise be treated as a continuous value.
        if isinstance(feature, (str, bool, np.bool_)):
            return 'discrete'
        if isinstance(feature, numbers.Number):
            return 'continous'
        raise AssertionError(
            f'Error, the feature type is {type(feature)}, not supported!')

    def _log_prob(self, cls, i, v, feature_type):
        """Return the natural log of the likelihood of ``v`` for feature ``i`` given ``cls``."""
        if feature_type == 'discrete':
            observed = self.dic_discrete.get(cls, {}).get(i, [])
            n_categories = max(len(self.dic_categories.get(i, ())), 1)
            # Laplace (add-one) smoothing keeps unseen values at a small
            # non-zero probability instead of zeroing the whole posterior.
            return np.log((observed.count(v) + 1) / (len(observed) + n_categories))
        if feature_type == 'continous':
            miu, std = self.dic_continous[cls][i]
            var = std ** 2
            # Log of the Gaussian density, computed directly to avoid underflow.
            return -0.5 * np.log(2 * np.pi * var) - (v - miu) ** 2 / (2 * var)
        raise ValueError(f'Unknown feature type: {feature_type!r}')

    def cal_prob(self, cls, i, v, feature_type):
        """Return the likelihood of value ``v`` for feature ``i`` given class ``cls``.

        Args:
            cls: class label.
            i: feature index.
            v: feature value.
            feature_type: ``'discrete'`` or ``'continous'`` (see ``get_type``).

        Returns:
            A smoothed relative frequency for discrete features, or the
            Gaussian density for continuous ones.

        Raises:
            ValueError: if ``feature_type`` is not one of the two known tags.
            KeyError: if no Gaussian was fitted for ``(cls, i)``.
        """
        return np.exp(self._log_prob(cls, i, v, feature_type))

    def predict(self, x):
        """Return the most probable class label for a single sample ``x``.

        Scores are accumulated in log space and include the class prior
        (class frequency in the training data). When several classes tie,
        the first one encountered while iterating ``self.classes`` wins.

        Raises:
            AssertionError: if the model has not been fitted yet.
        """
        assert self.classes, "The model must be fitted before calling predict"

        # The feature type does not depend on the class, so resolve it once.
        typed = [(idx, feature, self.get_type(feature))
                 for idx, feature in enumerate(x)]

        scores = []
        for cls in self.classes:
            score = np.log(self.class_counts[cls] / self.size)
            for idx, feature, feature_type in typed:
                score += self._log_prob(cls, idx, feature, feature_type)
            scores.append((cls, score))
        return max(scores, key=lambda item: item[1])[0]

In [3]:
# Load the iris data set, split it into features and labels, and report both shapes.
iris = load_iris()
X = iris.data
y = iris.target
print(X.shape, y.shape)

(150, 4) (150,)


In [4]:
# Create the classifier and fit it on the feature matrix X and labels y.
model = NavieBayes()
model.fit(X, y)

In [5]:
# Classify one sample (row index 11 of X); the predicted label is the cell's output.
model.predict(X[11])

0