## 3. Naive Bayes class implemented with NumPy - for continuous/discrete data
The following code contains a Naive Bayes class that accepts both continuous and discrete data, followed by a demonstration of its use on a simple dataset.

In [1]:
import numpy as np

In [2]:
class NaiveBayes():
    """Naive Bayes classifier for continuous and/or binary discrete features.

    Continuous features are modelled with one Gaussian per feature and class;
    discrete features are treated as 0/1 indicators whose probability of being
    1 is estimated per class with add-one smoothing. All scores are handled as
    log probabilities.

    Every per-class list below holds one entry per distinct label, in the
    sorted order returned by ``np.unique``; ``predict`` returns positions in
    that order, which equal the labels when they are the integers 0..k-1.

    Attributes:
        priors: log prior probability of each class.
        means: per-class array of feature means (continuous features).
        stds: per-class array of feature standard deviations.
        train_disc_probs: per-class array of smoothed P(feature == 1 | class).
    """

    def __init__(self):
        self.priors = []
        self.means = []
        self.stds = []

        self.train_disc_probs = []

    @staticmethod
    def _as_2d(data):
        """Return ``data`` as a 2-D array, treating 1-D input as one feature column."""
        arr = np.asarray(data)
        if arr.ndim == 1:
            arr = arr.reshape(-1, 1)
        return arr

    @staticmethod
    def _split_by_class(data, labels):
        """Return the rows of ``data`` grouped by class and the log prior of each class."""
        labels = np.asarray(labels)
        groups = [data[labels == c] for c in np.unique(labels)]
        priors = [np.log(len(group) / len(data)) for group in groups]
        return groups, priors

    def train(self, cont_data, disc_data, labels):
        """Fit the model on continuous and/or discrete training data.

        Args:
            cont_data: array of continuous features (rows are samples), or
                ``[]`` when there are none.
            disc_data: array of 0/1 features (rows are samples), or ``[]``
                when there are none.
            labels: array of integer class labels starting at zero, one per row.
        """
        if len(cont_data) != 0:
            self.train_cont(cont_data, labels)
        if len(disc_data) != 0:
            self.train_disc(disc_data, labels)

    def train_cont(self, data, labels):
        """Estimate the per-class mean and standard deviation of each continuous feature.

        Called by ``train``. Replaces any previously fitted continuous
        parameters, so fitting again does not leave stale entries behind.

        NOTE: a feature that is constant within a class gets a standard
        deviation of 0, which makes ``predict_cont`` divide by zero.

        Args:
            data: array of continuous features (rows are samples).
            labels: array of integer class labels starting at zero.
        """
        data = self._as_2d(data).astype(float)
        groups, priors = self._split_by_class(data, labels)

        self.means = [group.mean(axis=0) for group in groups]
        self.stds = [group.std(axis=0) for group in groups]
        self.priors = priors

    def predict_cont(self, test_data):
        """Return the Gaussian log-likelihood of every test point under every class.

        Args:
            test_data: array of continuous features (rows are samples).

        Returns:
            Array of shape (number of test points, number of classes).

        Raises:
            ValueError: if the model has not been trained on continuous data.
        """
        if not self.means:
            raise ValueError('model has not been trained on continuous data')

        points = self._as_2d(test_data).astype(float)
        half_log_two_pi = 0.5 * np.log(2 * np.pi)

        columns = []
        for mean, std in zip(self.means, self.stds):
            z = (points - mean) / std
            # Evaluate the log-density directly rather than log(exp(...)), which
            # underflows to -inf for points far away from the class mean.
            log_density = -(np.log(std) + half_log_two_pi + 0.5 * z ** 2)
            columns.append(log_density.sum(axis=1))

        return np.column_stack(columns)

    def train_disc(self, data, labels):
        """Estimate P(feature == 1 | class) for each discrete feature.

        Called by ``train``. Replaces any previously fitted discrete
        parameters, so fitting again does not leave stale entries behind.

        Args:
            data: array of 0/1 features (rows are samples).
            labels: array of integer class labels starting at zero.
        """
        data = self._as_2d(data)
        groups, priors = self._split_by_class(data, labels)

        # Add-one smoothing: one imaginary 1 and one imaginary 0 per feature, so
        # no estimated probability is exactly 0 or 1.
        self.train_disc_probs = [
            (group.sum(axis=0) + 1) / (len(group) + 2) for group in groups
        ]
        self.priors = priors

    def predict_disc(self, test_data):
        """Return the log-likelihood of every test point's 0/1 features under every class.

        Args:
            test_data: array of 0/1 features (rows are samples).

        Returns:
            Array of shape (number of test points, number of classes).

        Raises:
            ValueError: if the model has not been trained on discrete data.
        """
        if not self.train_disc_probs:
            raise ValueError('model has not been trained on discrete data')

        points = self._as_2d(test_data)
        is_on = points == 1
        is_off = points == 0

        columns = []
        for prob_on in self.train_disc_probs:
            # A feature contributes log(p) when it is 1 and log(1 - p) when it
            # is 0; any other value contributes nothing.
            contribution = (np.where(is_on, np.log(prob_on), 0.0)
                            + np.where(is_off, np.log(1 - prob_on), 0.0))
            columns.append(contribution.sum(axis=1))

        return np.column_stack(columns)

    def predict(self, cont_test_data, disc_test_data):
        """Predict the most probable class of each test point.

        Args:
            cont_test_data: array of continuous features, or ``[]`` if unused.
            disc_test_data: array of 0/1 features, or ``[]`` if unused.

        Returns:
            Array with the index of the highest-scoring class for each test
            point; empty when both inputs are empty.
        """
        has_cont = len(cont_test_data) != 0
        has_disc = len(disc_test_data) != 0
        if not (has_cont or has_disc):
            return np.empty(0, dtype=np.intp)

        # Start from the log priors and add the log-likelihood of every feature
        # type that was supplied, so both contribute when both are given.
        scores = np.asarray(self.priors)
        if has_cont:
            scores = scores + self.predict_cont(cont_test_data)
        if has_disc:
            scores = scores + self.predict_disc(disc_test_data)

        return np.argmax(scores, axis=1)

    def accuracy(self, predictions, test_labels):
        """Print the percentage of predictions that match ``test_labels``.

        Args:
            predictions: array of predicted class labels.
            test_labels: array of true class labels, same length.
        """
        acc = np.mean(np.asarray(predictions) == np.asarray(test_labels))

        print('{0:.2f}'.format(acc * 100) + '%')


# test 1
### only  continuous data
data prep

In [3]:
import pandas as pd

In [4]:
iris_data = pd.read_csv('iris.csv', names=['sepal l', 'sepal w', 'petal l', 'petal w', 'class'], index_col=False)

In [5]:
train_sample = iris_data.sample(120)

In [6]:
# Hold out every row that was not drawn into the training sample.
a = set(iris_data.index)
b = set(train_sample.index)
# A set has no defined order, so sort the remaining labels to get the
# held-out rows in a reproducible order.
c = sorted(a.difference(b))

test_sample = iris_data.reindex(c)

In [7]:
train_sample

Unnamed: 0,sepal l,sepal w,petal l,petal w,class
106,4.9,2.5,4.5,1.7,Iris-virginica
30,4.8,3.1,1.6,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
16,5.4,3.9,1.3,0.4,Iris-setosa
54,6.5,2.8,4.6,1.5,Iris-versicolor
...,...,...,...,...,...
94,5.6,2.7,4.2,1.3,Iris-versicolor
55,5.7,2.8,4.5,1.3,Iris-versicolor
93,5.0,2.3,3.3,1.0,Iris-versicolor
33,5.5,4.2,1.4,0.2,Iris-setosa


In [8]:
train_data = train_sample.iloc[:, :-1].to_numpy()

In [9]:
train_labels = train_sample.iloc[:, -1]

In [10]:
mapping = {'Iris-setosa': 0, 'Iris-virginica': 1, 'Iris-versicolor': 2}

In [11]:
train_labels = train_labels.map(mapping).to_numpy()

In [12]:
test_data = test_sample.iloc[:, :-1].to_numpy()

In [13]:
test_labels = test_sample.iloc[:, -1].map(mapping).to_numpy()

##### model test

In [14]:
model = NaiveBayes()

In [15]:
model.train(train_data, [], train_labels)

In [16]:
preds = model.predict(test_data, [])

In [17]:
model.accuracy(preds, test_labels)

93.33%


# test 2
### continuous and discrete data
data prep

In [18]:
len(iris_data[iris_data['class'] == 'Iris-setosa'])

50

In [19]:
len(iris_data[iris_data['class'] == 'Iris-versicolor'])

50

In [20]:
len(iris_data[iris_data['class'] == 'Iris-virginica'])

50

##### Adding a discrete feature that shouldn't affect the outcome
The values are sampled at random, so the 1st class gets roughly 75% 1s, the 2nd class roughly 50% and the 3rd class roughly 25%; each class therefore has a different proportion of 1s by which it can be differentiated from the others.

In [21]:
# Sampling pools for the synthetic binary feature: three, two and one of the
# four entries are 1, so a uniform draw yields a 1 with probability 3/4, 2/4
# and 1/4 respectively.
d = np.array([1, 1, 1, 0])
e = np.array([0, 0, 1, 1])
f = np.array([0, 0, 0, 1])

In [22]:
high = np.random.choice(d, 50, replace=True)

In [23]:
mid = np.random.choice(e, 50, replace=True)

In [24]:
low = np.random.choice(f, 50, replace=True)

In [25]:
discrete = np.hstack((high, mid, low))

In [26]:
iris_data['disc'] = discrete

In [27]:
iris_data = iris_data.iloc[:, [0, 1, 2, 3, 5, 4]]

In [28]:
iris_data.head() #same data with added discrete dimention

Unnamed: 0,sepal l,sepal w,petal l,petal w,disc,class
0,5.1,3.5,1.4,0.2,1,Iris-setosa
1,4.9,3.0,1.4,0.2,1,Iris-setosa
2,4.7,3.2,1.3,0.2,1,Iris-setosa
3,4.6,3.1,1.5,0.2,1,Iris-setosa
4,5.0,3.6,1.4,0.2,1,Iris-setosa


In [29]:
train_sample = iris_data.sample(120)

In [30]:
# Hold out every row that was not drawn into the training sample.
a = set(iris_data.index)
b = set(train_sample.index)
# A set has no defined order, so sort the remaining labels to get the
# held-out rows in a reproducible order.
c = sorted(a.difference(b))

test_sample = iris_data.reindex(c)

In [31]:
train_data_cont = train_sample.iloc[:, :-2].to_numpy()

In [32]:
train_data_disc = train_sample.iloc[:, -2].to_numpy()

In [33]:
train_labels = train_sample.iloc[:, -1]

In [34]:
mapping = {'Iris-setosa': 0, 'Iris-virginica': 1, 'Iris-versicolor': 2}

In [35]:
train_labels = train_labels.map(mapping).to_numpy()

In [36]:
test_data_cont = test_sample.iloc[:, :-2].to_numpy()

In [37]:
test_data_disc = test_sample.iloc[:, -2].to_numpy()

In [38]:
test_labels = test_sample.iloc[:, -1].map(mapping).to_numpy()

##### model test
When given both discrete and continuous data the model still works as expected

In [39]:
model = NaiveBayes()

In [40]:
model.train(train_data_cont, train_data_disc, train_labels)

In [41]:
pred = model.predict(test_data_cont, test_data_disc)

In [42]:
model.accuracy(pred, test_labels)

100.00%
