**Import the necessary libraries**

In [None]:
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB


**Data Exploration** 

In [None]:
# Load the iris dataset bundled with scikit-learn and show the names of its features
iris = datasets.load_iris()
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [None]:
# Display the whole dataset object as returned by load_iris (note: this prints the full data array)
iris

 'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

<a href="https://imgbb.com/"><img src="https://i.ibb.co/mDC1KSt/petal-sepal.png" alt="petal-sepal" border="0"></a>

In [None]:
# Names of the three iris species used as class labels
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

<a href="https://ibb.co/8sJhx0R"><img src="https://i.ibb.co/h9kqdyn/iris.png" alt="iris" border="0"></a>

**Loading and splitting the dataset into train and test sets**

In [None]:
# Total number of labelled samples in the dataset
len(iris.target)

150

In [None]:
# Load the iris dataset again (it was already loaded in the exploration section) so `iris` is defined for the split below
iris = datasets.load_iris()
# Splits the data in an 80:20 ratio (by default) and returns 4 numpy arrays- train_x, train_y, test_x, test_y
# The split is done class by class, so every class contributes the same train/test proportion
# shapes (for equally sized classes)- train_x: (0.8*len(iris.data),4), train_y : (0.8*len(iris.data),),test_x:(0.2*len(iris.data),4) , test_y:(0.2*len(iris.data),)
def dataset_splitter(iris, train_fraction=0.8):
  """Split a labelled dataset into train and test parts, class by class.

  For every distinct label in ``iris.target`` (in sorted order) the first
  ``round(train_fraction * class_size)`` rows of that class go to the training
  set and the remaining rows go to the test set. The per-class pieces are then
  concatenated in label order, so for the iris data (50 rows per class) the
  result is 40 train / 10 test rows per class.

  Args:
    iris: object exposing ``data`` (n_samples, n_features) and ``target``
      (n_samples,) array-likes, e.g. the result of ``datasets.load_iris()``.
    train_fraction: fraction of each class assigned to the training set;
      must lie in [0, 1]. Defaults to 0.8.

  Returns:
    Tuple ``(train_x, train_y, test_x, test_y)`` of numpy arrays.

  Raises:
    ValueError: if ``train_fraction`` is outside [0, 1].
  """
  if not 0.0 <= train_fraction <= 1.0:
    raise ValueError('train_fraction must be between 0 and 1, got {}'.format(train_fraction))

  data = np.asarray(iris.data)
  target = np.asarray(iris.target)

  train_x_parts, train_y_parts = [], []
  test_x_parts, test_y_parts = [], []
  # np.unique returns the labels sorted, which keeps the class-0, class-1, class-2 ordering
  for label in np.unique(target):
    in_class = target == label
    class_rows, class_labels = data[in_class], target[in_class]
    # derive the cut point from the actual class size instead of assuming 50 rows per class
    n_train = int(round(train_fraction * len(class_rows)))
    train_x_parts.append(class_rows[:n_train])
    train_y_parts.append(class_labels[:n_train])
    test_x_parts.append(class_rows[n_train:])
    test_y_parts.append(class_labels[n_train:])

  if not train_x_parts:
    # no samples at all: np.concatenate cannot handle an empty list, so return empty slices
    return (data[:0], target[:0], data[:0], target[:0])

  train_x = np.concatenate(train_x_parts)
  train_y = np.concatenate(train_y_parts)
  test_x = np.concatenate(test_x_parts)
  test_y = np.concatenate(test_y_parts)

  return (train_x, train_y, test_x, test_y)

**Function to calculate mean and variance**

In [None]:
# Computes per-feature statistics of the input by reducing over its first axis (the samples)
# returns a tuple: 1.mean : shape- (4,) 2.variance : shape- (4,)
def mean_var_calculator(data):
  """Return ``(mean, variance)`` of ``data`` taken along axis 0.

  Both statistics are computed with numpy (``np.mean`` / ``np.var``), i.e. one
  value per column of a 2-D input.
  """
  return np.mean(data, axis=0), np.var(data, axis=0)

**Calculating log posterior**

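\begin{equation}
\mathrm{Posterior}(C \mid X;\ \mu_c, \sigma_c^{2}) \;\propto\; P(C)\,\prod_{i=1}^{4} \frac{1}{\sqrt{2\pi\sigma_{c,i}^{2}}}\exp\left(-\frac{(x_i-\mu_{c,i})^{2}}{2\sigma_{c,i}^{2}}\right)
\end{equation}

Taking the logarithm turns the product into a sum:

\begin{equation}
\log \mathrm{Posterior}(C \mid X) = \log P(C) - \frac{1}{2}\sum_{i=1}^{4}\log\left(2\pi\sigma_{c,i}^{2}\right) - \sum_{i=1}^{4}\frac{(x_i-\mu_{c,i})^{2}}{2\sigma_{c,i}^{2}} + \text{const}
\end{equation}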

In [None]:
# this function takes the following parameters as input-
#1. x: data sample(s) for which the posterior is to be calculated- shape: (n_features,) or (n_samples, n_features)
#2. mean, var: Mean, Variance of the gaussian w.r.t which posterior is to be estimated- shape: (n_features,) each
#3. prior (optional): prior probability of the class; defaults to 1/3, i.e. a Uniform Prior on the 3 class labels
#returns the log of the (unnormalised) posterior- a scalar for a single sample, shape: (x.shape[0],) for a batch
def calc_log_posterior(x, mean, var, prior=1.0 / 3.0):
  """Log of prior times Gaussian naive-Bayes likelihood of ``x``.

  The features are treated as independent Gaussians, so the log likelihood is
  the sum over features of
  ``-0.5 * log(2 * pi * var_i) - (x_i - mean_i)**2 / (2 * var_i)``.

  ``var`` must be strictly positive; a zero variance leads to a division by
  zero (numpy emits a warning and returns inf/nan).
  """
  x = np.asarray(x, dtype=float)
  mean = np.asarray(mean, dtype=float)
  var = np.asarray(var, dtype=float)

  log_prior = np.log(prior)
  # log of prod_i 1/sqrt(2*pi*var_i)  ==  -0.5 * sum_i log(2*pi*var_i)
  log_normaliser = -0.5 * np.sum(np.log(2. * np.pi * var))
  # exponent of the Gaussian; summing over the last axis (features) keeps one value per sample
  log_exponent = -0.5 * np.sum((x - mean) ** 2 / var, axis=-1)
  posterior = log_prior + log_normaliser + log_exponent

  return (posterior)

   **Making predictions using the model that we have**


In [None]:
#inputs: 1. x- datapoint for which the prediction is to be calculated
#        2. set_0 - a tuple of mean and variance for training samples belonging to class:0
#           set_1 - a tuple of mean and variance for training samples belonging to class:1
#           set_2 - a tuple of mean and variance for training samples belonging to class:2
#returns:   class_label: integer in range:{0,1,2}
def predict(x, set_0, set_1, set_2):
  """Return the label of the class whose log posterior for ``x`` is largest.

  Each class is described by its ``(mean, variance)`` tuple; the log posterior
  of ``x`` under every class is evaluated once with ``calc_log_posterior`` and
  the index of the largest value is returned.
  """
  log_posteriors = [
    calc_log_posterior(x, class_stats[0], class_stats[1])
    for class_stats in (set_0, set_1, set_2)
  ]

  # np.argmax returns the first index of the maximum, so ties go to the lowest
  # class label, the same preference the previous if/elif chain had
  class_label = int(np.argmax(log_posteriors))

  return (class_label)

In [None]:
# Split the data, then gather the training rows of each class with boolean masks
train_x, train_y, test_x, test_y = dataset_splitter(iris)
rows_by_class = [train_x[train_y == label] for label in (0, 1, 2)]
samples_0, samples_1, samples_2 = [
  (rows, np.array([label] * len(rows)))  # (data with only samples from class:label, matching label vector)
  for label, rows in enumerate(rows_by_class)
]

# One (mean, variance) pair per class, estimated from that class's training rows only
set_0, set_1, set_2 = (
  mean_var_calculator(class_rows) for class_rows, _ in (samples_0, samples_1, samples_2)
)

# Classify every test row and report the fraction of correct predictions
y_pred = np.array([predict(row, set_0, set_1, set_2) for row in test_x])
print ('accuracy of the model is {}'.format((y_pred==test_y).mean()))

accuracy of the model is 1.0


**Comparison with the built-in model**

In [None]:
# scikit-learn's Gaussian Naive Bayes classifier, created with its default settings
model = GaussianNB()

**Steps:**
  1. Fit the model on the training data (`train_x`, `train_y`)
  2. Make predictions on `test_x` using the built-in functions and print the accuracy

In [None]:
# Train on the training split and classify the held-out test rows (fit returns the estimator itself)
predictions = model.fit(train_x, train_y).predict(test_x)

# Accuracy = number of matching labels divided by the number of test rows
correct_predictions = (predictions == test_y).sum()
accuracy = correct_predictions / float(test_x.shape[0])
print ('accuracy of using inbuilt model is {}'.format(accuracy))

accuracy of using inbuilt model is 1.0
