In [201]:
import pandas as pd
import numpy as np
import math
import random
import matplotlib.pyplot as plt

In [202]:
df = pd.read_csv('iris.csv')

**Investigating the dataset**

In [203]:
df.head(10)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
5,5.4,3.9,1.7,0.4,Setosa
6,4.6,3.4,1.4,0.3,Setosa
7,5.0,3.4,1.5,0.2,Setosa
8,4.4,2.9,1.4,0.2,Setosa
9,4.9,3.1,1.5,0.1,Setosa


In [204]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal.length    150 non-null float64
sepal.width     150 non-null float64
petal.length    150 non-null float64
petal.width     150 non-null float64
variety         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.3+ KB


**replacing catagorical data**

In [205]:
df_copy = df.copy()

In [206]:
replace_cat_data = {'variety':{'Setosa':1,'Versicolor':2,'Virginica':3}}

In [207]:
df_copy.replace(replace_cat_data,inplace=True)

In [208]:
df_copy.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [209]:
data = df_copy.values #creating a np matrix with dataframe

**splitting the data set into two part(training set and test set),split ratio is given by the user(in this example i am using 67% split ratio)**

In [210]:
def loadDataSet(training_data,test_data,split):
    for i in range(len(data)):
        dataset = data[i,:]
        if random.random()<split:
            training_data.append(dataset)
        else:
            test_data.append(dataset)

In [211]:
#testing function
split_ratio=0.67
training_data = []
test_data = []

loadDataSet(training_data,test_data,split_ratio)
training_set = np.asarray(training_data)
test = np.asarray(test_data)
print(training_set.shape)
print(test.shape)

(100, 5)
(50, 5)


**selecting feature by class**


In [212]:
#parameter(data,class label)
import pdb
def seperateFeatureByClass(data,k):
    x,y = data.shape
    seperated = {}
    count=0
    #initialize dictionay
    for i in range(0,y-1):
        seperated[i]=[]
    #pdb.set_trace()
    for i in range(0,len(data)):
        if int(data[i,4])==k:
            count+=1
            for j in range(0,y-1):
                seperated[j].append(data[i,j])
    
    seperated[y-1] = count#this is redundant (please delete this)
    return seperated  
    

In [213]:
#testing function
#x = seperateFeatureByClass(training_set,1)
x = {}
for i in range(0,3):
    x[i] = seperateFeatureByClass(training_set,i+1)


**calculate the mean and standard deviation of classes**

In [220]:
def calculateParameter(feature):

    parameter = np.empty([2,len(feature)])
    
    for j in range(0,len(feature)-1):
        mean = np.mean(feature[j])
        std = np.std(feature[j])
        parameter[0][j]=mean
        parameter[1][j]=std
    
    parameter[0][len(feature)-1] = feature[len(feature)-1]#this is redundant
    
    return parameter
    

In [275]:
#testing function
#par = calculateParameter(x[0])
y = np.zeros((3,2,5))
for i in range(0,3):
    y[i] = calculateParameter(x[i])
y[0]


array([[ 4.98684211,  3.40263158,  1.44210526,  0.23684211, 38.        ],
       [ 0.31470208,  0.3535436 ,  0.16800937,  0.0929659 ,  0.1       ]])

**Gaussian function to calculate the probability density**

In [216]:
def calculateProbability(x,mean,std):
    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(std,2))))
    return (1/(math.sqrt(2*math.pi)*std))*exponent

In [217]:
#testing function (ignore this)
calculateProbability(data[0,0],par[0][0],par[0][1])

0.11718040134771306

**calculate the conditional probability(eg: P(x=sepal.length|class=setosa))**

In [280]:
def probabilityByClass(data,parameter):
    probability = 1
    for i in range(0,len(data)-1):
        probability *= calculateProbability(data[i],parameter[0][i],parameter[1][i])
    
    return probability

In [292]:
#testing function
p_list = []
p_list.append(probabilityByClass(training_set[40],y[0]))
p_list.append(probabilityByClass(training_set[40],y[1]))
p_list.append(probabilityByClass(training_set[40],y[2]))
p_list.index(max(p_list))

1

In [296]:
#calculating the accuracy of the classifier
def getAccuracy(prediction,test):
    count = 0
    for i in range(0,len(test)):
        if prediction[i] == int(test[i,4]):
            count+=1
    
    return (count/len(test))*100

In [317]:
def main():
    split_ratio=0.67
    training_data = []
    test_data = []
    a = 3#number of class
    b = 2#number of parameter
    c = 5#number of feature
    

    loadDataSet(training_data,test_data,split_ratio)
    training_set = np.asarray(training_data)
    test = np.asarray(test_data)
    #splitting feature by class
    feature = {}
    for i in range(0,a):
        feature[i] = seperateFeatureByClass(training_set,i+1)
    #extract mean and std of each feature by class
    parameterByClass = np.zeros((a,b,c))
    for i in range(0,a):
        parameterByClass[i] = calculateParameter(feature[i])
    
    prediction = []
    for i in range(0,len(test)):
        probabilityByClassList = []
        for j in range(0,a):
            probabilityByClassList.append(probabilityByClass(test[i],y[j]))
        index = probabilityByClassList.index(max(probabilityByClassList))
        prediction.append(index+1)
        
    print("Accuracy of the Classifier is: "+str(getAccuracy(prediction,test))+"%")
    #return prediction
main()    

Accuracy of the Classifier is: 95.83333333333334%
