# Naive Bayes Implementation

In [1]:
!pip install category_encoders==2.5.1

You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
import pandas as pd 
import numpy as np 
import math
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

In [3]:
class NB:
    def fit(self, X, y):
        s, f = X.shape
        self.cla = np.unique(y)
        c = len(self.cla)
        self.prior = np.zeros(c, dtype=np.float64)
        self.mean_val = np.zeros((c, f), dtype=np.float64)
        self.varr = np.zeros((c, f), dtype=np.float64)
        for i, j in enumerate(self.cla):
            alterX = X[y == j]
            self.prior[i] = alterX.shape[0] / float(s)
            self.varr[i, :] = alterX.var(axis=0)
            self.mean_val[i, :] = alterX.mean(axis=0)

    def predict(self, X):
        y_pred = [self.pred(x) for x in X]
        return y_pred

    def pred(self, x):
        post = []
        for i, j in enumerate(self.cla):
            posterior = np.sum(np.log(self.pad(i, x)))
            prior = np.log(self.prior[i])
            posterior = prior + posterior
            post.append(posterior)
        return self.cla[np.argmax(post)]

    def pad(self, i, x):
        var = self.varr[i]
        mean = self.mean_val[i]
        nu = np.exp(-((x - mean) ** 2) / (2 * var))
        den = np.sqrt(2 * np.pi * var)
        return nu / den

## Breast Cancer Dataset

In [4]:
bcd = pd.read_csv('https://raw.githubusercontent.com/meetgojiya98/Machine-Learning-Fall-22/main/Programming%20Project/Datasets/breast-cancer-wisconsin.data', sep=',', header=None)

In [5]:
bcd.columns = ['Sample code', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                'Normal Nucleoli', 'Mitoses','Class']
bcd = bcd.drop(['Sample code'],axis=1)
bcd = bcd.replace('?',np.NaN)
bcd_ = bcd['Bare Nuclei']
bcd_ = bcd_.fillna(bcd_.median())
bcd_ = bcd.dropna()

In [6]:
x = bcd_.iloc[:,:-1].values.astype(int)
y = bcd_.iloc[:,-1].values.astype(int)

In [7]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

def variance(x):
    n = len(x)
    mean = sum(x) / n
    deviations = [(x - mean) ** 2 for x in x]
    variance = sum(deviations) / n
    return variance

def standard_Deviation(x):
    var = variance(x)
    print(var)
    std_dev = math.sqrt(var)
    return std_dev

cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
scores=[]

for i, j in cv.split(x,y):
    X_train, X_test = x[i], x[j]
    y_train, y_test = y[i], y[j]
    nb = NB()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    score=accuracy(y_test, predictions)
    scores.append(score)
print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))
print('Standard Deviation: % s' % (standard_Deviation(scores)))

Accuracy: 0.96%
0.00021910488951357464
Standard Deviation: 0.014802192050962407
  posterior = np.sum(np.log(self.pad(i, x)))


In [8]:
# # assign data of lists.  
# dataRow = {'Name': ['1', '4', '5', '7'], 'Age': [20, 21, 19, 18]}  
  
# # Create DataFrame  
# dfff = pd.DataFrame(dataRow)  
  
# # Print the output.  
# print(dfff)  

# print(nb.predict(dfff))

## Car Dataset

In [9]:
cd = pd.read_csv('https://raw.githubusercontent.com/meetgojiya98/Machine-Learning-Fall-22/main/Programming%20Project/Datasets/car.data', sep=',', header=0)

In [10]:
cols = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
cd.columns = cols
X = cd.drop(['class'], axis=1)
y = cd['class']
cd = cd.dropna()
cd.reset_index(drop=True, inplace=True)

In [11]:
encoder_X = ce.OrdinalEncoder(cols=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'])
x= encoder_X.fit_transform(X)

encoder_Y = ce.OrdinalEncoder()
y = np.ravel(encoder_Y.fit_transform(y))

In [12]:
x = pd.DataFrame(x)
y = pd.DataFrame(y)
y.columns=['class']
x = x.values.astype(int)
y = y.iloc[:,0].values.astype(int)

In [13]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=123)
scores=[]  

for i, j in cv.split(x,y):
    X_train, X_test = x[i], x[j]
    y_train, y_test = y[i], y[j]
    nb = NB()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    score=accuracy(y_test, predictions)
    scores.append(score)

print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))
print('Standard Deviation: % s' % (standard_Deviation(scores)))

  nu = np.exp(-((x - mean) ** 2) / (2 * var))
  return nu / den
  nu = np.exp(-((x - mean) ** 2) / (2 * var))
Accuracy: 0.04%
9.503516226939076e-05
Standard Deviation: 0.009748597964291622


## Mushroom Dataset

In [14]:
md =  pd.read_csv("https://raw.githubusercontent.com/meetgojiya98/Machine-Learning-Fall-22/main/Programming%20Project/Datasets/agaricus-lepiota.data", sep=',', header=None)


In [15]:
md.columns = ['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']
col=['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']

In [16]:
labelencoder=LabelEncoder()
for column in md.columns:
    md[column] = labelencoder.fit_transform(md[column])

In [17]:
x = md.iloc[:,1:].values.astype(int)
y= md.iloc[:,0].values.astype(int)

In [18]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy * 100


cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=3)
scores=[]

for i, j in cv.split(x,y):
    X_train, X_test = x[i], x[j]
    y_train, y_test = y[i], y[j]
    nb = NB()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    score=accuracy(y_test, predictions)
    scores.append(score)

print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))
print('Standard Deviation: % s' % (standard_Deviation(scores)))

  nu = np.exp(-((x - mean) ** 2) / (2 * var))
  posterior = np.sum(np.log(self.pad(i, x)))
Accuracy: 51.80%
1.1332724654828517
Standard Deviation: 1.064552706766016


## Ecoli Dataset

In [19]:
ed =  pd.read_csv("https://raw.githubusercontent.com/meetgojiya98/Machine-Learning-Fall-22/main/Programming%20Project/Datasets/ecoli.data",header=None,sep="\s+")
col_names = ["squence_name","mcg","gvh","lip","chg","aac","alm1","alm2","site"]
ed.columns = col_names

In [20]:
ed.loc[:,ed.dtypes == "object"].columns.tolist()

['squence_name', 'site']

In [21]:
def cleaning_object(ed,cols_to_drop,class_col):
    ed = ed.drop(cols_to_drop,axis=1)
    uni_class = ed[class_col].unique().tolist()
    for class_label in uni_class:
        num_rows = sum(ed[class_col] == class_label)
        if num_rows < 10:
            class_todrop = ed[ed[class_col] == class_label].index
            ed.drop(class_todrop,inplace = True)
    return ed

In [22]:
ced = cleaning_object(ed,["squence_name",'lip','chg'],"site")

In [23]:
ced["site"].value_counts()

cp     143
im      77
pp      52
imU     35
om      20
Name: site, dtype: int64

In [24]:
encoder_Y = ce.OrdinalEncoder()
y=np.ravel(encoder_Y.fit_transform(ced["site"]))

In [25]:
ced=ced.drop('site',axis=1,)
X = ced.values.astype(float)

In [26]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=123)
scores=[]

for i, j in cv.split(X,y):
    X_train, X_test = X[i], X[j]
    y_train, y_test = y[i], y[j]
    nb = NB()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    score=accuracy(y_test, predictions)
    scores.append(score)

print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))
print('Standard Deviation: % s' % (standard_Deviation(scores)))

Accuracy: 0.88%
0.0015669084388804664
Standard Deviation: 0.03958419430631962


## Letter Recognition Dataset

In [27]:
ld =  pd.read_csv("https://raw.githubusercontent.com/meetgojiya98/Machine-Learning-Fall-22/main/Programming%20Project/Datasets/letter-recognition.data",header=None)
col_names = ["letter","xbox","ybox","width","height","onpix","xbar","ybar","x2bar","y2bar","xybar","x2ybar","xy2bar","xedge","xedgey","yedge","yedgex"]
ld.columns = col_names

In [28]:
X = ld.iloc[:, 1:]
y = ld['letter'].tolist()
x = X.values.astype(float)


In [29]:
encoder_Y = ce.OrdinalEncoder()
y=np.ravel(encoder_Y.fit_transform(y))

In [30]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
scores=[] 

for i, j in cv.split(x,y):
    X_train, X_test = x[i], x[j]
    y_train, y_test = y[i], y[j]
    nb = NB()
    nb.fit(X_train, y_train)
    predictions = nb.predict(X_test)
    score=accuracy(y_test, predictions)
    scores.append(score)

print('Accuracy: %.2f%%' % (sum(scores)/float(len(scores))))
print('Standard Deviation: % s' % (standard_Deviation(scores)))

Accuracy: 0.64%
7.948500000000006e-05
Standard Deviation: 0.00891543605215135


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=622b4942-d89f-4437-a464-f16e2e4df832' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>