In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
mushroom=pd.read_csv('mushrooms.csv')

In [3]:
mushroom.head(5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
encoder=LabelEncoder()

In [5]:
encoder.fit_transform(["paris", "paris", "tokyo", "amsterdam"])

array([1, 1, 2, 0])

In [6]:
list(encoder.classes_)

['amsterdam', 'paris', 'tokyo']

In [7]:
encoder.fit_transform(["tokyo", "tokyo", "paris"])

array([1, 1, 0])

In [8]:
list(encoder.inverse_transform([0, 1, 1]))

['paris', 'tokyo', 'tokyo']

In [9]:
mushroom_encode=mushroom.apply(encoder.fit_transform,axis=0)

In [10]:
mushroom_encode.head(5)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [11]:
mushroom_encode.values

array([[1, 5, 2, ..., 2, 3, 5],
       [0, 5, 2, ..., 3, 2, 1],
       [0, 0, 2, ..., 3, 2, 3],
       ...,
       [0, 2, 2, ..., 0, 1, 2],
       [1, 3, 3, ..., 7, 4, 2],
       [0, 5, 2, ..., 4, 1, 2]])

In [12]:
mushroom_encode.values.shape

(8124, 23)

In [13]:
mushroom_np=mushroom_encode.values # Turning everithing to a numpy array

In [14]:
X=mushroom_np[:,1:]

In [15]:
y=mushroom_np[:,0]

In [16]:
y.shape

(8124,)

In [17]:
X.shape

(8124, 22)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [19]:
len(X_train), len(X_test)

(6499, 1625)

In [20]:
len(y_train), len(y_test)

(6499, 1625)

In [21]:
len(X_test)

1625

In [22]:
len(y_test)

1625

$$p(A|B)=\frac{p(B|A)p(A)}{p(B)}$$
+ P(A) = the prior, is the initial degree of belief in A.
+ P(A|B) = the posterior, is the degree of belief after incorporating news
that B is true.
+ P(B|A) = the likelihood, that can be estimated from the training data.


In [23]:
def prior(y_train,label):# p(y_train == label)=probability that y_train has label 0/1
    total=y_train.shape[0]
    true_labels=np.sum(y_train == label)
    return true_labels/total

In [24]:
prior(y_train,0)+prior(y_train,1)

1.0

In [25]:
def likelihood(X_train, y_train,col_num,col_val,label): # p(X|y_train == label)=p(x intersect y_train == label)/p(y_train == label)
        X_filter=X_train[y_train == label]
        num=np.sum(X_filter[:,col_num]==col_val)
        denum=X_filter.shape[0]
        
        return num/denum
    

In [26]:
likelihood(X_train,y_train,2,4,1)

0.2635609444798979

In [27]:
X_filter=X_train[y_train == 1]
X_filter[:,1]==2

array([ True,  True, False, ...,  True, False,  True])

In [28]:
def predict(X_train,y_train, X_test):
    classes=np.unique(y_train)
    number_feat=X_train.shape[1]
    post=[]
    for l in classes:
        like=1.0
        for i in range(number_feat):
            #temp=prior(y_train,l)*likelihood(X_train,y_train,i,X_train[i],l)#/(X_train.shape[0])
            g=likelihood(X_train,y_train,i,X_test[i],l)
            like=like*g
            #d.append(temp)
        pri=prior(y_train,l)
        ps=pri*like
        
        post.append(ps)
        pred=np.argmax(post)
    return pred

In [29]:
predict(X_train,y_train,X_test[1])

1

In [30]:
X_test[:,1]

array([0, 2, 3, ..., 3, 3, 0])

In [31]:
def accuracy(X_train,y_train,X_test,y_test):
    pred=[]
    for i in range(X_test.shape[0]):
        p=predict(X_train,y_train,X_test)
        pred.append(p)
        
    y_pred=np.array(pred)
    acc=np.sum(y_pred == y_test)/(y_test.shape[0])
    
    return acc

In [32]:
accuracy(X_train,y_train,X_test,y_test)

  This is separate from the ipykernel package so we can avoid doing imports until


0.5187692307692308