In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Naive Bayes --- Mushroom Classifier

#### Load the Dataset

In [2]:
data = pd.read_csv('../dataset/mushrooms.csv')

In [3]:
print(data.shape)

(8124, 23)


In [4]:
print(data.isnull().values.sum())

0


In [5]:
data.head(3)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


#### converting the categorical data into numerical value

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
le = LabelEncoder()

In [8]:
numerical = data.apply(le.fit_transform)

In [9]:
print(numerical.shape)

(8124, 23)


In [10]:
numerical.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [11]:
x = numerical.values[:, 1:]
y = numerical.values[:, 0]

In [12]:
print(x.shape)
print(y.shape)

(8124, 22)
(8124,)


####  train test split

In [13]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.40)

In [14]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(4874, 22) (4874,)
(3250, 22) (3250,)


In [15]:
print(x[:5, :])

[[5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


#### Building our classifier

In [16]:
def prior_prob(y, label):
    total_classes = y.shape[0]
    total_labels = np.sum(y == label)
    return (total_labels)/float(total_classes)

def conditional_prob(x, y, fcol, fval,  label): #fcol - feautre_col  #fval-feautre_value
#     m = x.shape[0]
#     total_labels = np.sum(y == label)
#     print(total_labels)
#     total = 0.0
#     for i in range(m):
#         if x[i][fcol] == fval and y[i] == label:
#             total += 1
    
#     return total/total_labels
    # short cut to this is
    x_filtered = x[y == label]
    numerator = np.sum(x_filtered[:, fcol] == fval )
    denominator = np.sum(y == label)
    return numerator/denominator

def predict(x, y, test):
    labels = np.unique(y)
    post_prob = []
    m = test.shape[0]
    
    for label in labels:
        
        likelihood = 1.0
        
        for fcol in range(m):
            likelihood *= conditional_prob(x, y, fcol, test[fcol], label)
            
        post_prob.append(likelihood*prior_prob(y, label))    
        
    pred = np.argmax(post_prob)
    return pred
        


In [17]:
prior_prob(np.array([0, 0, 1]), 0)

0.6666666666666666

In [18]:
conditional_prob(np.array([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1], [1, 1, 1]]), np.array([0, 1, 1, 1, 0]), 0, 0, 0)

0.5

In [19]:
predict(x_train, y_train, x_test[52])

1

In [20]:
x_test[52]

array([5, 3, 9, 0, 2, 1, 0, 0, 3, 0, 1, 1, 1, 4, 0, 0, 2, 1, 2, 1, 4, 0])

In [21]:
y_test[52]

1

In [22]:
predictions = []
for each in x_test:
    predictions.append(predict(x_train, y_train, each))

In [23]:
import sklearn.metrics as metrics

In [24]:
metrics.accuracy_score(predictions, y_test)

0.9975384615384615

### Gaussian Naive Bayes -- Handling Continuous Data

In [109]:
from sklearn.datasets import load_breast_cancer

In [110]:
cancer = load_breast_cancer()

In [112]:
cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [113]:
x = cancer.data
y = cancer.target
y = np.reshape(y, (-1, 1))

In [114]:
print(x.shape)
print(y.shape)

(569, 30)
(569, 1)


In [115]:
combined = np.hstack((x, y))

In [116]:
np.random.shuffle(combined)

### split the data

In [117]:
from sklearn.model_selection import train_test_split

In [118]:
x_train, x_test, y_train, y_test = train_test_split(combined[:, :-1], combined[:, -1], test_size = 0.2)

In [224]:
def likelihood(x, y, label, test):
    lh = 1.0
    mean = np.mean(x[y==label], axis = 0)
    var = np.var(x[y==label], axis = 0)
    m = test.shape[0]
    for i in range(m):
        ft = 1/(2*np.pi*var[i])
        st = np.e**(-(test[i] - mean[i])**2)/(2*var[i])
        lh *= (ft*st)
                
    return lh  

def predictGNBC(x_train, y_train, test):
    labels = np.unique(y_train)
    post_prob = []
    m = test.shape[0]
    
    for label in labels:
        lh = likelihood(x_train, y_train, label, test)
        pp = prior_prob(y_train, label)
        prob = lh*pp
        post_prob.append(prob)
        print(lh)
    print(post_prob)
    return np.argmax(post_prob)
    

In [225]:
likelihood(x_train, y_train, y_train[0], x_test[12])

0.0

In [226]:
np.var(x_train[:, 23][y_train==1])

27363.933201072257

In [227]:
np.mean(x_train[y_train == y_train[0]], axis = 0)

array([1.22111203e+01, 1.78002062e+01, 7.85109278e+01, 4.67574570e+02,
       9.22724742e-02, 8.03916838e-02, 4.61341158e-02, 2.60678454e-02,
       1.73440893e-01, 6.27085567e-02, 2.82579038e-01, 1.20736873e+00,
       1.99487491e+00, 2.11492543e+01, 7.15265636e-03, 2.14781821e-02,
       2.57401945e-02, 9.89169072e-03, 2.04818866e-02, 3.56755430e-03,
       1.34496426e+01, 2.32622680e+01, 8.74275258e+01, 5.64773883e+02,
       1.24483058e-01, 1.82057904e-01, 1.65833052e-01, 7.47501684e-02,
       2.69187285e-01, 7.90149141e-02])

In [228]:
x_train[:, 25]

array([0.2878 , 0.4061 , 0.2394 , 0.2436 , 0.2068 , 0.225  , 0.3903 ,
       0.8681 , 0.2364 , 0.1792 , 0.07158, 0.2548 , 0.7394 , 0.0739 ,
       0.1979 , 0.4099 , 0.2187 , 0.2658 , 0.1361 , 0.295  , 0.1017 ,
       0.1108 , 0.05332, 0.09515, 0.05213, 0.4233 , 0.1115 , 0.1202 ,
       0.284  , 0.205  , 0.261  , 0.1949 , 0.1472 , 0.1364 , 0.2942 ,
       0.06885, 0.1766 , 0.6164 , 0.09473, 0.5634 , 0.2405 , 0.1352 ,
       0.2053 , 0.4116 , 0.09605, 0.1936 , 0.09148, 0.5717 , 0.3885 ,
       0.1507 , 0.4365 , 0.1633 , 0.146  , 0.2167 , 0.09794, 0.1087 ,
       0.1637 , 0.255  , 0.3593 , 0.1397 , 0.165  , 0.2031 , 0.1626 ,
       0.1352 , 0.09708, 0.4126 , 0.2809 , 0.1892 , 0.3898 , 0.1049 ,
       0.2236 , 0.4503 , 0.3856 , 0.1398 , 0.2531 , 0.3539 , 0.04953,
       0.09358, 0.3454 , 0.255  , 0.3861 , 0.1044 , 0.3568 , 0.1788 ,
       0.06744, 0.4186 , 0.1239 , 0.1147 , 0.1266 , 0.04619, 0.3663 ,
       0.1958 , 0.2979 , 0.1064 , 0.3835 , 0.1477 , 0.6076 , 0.1678 ,
       0.1402 , 0.74

In [229]:
np.e

2.718281828459045

In [230]:
a = x_train[:, 22
           ][y_train==1]

In [231]:
a

array([ 88.14,  62.56,  88.81,  89.  ,  64.48,  82.66,  66.5 ,  97.17,
        82.74,  93.63,  85.09,  94.22, 109.8 ,  73.07,  96.53,  79.26,
        81.6 ,  65.74,  85.08,  84.16,  50.41,  93.22, 114.3 ,  88.54,
        94.52,  78.44, 102.9 ,  68.81,  86.54,  88.87,  91.63,  78.  ,
        78.27,  65.5 ,  83.9 , 100.4 , 105.8 ,  82.08,  80.92, 104.6 ,
        76.51,  95.23,  75.4 ,  73.23,  86.04,  81.23,  71.79,  79.29,
        79.73,  88.91,  80.78,  83.12,  77.98,  91.99,  84.48,  82.14,
        90.67, 113.1 ,  70.76, 102.8 ,  84.11,  59.9 ,  87.4 ,  86.97,
        84.46,  69.47,  89.69, 112.5 ,  88.1 ,  86.57,  65.59,  71.98,
        87.38,  69.1 ,  64.01,  97.19,  57.26,  89.02,  59.16, 103.1 ,
        96.74,  88.12,  97.58, 107.1 , 102.5 ,  82.98, 114.1 ,  86.  ,
        72.62,  97.67,  84.35,  62.86,  99.71, 101.1 ,  54.49, 105.9 ,
       107.  , 100.9 ,  98.4 ,  85.13,  70.89,  81.25,  79.12,  67.88,
        96.59, 104.5 ,  62.25,  86.43,  85.07,  96.66,  68.62,  90.24,
      

In [232]:
np.sum(a)/a.shape[0]

87.42752577319588