# Naive Bayes Algorithm

In [1]:
import pandas as pd
import requests
import io

# Source: UCI Machine Learning Repository, "credit-screening" data set (crx.data, no header row).
url="https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"
# A timeout keeps Restart-&-Run-All from hanging on a dead connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
s=response.content
credit=pd.read_csv(io.StringIO(s.decode('utf-8')), header=None)

In [2]:
# Preview the first rows of the loaded frame (columns are unnamed because header=None was used).
credit.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [12]:
# NOTE(review): this cell's execution count (12) is higher than that of the cleaning cell (4),
# so the recorded output most likely shows the frame *after* rows were dropped.
# Re-run the notebook top to bottom to refresh it.
credit.shape

(653, 16)

In [16]:
# Class balance of the target (column 15: '+' / '-').
# NOTE(review): executed out of order (count 16), so the recorded counts presumably
# describe the cleaned frame rather than the raw download — confirm by re-running.
credit[15].value_counts()

-    357
+    296
Name: 15, dtype: int64

### data preprocessing

In [4]:
import numpy as np
# '?' is the file's placeholder for a missing value: turn it into NaN and
# discard every row that contains at least one NaN, in a single chained step.
credit = credit.replace('?', np.nan).dropna()
print(credit.shape)
credit.head(5)

(653, 16)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [5]:
# Inspect the inferred dtypes: columns 1 and 13 are reported as object even though
# the preview shows numeric values in them, so they need an explicit cast.
credit.dtypes

0      object
1      object
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13     object
14      int64
15     object
dtype: object

#### Some columns do not have the proper dtype, so we cast them explicitly

In [6]:
# Positions of the columns that hold numeric measurements and must be float64,
# because Naive_Bayes treats every non-float64 column as categorical.
numeric_positions = [1, 2, 7, 10, 13, 14]

# Use DataFrame.astype instead of `credit.iloc[:, i] = ...astype(...)`: on pandas >= 2.0
# an iloc column assignment writes the values into the existing column and keeps its old
# dtype (object / int64), so the cast would silently have no effect.
credit = credit.astype({credit.columns[position]: 'float64' for position in numeric_positions})

credit.dtypes

0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10    float64
11     object
12     object
13    float64
14    float64
15     object
dtype: object

In [7]:
# Dropping rows left gaps in the row labels; renumber them 0..n-1 and
# discard the old labels instead of keeping them as a column.
credit = credit.reset_index(drop=True)
credit.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1.0,f,g,202.0,0.0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6.0,f,g,43.0,560.0,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0.0,f,g,280.0,824.0,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5.0,t,g,100.0,3.0,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0.0,f,s,120.0,0.0,+


##  Task: 
Design a **Naive Bayes (NB) classifier** to predict the class of the following sample; the expected prediction for this sample is negative (-).

In [8]:
# Take the row at position 99 as the sample to classify and display it.
x = credit.iloc[99]
x

0         b
1     27.83
2         4
3         y
4         p
5         i
6         h
7      5.75
8         t
9         t
10        2
11        t
12        g
13       75
14        0
15        -
Name: 99, dtype: object

## First we need to design it
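The probability that $x$ belongs to class $i$ (where $i$ is either + or -) is computed with the Naive Bayes rule, which treats the $d$ features as conditionally independent given the class. Assuming equal class priors, the posterior is proportional to the product of the per-feature likelihoods. (The implementation below additionally multiplies by the class prior $P(C=i)$ and normalises over the classes so that the scores sum to 1.)

$$ P(C=i \mid x) \propto \prod_{j=1}^{d} p(x_{j} \mid C=i)$$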

For a categorical variable, $P(x_j = value \mid C=i)$ is simply the number of class-$i$ training samples with $x_j = value$ divided by the size of the class-$i$ subset. For a numerical variable, we first compute the mean $\mu_{ij}$ and the standard deviation $\sigma_{ij}$ of variable $j$ within the class-$i$ subset. We then evaluate the Gaussian density at the given value to measure how likely that value is under class $i$:

$$ P(x_{j}= value \mid C_{i})= \frac{1}{\sqrt{2\pi \sigma_{ij}^2}} \, e^{- \frac{ (value -\mu_{ij})^2}{2 \sigma_{ij}^{2} }}$$
   

In [9]:
def Naive_Bayes(X, Y, X_test, Y_test, alpha=0.0):
    """Fit a mixed categorical/Gaussian Naive Bayes model and score a test set.

    Columns of ``X`` whose dtype is ``float64`` are modelled with one Gaussian
    per class (population variance); every other column is treated as
    categorical and modelled with per-class relative frequencies.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features.
    Y : array-like
        Training labels, one per row of ``X`` (matched by position).
    X_test : pandas.DataFrame
        Test features with the same column order as ``X``.
    Y_test : array-like
        True test labels, one per row of ``X_test`` (matched by position).
    alpha : float, default 0.0
        Additive (Laplace) smoothing for categorical features:
        ``(count + alpha) / (class_count + alpha * n_categories)``.
        The default of 0 reproduces plain relative frequencies.

    Returns
    -------
    dict
        ``"prob_table"``: dict mapping ``'Column_<name>'`` to a DataFrame of
        P(value | class) for each categorical column (rows = categories,
        columns = classes).
        ``"pred"``: DataFrame with one posterior column per class (rounded to
        3 decimals), ``"Final_pred"`` and ``"True_labels"``.

    Raises
    ------
    ValueError
        If ``alpha`` is negative, the training set is empty, or the row or
        column counts of the inputs do not match.

    Notes
    -----
    * A test category that never occurred in training gets a count of 0
      (likelihood ``alpha / (class_count + alpha * n_categories)``) instead of
      raising ``KeyError``.
    * Scores are accumulated in log space to avoid underflow. If no class has a
      finite, non-zero score for a row, the class priors are used for it.
    * The accuracy is printed as a side effect.
    """
    import numpy as np
    import pandas as pd

    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Plain arrays make every label operation positional, so a Y / Y_test that
    # carries a non-default index cannot be misaligned against the features.
    y_train = np.asarray(Y)
    y_true = np.asarray(Y_test)
    n_test = X_test.shape[0]
    if len(y_train) == 0 or len(y_train) != X.shape[0]:
        raise ValueError("X and Y must be non-empty and have the same number of rows")
    if X_test.shape[1] != X.shape[1]:
        raise ValueError("X_test must have the same number of columns as X")
    if len(y_true) != n_test:
        raise ValueError("X_test and Y_test must have the same number of rows")

    Y_unique = np.unique(y_train)
    class_masks = [y_train == label for label in Y_unique]
    Y_count = np.array([mask.sum() for mask in class_masks], dtype=float)
    # Every class in Y_unique occurs at least once, so the log priors are finite.
    log_prior = np.log(Y_count / len(y_train))

    # log P(C) + sum_j log p(x_j | C), one row per test instance, one column per class.
    log_joint = np.tile(log_prior, (n_test, 1))
    # Floor for degenerate (constant) numeric features, avoiding a division by zero.
    min_variance = np.finfo(float).eps

    prob_table = {}
    with np.errstate(divide="ignore"):  # log(0) = -inf is intended for impossible values
        for j in range(X.shape[1]):
            train_col = X.iloc[:, j]
            test_col = X_test.iloc[:, j]

            if train_col.dtype != 'float64':  # categorical
                train_values = train_col.to_numpy()
                categories = np.unique(train_values)
                n_categories = len(categories)
                counts = np.array(
                    [[np.sum(train_values[mask] == category) for mask in class_masks]
                     for category in categories],
                    dtype=float,
                ).reshape(n_categories, len(Y_unique))
                table = pd.DataFrame(
                    (counts + alpha) / (Y_count + alpha * n_categories),
                    index=categories,
                    columns=Y_unique,
                )
                prob_table['Column_{}'.format(X.columns.values[j])] = table

                # Label-based lookup for all test rows at once; categories that
                # were never seen in training come back as all-NaN rows.
                likelihood = table.reindex(test_col.to_numpy()).to_numpy(dtype=float, copy=True)
                unseen = np.isnan(likelihood).all(axis=1)
                likelihood[unseen] = alpha / (Y_count + alpha * n_categories)
                log_joint += np.log(likelihood)

            else:  # numerical: Gaussian likelihood, statistics computed once per class
                train_values = train_col.to_numpy(dtype=float)
                test_values = test_col.to_numpy(dtype=float)
                for c, mask in enumerate(class_masks):
                    class_values = train_values[mask]
                    mean = class_values.mean()
                    variance = max(class_values.var(), min_variance)
                    log_joint[:, c] += (-0.5 * np.log(2 * np.pi * variance)
                                        - (test_values - mean) ** 2 / (2 * variance))

    # Normalise with the log-sum-exp trick. Rows without any finite score
    # (every class impossible, or NaN inputs) fall back to the class priors.
    row_max = log_joint.max(axis=1, keepdims=True)
    undecidable = ~np.isfinite(row_max[:, 0])
    log_joint[undecidable] = log_prior
    row_max[undecidable, 0] = log_prior.max()
    weights = np.exp(log_joint - row_max)
    posterior = weights / weights.sum(axis=1, keepdims=True)

    # Decide on the unrounded posteriors and over *all* classes; rounding is
    # applied only to the values that are displayed.
    predicted = Y_unique[posterior.argmax(axis=1)]

    ##               +     -   Final_Pred  True_labels
    ## instance1   #.##  #.##    + or -      + or -
    ## ...
    pred_table = pd.DataFrame(np.round(posterior, 3), index=range(n_test), columns=Y_unique)
    pred_table["Final_pred"] = predicted
    pred_table["True_labels"] = y_true

    output = {}
    output["prob_table"] = prob_table
    output["pred"] = pred_table

    accuracy = float(np.mean(predicted == y_true)) if n_test else float("nan")
    print("accuracy =", round(accuracy, 3))
    return (output)

In [10]:
# Reproducible random split: each row goes to the training set with probability 0.8.
np.random.seed(1)
mask = np.random.rand(len(credit)) < 0.8
train, test = (part.reset_index(drop=True) for part in (credit[mask], credit[~mask]))

# Columns 0-14 are the features, column 15 is the class label.
X_train, Y_train = train.iloc[:, :15], train[15]
X_test, Y_test = test.iloc[:, :15], test[15]


# Fit and evaluate, then show the first few predictions.
nb_result = Naive_Bayes(X_train, Y_train, X_test, Y_test)
nb_result['pred'].head()

accuracy = 0.82


Unnamed: 0,+,-,Final_pred,True_labels
0,0.211,0.789,-,+
1,1.0,0.0,+,+
2,0.003,0.997,-,+
3,0.553,0.447,+,+
4,0.211,0.789,-,+
