## Read and understand the data

In [4]:
import pandas as pd

# Location of the dataset, relative to the notebook's working directory.
DATA_PATH = 'dataset/data.csv'

# Load the CSV into a DataFrame.
data = pd.read_csv(DATA_PATH)

In [5]:
# Display the loaded frame as the cell output; pandas abbreviates long/wide
# frames, which is why the output below contains "..." placeholders.
data

Unnamed: 0,cap-shape=b,cap-shape=c,cap-shape=f,cap-shape=k,cap-shape=s,cap-shape=x,cap-surface=f,cap-surface=g,cap-surface=s,cap-surface=y,...,population=v,population=y,habitat=d,habitat=g,habitat=l,habitat=m,habitat=p,habitat=u,habitat=w,class
0,F,F,F,F,F,T,F,F,T,F,...,F,F,F,F,F,F,F,T,F,B
1,F,F,F,F,F,T,F,F,T,F,...,F,F,F,T,F,F,F,F,F,A
2,T,F,F,F,F,F,F,F,T,F,...,F,F,F,F,F,T,F,F,F,A
3,F,F,F,F,F,T,F,F,F,T,...,F,F,F,F,F,F,F,T,F,B
4,F,F,F,F,F,T,F,F,T,F,...,F,F,F,T,F,F,F,F,F,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,F,F,F,T,F,F,F,F,T,F,...,F,F,F,F,T,F,F,F,F,A
8120,F,F,F,F,F,T,F,F,T,F,...,T,F,F,F,T,F,F,F,F,A
8121,F,F,T,F,F,F,F,F,T,F,...,F,F,F,F,T,F,F,F,F,A
8122,F,F,F,T,F,F,F,F,F,T,...,T,F,F,F,T,F,F,F,F,B


## scikit-learn's naive Bayes estimators require numeric features, so the T/F string columns in X are encoded as 0/1

In [6]:
from sklearn.preprocessing import LabelEncoder

# Columns that must not be used as features: the target itself, plus a stray
# row-index column if one was written into the CSV.
# NOTE(review): the data preview shows a leading 'Unnamed: 0' header. If that is
# a real column (a saved row index) it says nothing about the sample and would
# leak row order into the model; if it is only a display artifact, the guard
# below makes this a no-op. Confirm against data.columns.
non_feature_columns = ['class'] + [
    name for name in ('Unnamed: 0',) if name in data.columns
]

X = data.drop(columns=non_feature_columns)  # feature matrix (new frame, `data` untouched)
y = data['class']                           # target labels

# LabelEncoder assigns integer codes in sorted label order, so a two-valued
# column becomes 0/1. It is refit on every column it is applied to.
label_encoder = LabelEncoder()
for column in X.columns:
    # Text columns are reported as dtype 'object' by older pandas and as a
    # dedicated string dtype by newer pandas; accept both so the encoding is
    # not silently skipped after a pandas upgrade.
    is_text = (
        pd.api.types.is_object_dtype(X[column])
        or pd.api.types.is_string_dtype(X[column])
    )
    if is_text and len(X[column].unique()) == 2:
        X[column] = label_encoder.fit_transform(X[column])

## Use the Naive Bayes algorithm for binary classification. Use stratified 10-fold cross-validation to measure the performance of the algorithm.
## 10-fold means that in each iteration of cross-validation the model is trained on 9*(dataset_size/10) samples and the remaining samples are used for testing

## To compute the performance metrics, I count TP, TN, FP and FN in each stratified k-fold iteration

In [17]:
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB


N_SPLITS = 10          # number of cross-validation folds (10-fold)
POSITIVE_LABEL = 'A'   # class counted as "positive" in TP/FP/FN
NEGATIVE_LABEL = 'B'   # class counted as "negative" in TN/FP/FN


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    A fold can end up with no predicted positives (tp + fp == 0) or with
    precision + recall == 0; the affected metric is then reported as 0.0
    instead of raising ZeroDivisionError or propagating nan into the averages.
    """
    return numerator / denominator if denominator else 0.0


nb_classifier = GaussianNB()

# Stratified k-fold split; shuffling with a fixed seed keeps it reproducible.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Running sums of the per-fold metrics; divided by the fold count after the loop.
total_acc, total_prec, total_rec, total_tnrate, total_f1 = 0, 0, 0, 0, 0

# Iterate over each fold
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train on the training part of the fold, predict on the held-out part.
    nb_classifier.fit(X_train, y_train)
    y_pred = nb_classifier.predict(X_test)

    # Boolean masks over the test fold, compared position by position
    # (y_test is converted to an array so no index alignment is involved).
    y_true = y_test.to_numpy()
    predicted_pos = y_pred == POSITIVE_LABEL
    predicted_neg = y_pred == NEGATIVE_LABEL
    actual_pos = y_true == POSITIVE_LABEL
    actual_neg = y_true == NEGATIVE_LABEL

    # Confusion-matrix counts; int() yields plain Python integers.
    tp = int((predicted_pos & actual_pos).sum())  # predicted positive, actually positive
    tn = int((predicted_neg & actual_neg).sum())  # predicted negative, actually negative
    fp = int((predicted_pos & actual_neg).sum())  # predicted positive, actually negative
    fn = int((predicted_neg & actual_pos).sum())  # predicted negative, actually positive

    print(tp,tn,fp,fn)

    # Per-fold evaluation metrics, each guarded against an empty denominator.
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    tn_rate = _safe_ratio(tn, tn + fp)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)

    # Accumulate so the metrics can be averaged over all folds.
    total_acc = accuracy + total_acc
    total_prec = precision + total_prec
    total_rec = recall + total_rec
    total_tnrate = tn_rate + total_tnrate
    total_f1 = f1 + total_f1

# Calculate average metrics over the folds
avg_acc = total_acc / N_SPLITS
avg_prec = total_prec / N_SPLITS
avg_rec = total_rec / N_SPLITS
avg_tnrate = total_tnrate / N_SPLITS
avg_f1 = total_f1 / N_SPLITS

print(avg_acc,avg_prec,avg_rec,avg_tnrate,avg_f1)

376 392 0 45
369 392 0 52
370 392 0 51
382 392 0 39
376 392 0 44
371 392 0 49
383 389 2 38
375 391 0 46
376 391 0 45
377 390 1 44
