In [1]:
# Group 8
# 23BM6JP05: Akshay K
# 20CS10038: N Surya Prakash Reddy
# Project 1
# Mushroom Classification using Naive Bayes Algorithm

## Imports

In [2]:
import warnings
# Silence every warning for the rest of the notebook so cell outputs stay compact.
# NOTE(review): this blanket filter also hides library warnings that can point at
# real problems; consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

In [3]:
import math

import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder

# Class Implementation

In [4]:
class NaiveBayesClassifier:
  """Naive Bayes classifier for categorical features with additive smoothing.

  Attributes:
    output_probs: dict mapping each class label to its smoothed prior P(Y).
    conditional_probs: nested dict ``{column: {value: {label: P(X=value | Y=label)}}}``.
    alpha: additive smoothing constant (0 disables smoothing).
    train_x: the feature frame passed to the most recent ``fit`` call.
  """

  def __init__(self, alpha=1):
    """Create an unfitted classifier.

    Args:
      alpha: additive smoothing constant used for priors and conditionals.
    """
    self.output_probs = {}
    self.conditional_probs = {}
    self.alpha = alpha
    self.train_x = None

  def fit(self, train_x, train_y):
    """Estimate priors and per-feature conditionals, then print training metrics.

    Args:
      train_x: pandas DataFrame of categorical features, one row per sample.
      train_y: pandas Series of class labels, in the same row order as ``train_x``.

    Raises:
      ValueError: if ``train_x`` and ``train_y`` have different lengths.
    """
    rows = len(train_x)
    if len(train_y) != rows:
      raise ValueError('train_x and train_y must contain the same number of rows')

    self.train_x = train_x
    output_labels = train_y.unique()

    # Positional boolean masks, one per class. They are built from the train_y
    # argument (not from any notebook-level variable) and converted to arrays so
    # row selection never depends on index alignment.
    masks = {output: (train_y == output).to_numpy() for output in output_labels}
    class_counts = {output: int(mask.sum()) for output, mask in masks.items()}

    # Calculating output probabilities
    # using P(Y) = (Count(Y) + alpha) / (rows + alpha * number_of_classes)
    self.output_probs = {
      output: (class_counts[output] + self.alpha) / (rows + self.alpha * len(output_labels))
      for output in output_labels
    }

    # Calculating conditional probabilities for features given the output
    # using P(X|Y) = (Count(X and Y) + alpha) / (Count(Y) + alpha * Count(X.unique()))
    conditional_probs = {}
    for column in train_x.columns:
      feature = train_x[column]
      values = feature.unique()
      # One value_counts per class replaces a full-frame scan per (value, class) pair.
      counts = {output: feature[masks[output]].value_counts() for output in output_labels}
      denominators = {
        output: class_counts[output] + self.alpha * len(values)
        for output in output_labels
      }
      conditional_probs[column] = {
        value: {
          output: (int(counts[output].get(value, 0)) + self.alpha) / denominators[output]
          for output in output_labels
        }
        for value in values
      }
    self.conditional_probs = conditional_probs

    # prediction for the training data
    preds = self.predict(train_x)
    print('Performance on the Training Data: \n', classification_report(train_y, preds))
    print('Accuracy on the Training Data: ', accuracy_score(train_y, preds))

  @staticmethod
  def _log(prob):
    """Return log(prob), mapping a zero probability to negative infinity."""
    return math.log(prob) if prob > 0 else -math.inf

  def predict(self, test_x):
    """Predict a class label for every row of ``test_x``.

    Scores are accumulated in log space,
    log P(Y) + sum_i log P(X_i | Y), so that products over many features do not
    underflow to zero. Ties are resolved in favour of the label that appears
    first in ``output_probs``.

    Args:
      test_x: pandas DataFrame whose columns were all present during ``fit``.

    Returns:
      A list with one predicted label per row.

    Raises:
      KeyError: if a column, a feature value or a label has no fitted probability
        (for example a category that never occurred in the training data).
    """
    labels = list(self.output_probs)
    columns = list(test_x.columns)
    log_priors = [self._log(self.output_probs[label]) for label in labels]

    preds = []
    # Plain tuples avoid building a Series per row.
    for values in test_x.itertuples(index=False, name=None):
      scores = []
      for label, log_prior in zip(labels, log_priors):
        score = log_prior
        for column, value in zip(columns, values):
          try:
            prob = self.conditional_probs[column][value][label]
          except KeyError:
            raise KeyError(
              f'No fitted probability for feature {column!r}, value {value!r}, class {label!r}'
            ) from None
          score += self._log(prob)
        scores.append(score)

      # Return the label with the highest score (first one wins on ties)
      best = max(range(len(scores)), key=scores.__getitem__)
      preds.append(labels[best])
    return preds

  def check_zero_probs(self):
    """List every (column, value, output) whose fitted conditional probability is 0.

    Returns:
      A list of dicts with the keys 'column', 'value' and 'output', in the order
      the probabilities were stored during ``fit``.
    """
    return [
      {'column': column, 'value': value, 'output': output}
      for column, by_value in self.conditional_probs.items()
      for value, by_output in by_value.items()
      for output, prob in by_output.items()
      if prob == 0
    ]

# Reading the data from the csv file

Please change the path to the csv file as per your system.

If running on colab - '/content/mushrooms.csv'

In [5]:
# Location of the mushrooms CSV; adjust for your environment
# (for example '/content/mushrooms.csv' when running on Colab).
data_path = './mushrooms.csv'
mushrooms = pd.read_csv(data_path)

Features and labels

In [6]:
# Target column first, then every remaining column as a feature.
y = mushrooms['class']
X = mushrooms.drop(columns='class')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=45,
)

# Result Comparison

## Alpha = 0

In [7]:
# Fit the custom classifier without smoothing (alpha=0).
# fit() also prints the classification report and accuracy on the training data.
nb = NaiveBayesClassifier(alpha=0)
nb.fit(X_train, y_train)

Performance on the Training Data: 
               precision    recall  f1-score   support

           e       1.00      0.99      1.00      3355
           p       0.99      1.00      1.00      3144

    accuracy                           1.00      6499
   macro avg       1.00      1.00      1.00      6499
weighted avg       1.00      1.00      1.00      6499

Accuracy on the Training Data:  0.9970764733035852


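Checking whether the test data contains any (feature value, class) combination that never occurs in the training data, i.e. a combination whose conditional probability is zero when alpha = 0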

In [8]:
zero_probs = nb.check_zero_probs()

# Join the testing X and Y so rows can be filtered on a feature value and the class together
test = X_test.copy()
test['class'] = y_test

# For every (column, value, output) combination with zero training probability,
# report how many rows of the test set contain that same combination.
# The loop variable is called `entry` so the builtin `dict` is not shadowed
# for the rest of the kernel session.
for entry in zero_probs:
    column = entry['column']
    value = entry['value']
    output = entry['output']
    print(f'Column: {column}, Value: {value}, Output: {output}')

    # Print number of rows with the same column, value and output in the test set
    matches = (test[column] == value) & (test['class'] == output)
    print(f'Number of rows in the test set: {int(matches.sum())}\n')

Column: cap-shape, Value: s, Output: p
Number of rows in the test set: 0

Column: cap-shape, Value: c, Output: e
Number of rows in the test set: 0

Column: cap-surface, Value: g, Output: e
Number of rows in the test set: 0

Column: cap-color, Value: u, Output: p
Number of rows in the test set: 0

Column: cap-color, Value: r, Output: p
Number of rows in the test set: 0

Column: odor, Value: f, Output: e
Number of rows in the test set: 0

Column: odor, Value: s, Output: e
Number of rows in the test set: 0

Column: odor, Value: y, Output: e
Number of rows in the test set: 0

Column: odor, Value: p, Output: e
Number of rows in the test set: 0

Column: odor, Value: c, Output: e
Number of rows in the test set: 0

Column: odor, Value: l, Output: p
Number of rows in the test set: 0

Column: odor, Value: a, Output: p
Number of rows in the test set: 0

Column: odor, Value: m, Output: e
Number of rows in the test set: 0

Column: gill-color, Value: b, Output: e
Number of rows in the test set: 0

C

Our classifier

In [9]:
# Evaluate the currently fitted custom classifier (`nb`) on the held-out test split.
pred = nb.predict(X_test)
print(classification_report(y_test, pred))
print('Accuracy: ', accuracy_score(y_test, pred))

              precision    recall  f1-score   support

           e       1.00      1.00      1.00       853
           p       1.00      1.00      1.00       772

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Accuracy:  0.9981538461538462


Sklearn classifier

In [10]:
# Encode the categorical features as ordinal integers for CategoricalNB.
# The encoder is named `encoder` rather than `ord` so the builtin ord() is not shadowed.
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
# Map the class labels to integers: 'p' -> 1, everything else -> 0.
y_train_encoded = [1 if label == 'p' else 0 for label in y_train]

# Categorical Naive Bayes with alpha = 0
NB = CategoricalNB(alpha=0)
NB.fit(X_train_encoded, y_train_encoded)

# Reuse the encoder fitted on the training data, then map predictions back to 'p' / 'e'.
X_test_encoded = encoder.transform(X_test)
predictions = ['p' if encoded == 1 else 'e' for encoded in NB.predict(X_test_encoded)]
print(classification_report(y_test, predictions))
print('Accuracy: ', accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           e       1.00      1.00      1.00       853
           p       1.00      1.00      1.00       772

    accuracy                           1.00      1625
   macro avg       1.00      1.00      1.00      1625
weighted avg       1.00      1.00      1.00      1625

Accuracy:  0.9981538461538462


## Alpha = 0.1

In [11]:
# Refit the custom classifier with smoothing constant alpha=0.1.
# fit() also prints the classification report and accuracy on the training data.
nb = NaiveBayesClassifier(alpha=0.1)
nb.fit(X_train, y_train)

Performance on the Training Data: 
               precision    recall  f1-score   support

           e       0.97      0.99      0.98      3355
           p       0.99      0.97      0.98      3144

    accuracy                           0.98      6499
   macro avg       0.98      0.98      0.98      6499
weighted avg       0.98      0.98      0.98      6499

Accuracy on the Training Data:  0.98092014156024


In [12]:
# Evaluate the currently fitted custom classifier (`nb`) on the held-out test split.
preds = nb.predict(X_test)
print(classification_report(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

              precision    recall  f1-score   support

           e       0.97      1.00      0.99       853
           p       1.00      0.97      0.98       772

    accuracy                           0.98      1625
   macro avg       0.99      0.98      0.98      1625
weighted avg       0.98      0.98      0.98      1625

Accuracy:  0.9846153846153847


In [13]:
# Encode the categorical features as ordinal integers for CategoricalNB.
# The encoder is named `encoder` rather than `ord` so the builtin ord() is not shadowed.
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
# Map the class labels to integers: 'p' -> 1, everything else -> 0.
y_train_encoded = [1 if label == 'p' else 0 for label in y_train]

# Categorical Naive Bayes with alpha = 0.1
NB = CategoricalNB(alpha=0.1)
NB.fit(X_train_encoded, y_train_encoded)

# Reuse the encoder fitted on the training data, then map predictions back to 'p' / 'e'.
X_test_encoded = encoder.transform(X_test)
predictions = ['p' if encoded == 1 else 'e' for encoded in NB.predict(X_test_encoded)]
print(classification_report(y_test, predictions))
print('Accuracy: ', accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           e       0.97      1.00      0.99       853
           p       1.00      0.97      0.98       772

    accuracy                           0.98      1625
   macro avg       0.99      0.98      0.98      1625
weighted avg       0.98      0.98      0.98      1625

Accuracy:  0.9846153846153847


## Alpha = 0.5

In [14]:
# Refit the custom classifier with smoothing constant alpha=0.5.
# fit() also prints the classification report and accuracy on the training data.
nb = NaiveBayesClassifier(alpha=0.5)
nb.fit(X_train, y_train)

Performance on the Training Data: 
               precision    recall  f1-score   support

           e       0.93      0.99      0.96      3355
           p       0.99      0.92      0.96      3144

    accuracy                           0.96      6499
   macro avg       0.96      0.96      0.96      6499
weighted avg       0.96      0.96      0.96      6499

Accuracy on the Training Data:  0.9604554546853362


In [15]:
# Evaluate the currently fitted custom classifier (`nb`) on the held-out test split.
preds = nb.predict(X_test)
print(classification_report(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

              precision    recall  f1-score   support

           e       0.94      1.00      0.97       853
           p       1.00      0.92      0.96       772

    accuracy                           0.96      1625
   macro avg       0.97      0.96      0.96      1625
weighted avg       0.96      0.96      0.96      1625

Accuracy:  0.9624615384615385


In [16]:
# Encode the categorical features as ordinal integers for CategoricalNB.
# The encoder is named `encoder` rather than `ord` so the builtin ord() is not shadowed.
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
# Map the class labels to integers: 'p' -> 1, everything else -> 0.
y_train_encoded = [1 if label == 'p' else 0 for label in y_train]

# Categorical Naive Bayes with alpha = 0.5
NB = CategoricalNB(alpha=0.5)
NB.fit(X_train_encoded, y_train_encoded)

# Reuse the encoder fitted on the training data, then map predictions back to 'p' / 'e'.
X_test_encoded = encoder.transform(X_test)
predictions = ['p' if encoded == 1 else 'e' for encoded in NB.predict(X_test_encoded)]
print(classification_report(y_test, predictions))
print('Accuracy: ', accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           e       0.94      1.00      0.97       853
           p       1.00      0.92      0.96       772

    accuracy                           0.96      1625
   macro avg       0.97      0.96      0.96      1625
weighted avg       0.96      0.96      0.96      1625

Accuracy:  0.9624615384615385


## Alpha = 1

In [17]:
# Refit the custom classifier with smoothing constant alpha=1.
# fit() also prints the classification report and accuracy on the training data.
nb = NaiveBayesClassifier(alpha=1)
nb.fit(X_train, y_train)

Performance on the Training Data: 
               precision    recall  f1-score   support

           e       0.92      0.99      0.96      3355
           p       0.99      0.91      0.95      3144

    accuracy                           0.95      6499
   macro avg       0.96      0.95      0.95      6499
weighted avg       0.96      0.95      0.95      6499

Accuracy on the Training Data:  0.953839052161871


In [18]:
# Evaluate the currently fitted custom classifier (`nb`) on the held-out test split.
preds = nb.predict(X_test)
print(classification_report(y_test, preds))
print('Accuracy: ', accuracy_score(y_test, preds))

              precision    recall  f1-score   support

           e       0.92      1.00      0.96       853
           p       1.00      0.91      0.95       772

    accuracy                           0.95      1625
   macro avg       0.96      0.95      0.95      1625
weighted avg       0.96      0.95      0.95      1625

Accuracy:  0.9532307692307692


In [19]:
# Encode the categorical features as ordinal integers for CategoricalNB.
# The encoder is named `encoder` rather than `ord` so the builtin ord() is not shadowed.
encoder = OrdinalEncoder()
X_train_encoded = encoder.fit_transform(X_train)
# Map the class labels to integers: 'p' -> 1, everything else -> 0.
y_train_encoded = [1 if label == 'p' else 0 for label in y_train]

# Categorical Naive Bayes with alpha = 1
NB = CategoricalNB(alpha=1)
NB.fit(X_train_encoded, y_train_encoded)

# Reuse the encoder fitted on the training data, then map predictions back to 'p' / 'e'.
X_test_encoded = encoder.transform(X_test)
predictions = ['p' if encoded == 1 else 'e' for encoded in NB.predict(X_test_encoded)]
print(classification_report(y_test, predictions))
print('Accuracy: ', accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           e       0.92      1.00      0.96       853
           p       1.00      0.91      0.95       772

    accuracy                           0.95      1625
   macro avg       0.96      0.95      0.95      1625
weighted avg       0.96      0.95      0.95      1625

Accuracy:  0.9532307692307692
