In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the mushroom dataset from a CSV file located in the working directory.
data = pd.read_csv('mushrooms.csv')

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# encoding_list keeps the fitted LabelEncoder of every column, e.g.
# {
#     'class': LabelEncoder(),
#     'cap-shape': LabelEncoder(),
#     ... one entry per remaining column
# }
encoding_list = {}

# mappers records the integer code assigned to each category, e.g.
# {
#     'class': {'e': 0, 'p': 1},
#     'cap-shape': {'b': 0, 'c': 1, 'f': 2, 'k': 3, 's': 4, 'x': 5},
#     ... one entry per remaining column
# }
mappers = {}

# Every column (target included) is replaced in `data` by its integer codes.
for column in data.columns:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    codes = encoder.transform(encoder.classes_)
    encoding_list[column] = encoder
    mappers[column] = dict(zip(encoder.classes_, codes))

In [5]:
# Separate the encoded target column from the remaining feature columns.
y = data['class']
X = data.drop(columns='class')

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Quick check of how many target labels are available.
y.shape

(8124,)

In [8]:
def calculate_prior(y):
    """Return the log prior of each class found in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels, one per sample.

    Returns
    -------
    dict
        Maps each distinct label (as returned by ``np.unique``) to
        ``log(count_of_label / len(y))``. Empty input yields an empty dict.
    """
    n_samples = len(y)
    labels, label_counts = np.unique(y, return_counts=True)

    # Stored in log space so they can be added to log-likelihoods later.
    return {
        label: np.log(n_label / n_samples)
        for label, n_label in zip(labels, label_counts)
    }

In [9]:
# Sanity check: log priors computed over the full target vector `y`
# (not only the training split).
print(calculate_prior(y))

{np.int64(0): np.float64(-0.6578351682827296), np.int64(1): np.float64(-0.7297519190498744)}


In [10]:
def calculate_likelihoods(X, y, alpha=1):
    """Estimate smoothed per-class log-likelihoods of categorical features.

    For every class ``c``, feature index ``i`` and value ``v`` observed for
    that feature anywhere in ``X``, the estimate is::

        log((count(v | c, i) + alpha) / (N_c + alpha * K_i))

    where ``N_c`` is the number of samples of class ``c`` and ``K_i`` is the
    number of distinct values feature ``i`` takes in ``X``.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix; it is converted with ``np.array`` and indexed by
        column, so it must have two dimensions (samples x features).
    y : array-like
        Class label of each row of ``X``.
    alpha : int or float, default 1
        Additive smoothing constant. The default reproduces Laplace (add-one)
        smoothing. ``alpha=0`` disables smoothing and yields ``-inf`` for
        values never seen with a class.

    Returns
    -------
    likelihoods : dict
        ``likelihoods[c][i][v]`` is the log-likelihood described above.
    feature_unique_values : dict
        Maps each feature index to its number of distinct values ``K_i``.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    X = np.array(X)
    y = np.array(y)

    classes = np.unique(y)
    n_features = X.shape[1]

    # The distinct values of each feature do not depend on the class, so they
    # are computed once here instead of once per (class, feature) pair.
    feature_values = [np.unique(X[:, i]) for i in range(n_features)]
    feature_unique_values = {i: len(vals) for i, vals in enumerate(feature_values)}

    likelihoods = {}
    for c in classes:
        X_c = X[y == c]
        N_c = len(X_c)
        likelihoods[c] = {}

        for i, all_values in enumerate(feature_values):
            K = feature_unique_values[i]  # total possible values of feature i
            values, counts = np.unique(X_c[:, i], return_counts=True)

            # Dict lookup replaces the linear membership test / list index
            # search that was previously done for every value.
            class_counts = dict(zip(values, counts))
            denominator = N_c + alpha * K

            # Values absent from this class get a count of 0 and therefore
            # only the smoothing mass.
            likelihoods[c][i] = {
                v: np.log((class_counts.get(v, 0) + alpha) / denominator)
                for v in all_values
            }

    return likelihoods, feature_unique_values

In [11]:
# Sanity check: prints the (likelihoods, feature_unique_values) tuple computed
# on the full X, y rather than on the training split.
print(calculate_likelihoods(X, y))

({np.int64(0): {0: {np.int64(0): np.float64(-2.342280527257595), np.int64(1): np.float64(-8.346167594364134), np.int64(2): np.float64(-0.9702854461491219), np.int64(3): np.float64(-2.9124455908098947), np.int64(4): np.float64(-4.8496600328976545), np.int64(5): np.float64(-0.7710958948565736)}, 1: {np.int64(0): np.float64(-0.9926109527384341), np.int64(1): np.float64(-8.345692873253865), np.int64(2): np.float64(-1.3025329572655262), np.int64(3): np.float64(-1.0291446960708899)}, 2: {np.int64(0): np.float64(-4.4552960629280935), np.int64(1): np.float64(-4.85060879957224), np.int64(2): np.float64(-1.9093647113023184), np.int64(3): np.float64(-1.4068938919190812), np.int64(4): np.float64(-1.2042889598770994), np.int64(5): np.float64(-4.304065093204169), np.int64(6): np.float64(-5.513903016982503), np.int64(7): np.float64(-5.513903016982503), np.int64(8): np.float64(-1.7664772237537707), np.int64(9): np.float64(-2.3531549337321507)}, 3: {np.int64(0): np.float64(-1.0610831204812223), np.int6

In [12]:
def predict(X_test, priors, likelihoods, skip_unseen=False):
    """Predict the most probable class for every row of ``X_test``.

    The score of a class is its log prior plus the sum of the log-likelihoods
    of the row's feature values; the class with the highest score wins (on a
    tie, the class that comes first in ``priors``).

    Parameters
    ----------
    X_test : array-like, 2-D
        Rows to classify; converted with ``np.array`` and iterated row by row.
    priors : dict
        Maps each class to its log prior.
    likelihoods : dict
        ``likelihoods[c][i][v]`` is the log-likelihood of value ``v`` for
        feature index ``i`` under class ``c``.
    skip_unseen : bool, default False
        What to do when a feature value has no entry in ``likelihoods``.
        ``False`` raises ``KeyError`` (as a plain dictionary lookup would),
        ``True`` leaves that feature out of the score for that class.

    Returns
    -------
    numpy.ndarray
        One predicted class per input row.

    Raises
    ------
    KeyError
        If a feature value is missing from ``likelihoods`` and
        ``skip_unseen`` is false.
    """
    X_test = np.array(X_test)
    predictions = []

    for sample in X_test:
        class_scores = {}

        for c in priors:
            log_prob = priors[c]
            class_likelihoods = likelihoods[c]

            for i, value in enumerate(sample):
                value_log_probs = class_likelihoods[i]
                if value in value_log_probs:
                    log_prob += value_log_probs[value]
                elif not skip_unseen:
                    # Name the offending feature instead of surfacing a bare
                    # KeyError that only shows the value.
                    raise KeyError(
                        f"feature {i} has value {value!r} that was not seen "
                        f"during training for class {c!r}"
                    )

            class_scores[c] = log_prob

        predictions.append(max(class_scores, key=class_scores.get))

    return np.array(predictions)

In [13]:
# Fit on the training split only: smoothed log-likelihood tables and log priors.
likelihoods, feature_unique_values = calculate_likelihoods(X_train, y_train)
priors = calculate_prior(y_train)

# Classify the held-out rows with the fitted tables.
y_pred = predict(X_test, priors, likelihoods)

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Evaluate the hand-written classifier on the held-out split.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred))

Accuracy: 0.9507692307692308
Precision: 0.9915966386554622
Recall: 0.9053708439897699
F1 Score: 0.946524064171123
Confusion Matrix:
 [[837   6]
 [ 74 708]]


In [15]:
from sklearn.naive_bayes import CategoricalNB

# Reference run: scikit-learn's CategoricalNB fitted on the same training split,
# scored with the same metrics for a side-by-side comparison.
model = CategoricalNB().fit(X_train, y_train)
y_pred_sklearn = model.predict(X_test)

print("\nSKLEARN RESULTS")
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
    ("Confusion Matrix:\n", confusion_matrix),
):
    print(label, metric(y_test, y_pred_sklearn))


SKLEARN RESULTS
Accuracy: 0.9507692307692308
Precision: 0.9915966386554622
Recall: 0.9053708439897699
F1 Score: 0.946524064171123
Confusion Matrix:
 [[837   6]
 [ 74 708]]


In [16]:
def predict_from_csv(input_file, priors, likelihoods, encoding_list, feature_columns, output_file=None):
    """Classify the rows of a CSV file and optionally save the predictions.

    Parameters
    ----------
    input_file : str or file-like
        Passed to ``pd.read_csv``; must contain every column named in
        ``feature_columns``.
    priors : dict
        Log priors per class, forwarded to ``predict``.
    likelihoods : dict
        Log-likelihood tables, forwarded to ``predict``.
    encoding_list : dict
        Maps column names to the encoders fitted during training. Each
        feature column is passed through its encoder's ``transform``; the
        ``'class'`` entry converts predictions back via ``inverse_transform``.
    feature_columns : iterable of str
        Feature column names, in the order the model was trained on.
    output_file : str, optional
        When truthy, predictions are written there as a one-column CSV named
        ``prediction`` (without the index).

    Returns
    -------
    array-like
        Predicted class labels in their original (decoded) form.

    Raises
    ------
    KeyError
        If the input file lacks any of ``feature_columns``.
    """
    # Load file
    data = pd.read_csv(input_file)

    # Fail early and clearly instead of skipping the encoding of absent
    # columns and failing later at column selection.
    missing = [col for col in feature_columns if col not in data.columns]
    if missing:
        raise KeyError(
            f"{input_file!r} is missing required feature column(s): {missing}"
        )

    # Keep only the feature columns, in training order; work on a copy so the
    # frame that was read stays untouched.
    X_input = data[list(feature_columns)].copy()

    # Encode using the SAME encoders that were fitted during training
    for col in feature_columns:
        X_input[col] = encoding_list[col].transform(X_input[col])

    # Predict
    y_pred = predict(X_input, priors, likelihoods)

    # Convert predictions back to the original class labels
    class_encoder = encoding_list['class']
    y_pred_labels = class_encoder.inverse_transform(y_pred)

    # Save output if required
    if output_file:
        output_df = pd.DataFrame({"prediction": y_pred_labels})
        output_df.to_csv(output_file, index=False)

    return y_pred_labels

In [18]:
# Reuse the training column order so the input file is read feature for feature.
feature_columns = X_train.columns

predictions = predict_from_csv(
    input_file="mushroom_input.csv",
    priors=priors,
    likelihoods=likelihoods,
    encoding_list=encoding_list,
    feature_columns=feature_columns,
    output_file="mushroom_output.csv",
)

print(predictions)

['e' 'e' 'p' 'e' 'p' 'p' 'p' 'e' 'p' 'e']
