In [1]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
class CustomMultinomialNB:
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Smoothing parameter
        self.class_probs = None
        self.word_probs = None
        self.classes = None
        self.vocab = None

    def _calculate_class_probs(self, y):
        class_probs = {}
        total_samples = len(y)

        for c in self.classes:
            class_count = np.sum(y == c)
            class_probs[c] = (class_count + self.alpha) / (total_samples + len(self.classes) * self.alpha)

        return class_probs

    def _calculate_word_probs(self, X, y):
      word_probs = {}

      for c in self.classes:
          class_mask = (y == c)
          class_word_count = np.sum(X[class_mask, :], axis=0)
          total_words_in_class = np.sum(X[class_mask, :]) + len(self.vocab) * self.alpha

          # Laplace smoothing for sparse matrices
          word_probs[c] = np.log((class_word_count + self.alpha) / total_words_in_class)

      return word_probs


    def fit(self, X, y):
        self.classes = np.unique(y)
        self.vocab = np.unique(X)  # Use unique words in all texts as vocabulary

        self.class_probs = self._calculate_class_probs(y)
        self.word_probs = self._calculate_word_probs(X, y)

    def _predict_instance(self, x):
      scores = {}

      for c in self.classes:
          class_score = np.sum(np.asarray(x).ravel() * np.asarray(self.word_probs[c]).ravel()) + np.log(self.class_probs[c])
          scores[c] = class_score

      return max(scores, key=scores.get)


    def predict(self, X):
        predictions = []

        for i in range(X.shape[0]):
            instance = X[i, :]
            predicted_class = self._predict_instance(instance)
            predictions.append(predicted_class)

        return np.array(predictions)

In [3]:
data = fetch_20newsgroups()
categories = data.target_names

In [4]:
# Divide the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2, random_state=42)

In [5]:
# Convert text data to a bag-of-words representation
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [6]:
# Create and train the custom Multinomial Naive Bayes classifier
custom_classifier = CustomMultinomialNB()
# Create and train the custom Multinomial Naive Bayes classifier
custom_classifier = CustomMultinomialNB()
try:
    custom_classifier.fit(X_train_bow, y_train)
except Exception as e:
    print("Error during training:", e)

In [7]:
# Predict labels for test data
predicted_labels = custom_classifier.predict(X_test_bow.toarray())

In [8]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, predicted_labels)

print(f"Accuracy: {accuracy:.2%}")

Accuracy: 85.15%


In [10]:
# Print actual and predicted categories for all texts
for i, (actual_category, predicted_category) in enumerate(zip(y_test, predicted_labels)):
    actual_category_name = data.target_names[actual_category]
    predicted_category_name = data.target_names[predicted_category]

    print(f"Text {i+1}: Actual - {actual_category_name}, Predicted - {predicted_category_name}")


Text 1: Actual - comp.sys.mac.hardware, Predicted - comp.sys.mac.hardware
Text 2: Actual - comp.os.ms-windows.misc, Predicted - comp.sys.ibm.pc.hardware
Text 3: Actual - misc.forsale, Predicted - misc.forsale
Text 4: Actual - talk.politics.guns, Predicted - talk.politics.guns
Text 5: Actual - rec.sport.hockey, Predicted - rec.sport.hockey
Text 6: Actual - comp.sys.mac.hardware, Predicted - comp.sys.mac.hardware
Text 7: Actual - talk.politics.misc, Predicted - talk.politics.misc
Text 8: Actual - comp.windows.x, Predicted - comp.windows.x
Text 9: Actual - sci.space, Predicted - sci.space
Text 10: Actual - talk.politics.misc, Predicted - talk.politics.misc
Text 11: Actual - rec.autos, Predicted - rec.autos
Text 12: Actual - misc.forsale, Predicted - misc.forsale
Text 13: Actual - soc.religion.christian, Predicted - alt.atheism
Text 14: Actual - comp.sys.ibm.pc.hardware, Predicted - comp.graphics
Text 15: Actual - sci.crypt, Predicted - sci.crypt
Text 16: Actual - rec.sport.hockey, Predict