In [None]:
!pip install datasets transformers scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from datasets import load_dataset
from scipy.sparse import csr_matrix

# Load the 'go_emotions' dataset via `datasets.load_dataset`; the code below
# reads its 'train', 'validation' and 'test' splits.
dataset = load_dataset('go_emotions')

# Filter data with only one label
def has_single_label(example):
    """Return True when the example's 'labels' entry holds exactly one label."""
    label_ids = example['labels']
    return len(label_ids) == 1

# Keep only the single-label examples of every split (train, validation, test).
train_dataset, validation_dataset, test_dataset = (
    dataset[split_name].filter(has_single_label)
    for split_name in ('train', 'validation', 'test')
)

# Label names of the training split; indexed by numeric label id below.
label_mapping = dataset['train'].features['labels'].feature.names

def map_label(example):
    """Annotate an example with its first label.

    Reads the first entry of ``example['labels']``, stores the matching name
    from the module-level ``label_mapping`` under ``'label_name'`` and the id
    itself under ``'label'``. The example is modified in place and returned.
    """
    first_label = example['labels'][0]
    example['label_name'] = label_mapping[first_label]
    example['label'] = first_label
    return example

# Add the 'label' / 'label_name' columns to every split.
train_dataset, validation_dataset, test_dataset = (
    split.map(map_label)
    for split in (train_dataset, validation_dataset, test_dataset)
)

# Convert each split to a pandas DataFrame.
train_df, validation_df, test_df = (
    split.to_pandas()
    for split in (train_dataset, validation_dataset, test_dataset)
)

# Bag-of-words features: the vocabulary is learned from the training text only
# and then reused unchanged for the validation and test text.
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(train_df['text'])
X_validation_counts, X_test_counts = (
    vectorizer.transform(frame['text'])
    for frame in (validation_df, test_df)
)

# Label arrays, one per split.
Y_train, Y_validation, Y_test = (
    frame['label'].to_numpy()
    for frame in (train_df, validation_df, test_df)
)

# Naive Bayes Classifier
class NaiveBayesClassifier:
    """Multinomial Naive Bayes classifier over count features.

    After ``fit`` the instance exposes:

    * ``classes`` -- sorted array of the distinct labels seen in training;
    * ``class_priors`` -- array of P(class), aligned with ``classes``;
    * ``conditional_probs`` -- dict mapping each class to an array of
      smoothed P(feature | class) values, one entry per feature column.
    """

    def __init__(self):
        self.class_priors = None
        self.conditional_probs = None
        self.classes = None

    def fit(self, X_counts, Y, alpha=1.0):
        """Estimate class priors and smoothed per-class feature probabilities.

        Args:
            X_counts: (n_samples, n_features) count matrix; anything accepted
                by ``scipy.sparse.csr_matrix`` (it is converted if needed).
            Y: sequence of n_samples class labels.
            alpha: additive smoothing strength; must be non-negative.

        Raises:
            ValueError: if ``alpha`` is negative or ``X_counts`` and ``Y``
                disagree on the number of samples.
        """
        if alpha < 0:
            raise ValueError(f"alpha must be non-negative, got {alpha}")
        if not isinstance(X_counts, csr_matrix):
            X_counts = csr_matrix(X_counts)

        # Normalise labels to an array so the `Y == cls` masks below are
        # element-wise regardless of the container type that was passed in.
        Y = np.asarray(Y)
        total_samples = len(Y)
        if X_counts.shape[0] != total_samples:
            raise ValueError(
                f"X_counts has {X_counts.shape[0]} rows but Y has "
                f"{total_samples} labels"
            )

        vocab_size = X_counts.shape[1]
        self.classes = np.unique(Y)
        num_classes = len(self.classes)

        class_counts = np.zeros(num_classes, dtype=np.float64)
        word_counts_per_class = np.zeros((num_classes, vocab_size), dtype=np.float64)

        for idx, cls in enumerate(self.classes):
            cls_mask = (Y == cls)
            class_counts[idx] = np.sum(cls_mask)
            # Column sums over the rows of this class -> per-feature counts.
            word_counts_per_class[idx, :] = np.asarray(
                X_counts[cls_mask].sum(axis=0)
            ).ravel()

        self.class_priors = class_counts / total_samples

        # Additive smoothing: (count + alpha) / (class total + alpha * |V|).
        total_word_counts_per_class = word_counts_per_class.sum(axis=1, keepdims=True)
        smoothed = (word_counts_per_class + alpha) / (
            total_word_counts_per_class + alpha * vocab_size
        )
        self.conditional_probs = {
            cls: smoothed[idx] for idx, cls in enumerate(self.classes)
        }

    def predict(self, X_counts):
        """Return the most probable class for every row of ``X_counts``.

        Args:
            X_counts: (n_samples, n_features) count matrix using the same
                feature columns as the matrix passed to ``fit``.

        Returns:
            A list with one predicted class label per row. Ties go to the
            class that comes first in ``self.classes``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if (
            self.classes is None
            or self.class_priors is None
            or self.conditional_probs is None
        ):
            raise RuntimeError(
                "NaiveBayesClassifier.predict() called before fit()"
            )
        if not isinstance(X_counts, csr_matrix):
            X_counts = csr_matrix(X_counts)

        log_class_priors = np.log(self.class_priors)
        # Rows follow the order of self.classes: shape (n_classes, n_features).
        log_conditional_probs = np.log(
            np.vstack([self.conditional_probs[cls] for cls in self.classes])
        )

        # One sparse-dense product gives every sample/class log-likelihood,
        # i.e. sum over features of count * log P(feature | class); adding the
        # log priors yields the unnormalised log posteriors.
        log_probs = np.asarray(X_counts @ log_conditional_probs.T) + log_class_priors
        best = np.argmax(log_probs, axis=1)
        return list(self.classes[best])

    def evaluate_acc(self, Y_true, Y_pred):
        """Return the fraction of positions where ``Y_pred`` equals ``Y_true``."""
        correct = np.sum(np.array(Y_true) == np.array(Y_pred))
        accuracy = correct / len(Y_true)
        return accuracy

# Train the classifier with the default smoothing (alpha=1.0) as a baseline
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train_counts, Y_train, alpha=1.0)

# Predict and evaluate on the test split
Y_pred = nb_classifier.predict(X_test_counts)
accuracy = nb_classifier.evaluate_acc(Y_test, Y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

# Classification Report
# `labels` pins the report rows to the classes seen in training so each row
# lines up with its entry in `label_names` even when a class is absent from
# the test labels and predictions. `zero_division=0` reports 0 for classes
# that are never predicted instead of emitting a warning per metric.
label_names = [label_mapping[i] for i in nb_classifier.classes]
print(classification_report(
    Y_test,
    Y_pred,
    labels=nb_classifier.classes,
    target_names=label_names,
    zero_division=0,
))

# Hyperparameter tuning: pick the smoothing strength on the validation split
alphas = [0.01, 0.05, 0.1, 0.5, 1.0, 2.0]
best_alpha = None
# Start below any reachable accuracy so the first candidate is always taken
# and `best_alpha` can never stay None when `alphas` is non-empty.
best_accuracy = float('-inf')

for alpha in alphas:
    nb_classifier.fit(X_train_counts, Y_train, alpha=alpha)
    Y_val_pred = nb_classifier.predict(X_validation_counts)
    val_accuracy = nb_classifier.evaluate_acc(Y_validation, Y_val_pred)
    print(f"Alpha: {alpha}, Validation Accuracy: {val_accuracy:.4f}")
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_alpha = alpha

print(f"Best Alpha: {best_alpha}, Best Validation Accuracy: {best_accuracy:.4f}")

# Retrain with best alpha and report final test performance
nb_classifier.fit(X_train_counts, Y_train, alpha=best_alpha)
Y_test_pred = nb_classifier.predict(X_test_counts)
test_accuracy = nb_classifier.evaluate_acc(Y_test, Y_test_pred)
print(f"Test Accuracy with Best Alpha: {test_accuracy:.4f}")

print(classification_report(
    Y_test,
    Y_test_pred,
    labels=nb_classifier.classes,
    target_names=label_names,
    zero_division=0,
))


Test Accuracy: 0.4214
                precision    recall  f1-score   support

    admiration       0.64      0.35      0.46       348
     amusement       0.82      0.15      0.25       186
         anger       0.50      0.02      0.04       131
     annoyance       0.31      0.02      0.04       194
      approval       0.43      0.04      0.07       236
        caring       0.75      0.03      0.07        86
     confusion       0.00      0.00      0.00        97
     curiosity       0.83      0.03      0.05       176
        desire       0.00      0.00      0.00        56
disappointment       0.00      0.00      0.00        88
   disapproval       0.00      0.00      0.00       195
       disgust       1.00      0.04      0.08        76
 embarrassment       0.00      0.00      0.00        23
    excitement       0.00      0.00      0.00        57
          fear       1.00      0.02      0.03        65
     gratitude       0.89      0.60      0.72       260
         grief       0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Alpha: 0.01, Validation Accuracy: 0.4189
Alpha: 0.05, Validation Accuracy: 0.4483
Alpha: 0.1, Validation Accuracy: 0.4606
Alpha: 0.5, Validation Accuracy: 0.4573
Alpha: 1.0, Validation Accuracy: 0.4316
Alpha: 2.0, Validation Accuracy: 0.4022
Best Alpha: 0.1, Best Validation Accuracy: 0.4606
Test Accuracy with Best Alpha: 0.4523
                precision    recall  f1-score   support

    admiration       0.51      0.54      0.52       348
     amusement       0.59      0.49      0.53       186
         anger       0.33      0.18      0.24       131
     annoyance       0.23      0.18      0.20       194
      approval       0.23      0.16      0.19       236
        caring       0.24      0.20      0.22        86
     confusion       0.30      0.14      0.20        97
     curiosity       0.25      0.15      0.19       176
        desire       0.28      0.09      0.14        56
disappointment       0.28      0.10      0.15        88
   disapproval       0.19      0.10      0.13       1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
