In [49]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import defaultdict
import string
import math
import pickle

from sklearn.model_selection import train_test_split

class NaiveBayes:
    def __init__(self, alpha=1.0):
        self.class_probabilities = {}
        self.word_probabilities = defaultdict(dict)
        self.classes = []
        self.alpha = alpha

    def fit(self, X_train, y_train):
        total_samples = len(y_train)
        self.classes = np.unique(y_train)

        for class_ in self.classes:
            class_indices = np.where(y_train == class_)[0]
            class_texts = [X_train[i] for i in class_indices]
            class_words = ' '.join(class_texts).split()
            word_counts = defaultdict(float)
            total_words = len(class_words)

            for word in class_words:
                word_counts[word] += 1

            self.word_probabilities[class_] = {word: (count + self.alpha) / (total_words + self.alpha * (len(word_counts) + 1)) for word, count in word_counts.items()}

            self.class_probabilities[class_] = len(class_indices) / total_samples

    def predict(self, X_test):
        predictions = []
        for text in X_test:
            best_class = None
            max_prob = float('-inf')
            for class_ in self.classes:
                prob = math.log(self.class_probabilities[class_])
                for word in text.split():
                    prob += math.log(self.word_probabilities[class_].get(word, self.alpha / (len(self.word_probabilities[class_]) + 1)))  # Laplace smoothing
                if prob > max_prob:
                    max_prob = prob
                    best_class = class_
            predictions.append(best_class)
        return predictions

#read data
spam_df = pd.read_csv('../spam.csv', encoding='latin-1')
spam_df = spam_df.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
spam_df['label'] = spam_df['label'].map({'ham': 0, 'spam': 1})
spam_df = spam_df.drop_duplicates(keep='first')

ps = PorterStemmer()

def transform_text(text):
    text = text.lower() #convert to lowercase
    text = nltk.word_tokenize(text) #tokenize the text
    y = []

    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation: #remove stopwords and punctuation
            y.append(ps.stem(i)) #stem the words
    
    return " ".join(y) 

spam_df['transform_message'] = spam_df['message'].apply(transform_text)

#split data
X = spam_df['transform_message'].values
y = spam_df['label'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

#train model
model = NaiveBayes(alpha=0.1)
model.fit(X_train, y_train)

#predict
y_pred = model.predict(X_test)

#print accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

#save model
with open('model_naive_bayes.pkl', 'wb') as file:
    pickle.dump(model, file)


              precision    recall  f1-score   support

           0       0.99      0.97      0.98       896
           1       0.84      0.95      0.89       138

    accuracy                           0.97      1034
   macro avg       0.92      0.96      0.94      1034
weighted avg       0.97      0.97      0.97      1034

[[871  25]
 [  7 131]]
0.9690522243713733


#test model 
with open('model_naive_bayes_improved.pkl', 'rb') as file:
    model = pickle.load(file)
    
message = [
    "This is a test message for spam detection.",
    "Buy one get one free! Limited time offer!",
    "Hey, how's it going? Are you free this weekend?",
    "Congratulations! You've won a free vacation!",
    "Reminder: Your appointment is tomorrow at 2 PM.",
]
message = [transform_text(m) for m in message]
print(model.predict(message))

In [61]:
test = [0, 1]
import numpy as np

classes = np.unique(test)
for class_ in classes:
        class_indices = np.where(y_train == class_)[0]
        class_texts = [X_train[i] for i in class_indices]

        print(class_indices)

[   0    1    2 ... 4132 4133 4134]
[   8   33   35   50   54   71   72   83   90   99  123  125  133  135
  153  168  174  200  203  214  224  231  235  238  249  268  277  297
  319  330  337  338  340  342  349  355  364  366  376  407  409  413
  446  449  458  464  466  468  481  493  504  510  511  517  522  525
  539  542  546  549  559  564  566  567  574  584  610  611  615  620
  635  644  648  655  660  662  672  679  697  698  717  733  735  751
  760  779  785  798  799  811  824  843  856  873  889  891  895  916
  917  919  923  930  950  953  956  958  966  974  982  987  989  998
 1000 1007 1013 1025 1043 1048 1065 1077 1078 1090 1108 1111 1114 1128
 1141 1147 1148 1156 1183 1184 1191 1193 1205 1230 1239 1246 1254 1260
 1263 1276 1279 1280 1295 1297 1301 1316 1320 1325 1326 1331 1333 1340
 1342 1343 1356 1360 1367 1390 1404 1406 1408 1411 1416 1428 1434 1435
 1466 1480 1487 1505 1507 1508 1527 1529 1541 1556 1574 1577 1584 1590
 1593 1629 1640 1648 1650 1658 1663 1674 