In [21]:
import pandas as pd

# Read the spam dataset (decoded as ISO-8859-1) and keep its first two columns
df = pd.read_csv("../datasets/spam.csv", encoding='ISO-8859-1').iloc[:, :2]

# Give the two retained columns descriptive names
df.columns = ['label', 'message']

# Show the first rows, then the number of messages per label
print(df.head(), df['label'].value_counts(), sep='\n')


  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
label
ham     4825
spam     747
Name: count, dtype: int64


In [22]:
import re

def preprocess(text):
    """Normalize a raw message and split it into word tokens.

    The text is lowercased, every character other than ``a-z``, ``0-9`` and
    whitespace is deleted (so ``"don't"`` becomes ``"dont"``), and the result
    is split on runs of whitespace.

    Args:
        text: The raw message. Non-string values (for example ``None`` or the
            float ``NaN`` that pandas uses for missing cells) are treated as
            an empty message instead of raising ``AttributeError``.

    Returns:
        list[str]: The tokens in order of appearance; empty when the message
        contains no ASCII letters or digits, or is not a string.
    """
    if not isinstance(text, str):
        return []
    # Characters are removed, not replaced by a space, so words joined only
    # by punctuation are merged ("a,b" -> "ab").
    cleaned = re.sub(r'[^a-z0-9\s]', '', text.lower())
    return cleaned.split()

# Tokenize every message and store the result in a new 'tokens' column
df['tokens'] = df['message'].map(preprocess)

# Print the raw text next to its tokens for the first rows
print(df.head()[['message', 'tokens']])


                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                              tokens  
0  [go, until, jurong, point, crazy, available, o...  
1                     [ok, lar, joking, wif, u, oni]  
2  [free, entry, in, 2, a, wkly, comp, to, win, f...  
3  [u, dun, say, so, early, hor, u, c, already, t...  
4  [nah, i, dont, think, he, goes, to, usf, he, l...  


In [23]:
import sys
import os

# Make the parent of the current working directory (the project root)
# importable, so packages that live there can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so that re-running this cell does not add duplicate
# entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [24]:
import importlib
import ml_algorithms.naive_bayes
# Re-execute the module before importing from it, so the NaiveBayes name bound
# below comes from the reloaded module rather than a previously cached import
# (presumably to pick up source edits without restarting the kernel).
importlib.reload(ml_algorithms.naive_bayes)

from ml_algorithms.naive_bayes import NaiveBayes

# Instantiate and fit the model on the full dataset (all rows of df); no
# held-out split is made at this point.
nb = NaiveBayes()
nb.fit(df['tokens'], df['label'])

# Print the per-class counts and the vocabulary reported by the fitted model.
# NOTE(review): get_vocabulary() may return a large collection; consider
# printing only its size — confirm its return type first.
print(nb.get_class_counts())
print(nb.get_vocabulary())


{'ham': 4825, 'spam': 747}


In [25]:

from utils.train_test_split import train_test_split_custom
import numpy as np

# Convert the token lists to an object array: with dtype=object, token lists
# of differing lengths are kept as one list per element of a 1-D array.
X = np.array(df['tokens'].tolist(), dtype=object)
# Labels as a plain NumPy array, in the same row order as X.
y = np.array(df['label'].tolist())

# Split into train and test sets with the project's own helper.
# NOTE(review): test_size=0.2 presumably holds out 20% of the rows, and no
# seed is passed here — confirm whether the split (and the accuracy reported
# afterwards) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split_custom(X, y, test_size=0.2)


In [26]:

from ml_algorithms.naive_bayes import NaiveBayes

# Train a fresh model on the training split only; this rebinds `nb`.
nb = NaiveBayes()
nb.fit(X_train, y_train)
# Get predictions for the test set
# NOTE(review): y_pred is not used again in this cell; score() below is given
# X_test and y_test directly.
y_pred = nb.predict(X_test)

# score() is presumably the fraction of correct test predictions (0-1), since
# it is multiplied by 100 for display — TODO confirm.
accuracy = nb.score(X_test, y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 96.86%
