Importing libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

Loading data from spam-data.txt

In [2]:
# Parse the tab-separated corpus: every non-blank line is "<label>\t<message>".
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm the file's encoding before pinning one.
data = []
with open('spam-data.txt', 'r') as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            continue
        # partition() splits on the first tab only, so tabs inside the
        # message text are preserved.
        label, separator, message = line.partition('\t')
        if not separator:
            raise ValueError(
                f"spam-data.txt line {line_number}: "
                f"expected '<label>\\t<message>', got {line!r}"
            )
        data.append({'label': label, 'message': message})

# Naming the columns keeps the frame's schema stable even if no rows were read.
df = pd.DataFrame(data, columns=['label', 'message'])
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
df.head()

Dataset shape: (20, 2)

First 5 rows:


Unnamed: 0,label,message
0,ham,"Hey, how are you doing today?"
1,ham,Lets meet for lunch tomorrow
2,spam,WINNER!! You won $1000000 call now to claim
3,ham,Remember to buy milk on your way home
4,spam,URGENT: Your account has been compromised


Encoding labels

In [3]:
# Binary target for the classifier: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
label_encoded = df['label'].map(label_codes)

# Series.map() yields NaN for any value missing from the mapping; fail here
# with the offending labels instead of letting NaN reach the model.
unmapped = label_encoded.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'label'].unique().tolist()
    raise ValueError(f"Unexpected label(s) in dataset: {unexpected}")

df['label_encoded'] = label_encoded
print("Label distribution:")
print(df['label'].value_counts())

Label distribution:
label
ham     12
spam     8
Name: count, dtype: int64


Splitting data

In [4]:
# Hold out 20% of the messages for evaluation; random_state is fixed so the
# same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label_encoded'], test_size=0.2, random_state=1
)

# Report how many messages ended up in each split.
for split_name, split in (('Training set:', X_train), ('Test set:', X_test)):
    print(split_name, len(split))

Training set: 16
Test set: 4


Bag of Words

In [5]:
# Learn the vocabulary from the training messages only, then encode both
# splits as token-count matrices using that same vocabulary.
count_vector = CountVectorizer(lowercase=True)
training_data = count_vector.fit_transform(X_train)
testing_data = count_vector.transform(X_test)

# Columns of the document-term matrix correspond to vocabulary terms.
n_documents, n_terms = training_data.shape
print("vocabulary size:", n_terms)

vocabulary size: 59


Training model

In [6]:
# fit() returns the estimator itself, so construction and training can be
# chained into a single assignment.
naive_bayes = MultinomialNB().fit(training_data, y_train)
print('model is trained')

model is trained


Evaluating model

In [7]:
# Score the held-out messages and compare against their true labels.
predictions = naive_bayes.predict(testing_data)
accuracy = accuracy_score(y_test, predictions)
print('accuracy: {:.1f}%'.format(100 * accuracy))
print("\nConfusion matrix:")
# Pin the label order (0 = ham, 1 = spam) so the matrix is always 2x2 with
# rows = true class and columns = predicted class, even when a class happens
# to be absent from this very small test split.
print(confusion_matrix(y_test, predictions, labels=[0, 1]))

accuracy: 75.0%

Confusion matrix:
[[1 1]
 [0 2]]


Classifying new example messages (spam and legitimate)

In [12]:
# Hand-written messages (a mix of spam-like and legitimate-looking text) used
# as a quick sanity check of the trained classifier.
messages = [
    'CONGRATS! You won a $5000 job grant click to claim',
    'Hi Mariam, we want to interview you for the Developer role',
    'URGENT! Your profile won a badge claim now',
    'Following up on your backend engineer application',
    'FREE work from home job $8000/month no experience',
    'We have questions about your CS graduate projects'
]

# Use dedicated names here: `data` and `predictions` are already bound by the
# loading and evaluation cells, and rebinding them would make those earlier
# results depend on cell execution order.
message_features = count_vector.transform(messages)
message_predictions = naive_bayes.predict(message_features)

print("Testing Spam Detector:\n")
for msg, pred in zip(messages, message_predictions):
    # Encoded label 1 means spam; anything else is reported as legitimate.
    result = 'SPAM' if pred == 1 else 'LEGITIMATE'
    print(f"{result}: {msg}")

Testing Spam Detector:

SPAM: CONGRATS! You won a $5000 job grant click to claim
LEGITIMATE: Hi Mariam, we want to interview you for the Developer role
SPAM: URGENT! Your profile won a badge claim now
LEGITIMATE: Following up on your backend engineer application
SPAM: FREE work from home job $8000/month no experience
LEGITIMATE: We have questions about your CS graduate projects
