In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# Read the CSV (decoded as latin-1), keep only the two columns used by the
# rest of the notebook, and give them descriptive names: v1 -> label, v2 -> message.
df = (
    pd.read_csv('../data/spam.csv', encoding='latin-1')
    .loc[:, ['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

# Preview the first 5 rows
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
def preprocess_text(text):
    """Normalise one message: lowercase it and strip punctuation.

    Every character that is neither a word character (``\\w``: letters,
    digits, underscore) nor whitespace is deleted, so underscores are kept
    and words separated only by punctuation are joined together.

    Args:
        text: The raw message. Normally a ``str``; ``None`` and float ``NaN``
            (what pandas yields for an empty CSV field) are treated as an
            empty message, and any other value is converted with ``str()``.

    Returns:
        str: The cleaned message.
    """
    # Missing values would otherwise crash on ``.lower()``; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()  # Convert to lowercase
    return re.sub(r'[^\w\s]', '', text)  # Remove punctuation

# Run preprocess_text over each message, overwriting the column with the cleaned text
df['message'] = df['message'].map(preprocess_text)

df.head()

Unnamed: 0,label,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...


In [4]:
# Features are the message texts, targets are their labels
X, y = df['message'], df['label']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Total data: {len(df)}, Training data: {len(X_train)}, Testing data: {len(X_test)}")

Total data: 5572, Training data: 4457, Testing data: 1115


In [5]:
# TF-IDF vectorizer with the vocabulary capped at 3000 terms
tfidf = TfidfVectorizer(max_features=3000)

# Learn the vocabulary and IDF weights from the training messages only,
# so nothing from the test set influences the features
tfidf.fit(X_train)

# Project both splits through the fitted vectorizer
X_train_tfidf = tfidf.transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [6]:
# Create the Multinomial Naive Bayes classifier and fit it on the
# TF-IDF training features in one step (fit returns the estimator itself)
model = MultinomialNB().fit(X_train_tfidf, y_train)

print("✅ Model trained!")

✅ Model trained!


In [7]:
# Predict a label for every held-out message
y_pred = model.predict(X_test_tfidf)

# Build the per-class precision / recall / F1 summary up front
report = classification_report(y_test, y_pred)

# Headline accuracy first ...
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# ... then the detailed breakdown
print("\nClassification Report:")
print(report)

Accuracy: 0.97

Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       965
        spam       1.00      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

