In [6]:
import pandas as pd

# Load the SMS dataset, decoding the file as latin-1.
# Only columns v1 and v2 are parsed, and they are renamed to `label` and
# `message` by name. Building the frame in one expression (rather than slicing
# an existing frame and then assigning to the slice) keeps the column
# assignments made in later cells from being flagged as writes to a copy.
df = pd.read_csv(
    "data/spam.csv",
    encoding="latin-1",
    usecols=["v1", "v2"],
).rename(columns={"v1": "label", "v2": "message"})

df.head()



Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Translate the dataset's labels into the names used by the rest of the
# notebook: 'spam' -> 'scam', 'ham' -> 'safe'.
# `replace` is used instead of `map` because it leaves values that are not
# keys of the mapping unchanged, so running this cell a second time is a no-op
# rather than turning every already-translated label into NaN.
df['label'] = df['label'].replace({
    'spam': 'scam',
    'ham': 'safe',
})


In [8]:
import re
import string

def clean_text(text):
    """Normalize one raw message before vectorization.

    Steps, in order: lower-case the text, drop URL-like tokens (anything
    starting with ``http`` or ``www`` up to the next whitespace), drop digit
    runs, drop ASCII punctuation, then collapse every run of whitespace to a
    single space and trim both ends.

    Parameters
    ----------
    text : str or any
        The raw message. Non-string values (for example ``None`` or the NaN
        pandas stores for a missing cell) are treated as an empty message.

    Returns
    -------
    str
        The cleaned message; ``''`` when nothing remains or the input was not
        a string.
    """
    if not isinstance(text, str):
        # Missing values have no .lower(); treat them as empty instead of
        # raising in the middle of a column-wide apply.
        return ''

    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Removing tokens above leaves gaps such as "in  a"; split/join both
    # collapses those runs and strips leading/trailing whitespace.
    return ' '.join(text.split())


In [9]:
# Store the cleaned form of every message in a new column next to the raw
# text (clean_text is called once per element), then preview the result.
df['clean_message'] = df['message'].map(clean_text)
df.head()


Unnamed: 0,label,message,clean_message
0,safe,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,safe,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,scam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,safe,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,safe,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer


In [11]:
# Inputs are the cleaned messages; targets are the label column.
X, y = df['clean_message'], df['label']

# Hold out 20% of the rows for evaluation. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# TF-IDF over unigrams and bigrams, capped at 5000 features. The vectorizer
# is fitted on the training messages only and then applied to the test set.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)



In [12]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix,
# predict labels for the held-out messages, and show the first ten.
model = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)
y_pred[:10]



array(['safe', 'safe', 'safe', 'scam', 'safe', 'safe', 'safe', 'safe',
       'safe', 'safe'], dtype='<U4')

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Overall accuracy on the held-out set. It is printed explicitly: a bare
# expression in the middle of a cell is not displayed, only the last one is.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1.
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: actual, columns: predicted); shown as the cell's
# output and reused by the plotting cell.
cm = confusion_matrix(y_test, y_pred)
cm


              precision    recall  f1-score   support

        safe       0.96      1.00      0.98       966
        scam       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



array([[966,   0],
       [ 37, 112]])

In [None]:
import matplotlib.pyplot as plt

# Draw the confusion matrix as a heat map with class names on both axes and
# the count written in every cell, so the figure can be read on its own.
fig, ax = plt.subplots()
image = ax.imshow(cm)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# `cm` was computed without an explicit `labels` argument, in which case
# confusion_matrix orders classes as the sorted labels seen in either input.
class_names = sorted(set(y_test) | set(y_pred))
ax.set_xticks(range(len(class_names)))
ax.set_xticklabels(class_names)
ax.set_yticks(range(len(class_names)))
ax.set_yticklabels(class_names)

# Annotate each cell with its count. imshow puts columns on x and rows on y.
# Text colour assumes the default colormap, where low values render dark.
midpoint = cm.max() / 2
for row in range(cm.shape[0]):
    for col in range(cm.shape[1]):
        ax.text(
            col,
            row,
            str(cm[row, col]),
            ha="center",
            va="center",
            color="black" if cm[row, col] > midpoint else "white",
        )

fig.colorbar(image, ax=ax)
plt.show()
