<a href="https://colab.research.google.com/github/namantam1/ml-ai-dnn/blob/main/Bag_of_Words_A_email_spam_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled ham/spam message dataset straight from GitHub (needs network access).
# Each row carries a `Category` label and the raw `Message` text.
df = pd.read_csv("https://github.com/codebasics/nlp-tutorials/raw/main/9_bag_of_words/spam.csv")
# Preview the first few rows.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(5572, 2)

In [8]:
# Class balance: number of messages per label (ham vs. spam).
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [9]:
# Encode the label as a binary target: 1 for 'spam', 0 for anything else (ham).
# A vectorised comparison replaces the per-row Python lambda; the explicit int64
# cast keeps the column dtype the same as the original `.apply` produced.
df['spam'] = df.Category.eq('spam').astype('int64')

df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.2, random_state=1
)

# Show the shape of each of the four pieces, one output per piece.
display(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(4457,)

(1115,)

(4457,)

(1115,)

In [11]:
# Peek at the first two raw training messages (still plain text at this point).
X_train[:2]

1642    Hi , where are you? We're at  and they're not ...
2899          If you r @ home then come down within 5 min
Name: Message, dtype: object

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder; its vocabulary is learned from the training messages only.
vectorizer = CountVectorizer()

# fit_transform returns a SciPy sparse matrix. Densify it with .toarray() instead of
# the `.A` shorthand, which SciPy deprecated and newer releases no longer provide.
# The dense form is kept because GaussianNB (used below) is fitted on this matrix.
X_train_vectorized = vectorizer.fit_transform(X_train).toarray()

# Shape is (n_messages, vocabulary_size); also show the first two count vectors.
display(X_train_vectorized.shape, X_train_vectorized[:2])

(4457, 7711)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [35]:
# Learned vocabulary as an array of tokens; position in the array is the feature (column) index.
vocab = vectorizer.get_feature_names_out()

# Number of distinct tokens, plus a small sample slice of them.
display(vocab.shape, vocab[1000:1010])

(7711,)

array(['anythin', 'anything', 'anythingtomorrow', 'anytime', 'anyway',
       'anyways', 'anywhere', 'aom', 'apart', 'apartment'], dtype=object)

In [36]:
# Look up the column index the fitted vectorizer assigned to the token "anything".
vectorizer.vocabulary_["anything"]

1001

In [37]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB

# Baseline: Gaussian Naive Bayes fitted on the dense word-count matrix.
gaussianNB = GaussianNB()
gaussianNB.fit(X_train_vectorized, y_train)

# Encode the test messages with the vocabulary learned from the training split
# (transform only, no refit). `.toarray()` replaces the `.A` shorthand, which
# SciPy deprecated and newer releases no longer provide.
X_test_vectorized = vectorizer.transform(X_test).toarray()
display(X_test_vectorized.shape, X_test_vectorized[:2])

# Mean accuracy on the held-out split and on the training split, in that order.
test_score = gaussianNB.score(X_test_vectorized, y_test)
train_score = gaussianNB.score(X_train_vectorized, y_train)

display(test_score, train_score)

(1115, 7711)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

0.9022421524663677

0.9470495849225937

In [38]:
# Multinomial Naive Bayes models word counts directly, which suits bag-of-words features.
multinomialNB = MultinomialNB()
multinomialNB.fit(X_train_vectorized, y_train)

# Re-encode the test split so this cell does not rely on the GaussianNB cell having run.
# `.toarray()` replaces the `.A` shorthand, which SciPy deprecated and newer releases
# no longer provide.
X_test_vectorized = vectorizer.transform(X_test).toarray()
display(X_test_vectorized.shape, X_test_vectorized[:2])

# Mean accuracy on the held-out split and on the training split, in that order.
test_score = multinomialNB.score(X_test_vectorized, y_test)
train_score = multinomialNB.score(X_train_vectorized, y_train)

display(test_score, train_score)

(1115, 7711)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

0.989237668161435

0.9919228180390397

In [43]:
from sklearn.metrics import classification_report

# Hard 0/1 predictions from the multinomial model on the encoded test split.
y_pred = multinomialNB.predict(X_test_vectorized)

# Spot-check ten consecutive predictions.
display(y_pred[50:60])

# Per-class precision / recall / F1 against the true test labels.
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0])

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       968
           1       0.98      0.94      0.96       147

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [52]:
# Do some manual testing

# Hand-written messages for a quick sanity check of the fitted model.
emails = [
    "How are you! Hope you are there.",
    "Urgent! Job oppurtunity is here. Click on the below to link to grab it now. Don't miss this opportunity.",
    "Hurry! Get upto 90% discount."
]
# Encode with the already-fitted vocabulary. `.toarray()` replaces the `.A` shorthand,
# which SciPy deprecated and newer releases no longer provide.
emails_vectorized = vectorizer.transform(emails).toarray()
display(emails_vectorized)

# Predicted labels, one per message (1 = spam, 0 = ham).
multinomialNB.predict(emails_vectorized)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

array([0, 0, 1])

In [55]:
# Let's bundle vectorisation and classification into a single spam-classification
# pipeline, so the estimator can be fitted on and queried with raw text.
from sklearn.pipeline import make_pipeline

model = make_pipeline(CountVectorizer(), MultinomialNB())

# Display the assembled pipeline steps.
model

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [61]:
# Fit the whole pipeline on the raw training text; the vectorizer step is fitted as part of it.
model.fit(X_train, y_train)

# The raw (un-vectorized) training messages the pipeline was given, for reference.
X_train[:2]

1642    Hi , where are you? We're at  and they're not ...
2899          If you r @ home then come down within 5 min
Name: Message, dtype: object

In [59]:
# Classify the hand-written sample messages directly from raw text (1 = spam, 0 = ham).
model.predict(emails)

array([0, 0, 1])

In [63]:
# Mean accuracy of the pipeline on the held-out raw test messages.
model.score(X_test, y_test)

0.989237668161435

In [66]:
# A genuine (non-spam) GitHub security notification, used as an out-of-sample check.
real_texts = [
"""
Hey namantam1!
A third-party OAuth application (Colaboratory) with gist and public_repo scopes was recently authorized to access your account.
Visit https://github.com/settings/connections/applications/5036cf6d81e65aaa6340 for more information.
To see this and other security events for your account, visit https://github.com/settings/security-log
If you run into problems, please contact support by visiting https://github.com/contact
Thanks,
The GitHub Team
"""
]


# The pipeline labels this legitimate notification as spam (1), i.e. a false positive.
# Presumably the training data is too small to generalise to text like this.
model.predict(real_texts)

array([1])