# Spam Classification Using BOW, TF-IDF and Machine Learning Algorithm (Naive Bayes)


In [1]:
import numpy as np
import pandas as pd
import nltk

nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mahes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mahes\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
import os 
path = os.path.join('sms_spam_data', 'SMSSpamCollection')
df = pd.read_csv(path, sep='\t', names=['target', 'message'])
df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Cleaning and Preprocessing

In [147]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

In [148]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [149]:
df['message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [150]:
corpus = []

for i in range(len(df)):
    tokens = nltk.word_tokenize(df['message'][i].lower())
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    review = ' '.join(tokens)
    corpus.append(review)

In [151]:
df['message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [152]:
corpus[0]

'go jurong point , crazi .. avail bugi n great world la e buffet ... cine got amor wat ...'

In [153]:
df['message'][2]

"Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"

In [154]:
corpus[2]

"free entri 2 wkli comp win fa cup final tkt 21st may 2005. text fa 87121 receiv entri question ( std txt rate ) & c 's appli 08452810075over18 's"

In [155]:
corpus[:10]

['go jurong point , crazi .. avail bugi n great world la e buffet ... cine got amor wat ...',
 'ok lar ... joke wif u oni ...',
 "free entri 2 wkli comp win fa cup final tkt 21st may 2005. text fa 87121 receiv entri question ( std txt rate ) & c 's appli 08452810075over18 's",
 'u dun say earli hor ... u c alreadi say ...',
 "nah n't think goe usf , live around though",
 "freemsg hey darl 's 3 week 's word back ! 'd like fun still ? tb ok ! xxx std chg send , £1.50 rcv",
 'even brother like speak . treat like aid patent .',
 "per request 'mell mell ( oru minnaminungint nurungu vettam ) ' set callertun caller . press * 9 copi friend callertun",
 'winner ! ! valu network custom select receivea £900 prize reward ! claim call 09061701461. claim code kl341 . valid 12 hour .',
 'mobil 11 month ? u r entitl updat latest colour mobil camera free ! call mobil updat co free 08002986030']

In [156]:
# how many spams and how many hams are there in the dataset
df['target'].value_counts()

target
ham     4825
spam     747
Name: count, dtype: int64

In [157]:
# how many spams and how many hams are there in the dataset in percentage
df['target'].value_counts(normalize=True) * 100

target
ham     86.593683
spam    13.406317
Name: proportion, dtype: float64

target

ham     86.593683

spam    13.406317

data is imbalanced, more ham than spam

So while splitting the data into train and test, we need to make sure that both sets have similar distribution of spam and ham messages. We can use stratified sampling for this purpose.

In [158]:
from sklearn.model_selection import train_test_split
# splitting the data into train and test sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(corpus, df['target'], test_size=0.2, random_state=42, stratify=df['target'])

In [159]:
X_train[:5]

[', guy close ?',
 'pleas come imin town.dontmatt urgoin outl8r , reallyne 2docd.pleas dontpleas dontignor mycal , u thecd isv.import tome 4 2moro',
 'ok k .. sri knw 2 siva .. tat askd ..',
 "'ll see , prolli yeah",
 "'ll see swing bit , got thing take care firsg"]

In [160]:
X_test[:5]

['need buy lunch .. eat maggi mee ..',
 'ok im sure time finish tomorrow wan na spend even co would vewi vewi lubli ! love xxx',
 'wait e car 4 mum lor . u leh ? reach home alreadi ?',
 '? 1,000 cash ? 2,000 prize ! claim , call09050000327',
 'r @ home come within 5 min']

In [161]:
y_train[:5], y_test[:5]

(184     ham
 2171    ham
 5422    ham
 4113    ham
 4588    ham
 Name: target, dtype: object,
 2825     ham
 3695     ham
 3904     ham
 576     spam
 2899     ham
 Name: target, dtype: object)

In [162]:
# how many spams and how many hams are there in both train and test sets
y_train.value_counts(), y_test.value_counts()

(target
 ham     3859
 spam     598
 Name: count, dtype: int64,
 target
 ham     966
 spam    149
 Name: count, dtype: int64)

In [163]:
# how many spams and how many hams are there in the train and test sets interms of percentage
y_train.value_counts(normalize=True) * 100, y_test.value_counts(normalize=True) * 100

(target
 ham     86.582903
 spam    13.417097
 Name: proportion, dtype: float64,
 target
 ham     86.636771
 spam    13.363229
 Name: proportion, dtype: float64)

## Bag of words

In [164]:
# Creating Bag of Words for both train and test sets
from sklearn.feature_extraction.text import CountVectorizer
# for Binary BOW enable binary=True

count_vectorizer = CountVectorizer(max_features=2500, ngram_range=(1,2))

In [165]:
# bag of words for train set
X_train_bow = count_vectorizer.fit_transform(X_train).toarray()
# bag of words for test set
X_test_bow = count_vectorizer.transform(X_test).toarray()

In [166]:
np.set_printoptions(edgeitems=30, linewidth=100000, 
    formatter=dict(float=lambda x: "%.3g" % x))
X_train_bow[:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], shape=(5, 2500))

In [167]:
count_vectorizer.get_feature_names_out()

array(['00', '00 sub', '000', '000 cash', '02', '03', '03 2nd', '04', '05', '06', '06 03', '0800', '0800 542', '08000839402', '08000839402 call2optout', '08000930705', '08000930705 deliveri', '08001950382', '0870', '0870 nation', '08707509020', '08707509020 20p', '08712300220', '08712300220 quot', '08712460324', '08715705022', '08718720201', '08718727870', '09050090044', '09050090044 toclaim', ..., 'yan', 'yan jiu', 'yar', 'yar lor', 'yay', 'ye', 'ye 85023', 'ye princess', 'yeah', 'yeah got', 'yeah probabl', 'year', 'year old', 'yep', 'yes', 'yest', 'yesterday', 'yet', 'yet chikku', 'yiju', 'yo', 'yo yo', 'yoga', 'you', 'your', 'yr', 'yun', 'yuo', 'yup', 'zed'], shape=(2500,), dtype=object)

In [168]:
count_vectorizer.vocabulary_

{'guy': np.int64(986),
 'close': np.int64(493),
 'pleas': np.int64(1655),
 'come': np.int64(513),
 'town': np.int64(2193),
 'import': np.int64(1107),
 'ok': np.int64(1557),
 'knw': np.int64(1191),
 'tat': np.int64(2085),
 'askd': np.int64(265),
 'll': np.int64(1261),
 'see': np.int64(1868),
 'yeah': np.int64(2478),
 'll see': np.int64(1268),
 'swing': np.int64(2070),
 'bit': np.int64(336),
 'got': np.int64(944),
 'thing': np.int64(2127),
 'take': np.int64(2076),
 'care': np.int64(427),
 'take care': np.int64(2077),
 'shall': np.int64(1905),
 'book': np.int64(350),
 'half': np.int64(995),
 'thank': np.int64(2118),
 'messag': np.int64(1384),
 'realli': np.int64(1762),
 'appreci': np.int64(244),
 'sure': np.int64(2061),
 'process': np.int64(1717),
 'direct': np.int64(662),
 'pay': np.int64(1612),
 'find': np.int64(808),
 'way': np.int64(2362),
 'back': np.int64(296),
 'test': np.int64(2106),
 'tomorrow': np.int64(2171),
 'class': np.int64(489),
 'wonder': np.int64(2436),
 'day': np.int64(

The difference between .get_feature_names_out() and .vocabulary_ is that the former returns an array of feature names (words or n-grams) in the order they correspond to the columns of the transformed data matrix, while the latter provides a dictionary mapping each feature name to its corresponding column index in the matrix.

In [169]:
y_train[:5]

184     ham
2171    ham
5422    ham
4113    ham
4588    ham
Name: target, dtype: object

In [171]:
# One hot encoding the target labels with pandas
y_train = pd.get_dummies(y_train, drop_first=True, dtype=int)
y_test = pd.get_dummies(y_test, drop_first=True, dtype=int)

In [172]:
y_train[:5]

Unnamed: 0,spam
184,0
2171,0
5422,0
4113,0
4588,0


## Model Training and Evaluation

In [173]:
from sklearn.naive_bayes import MultinomialNB
spam_detection_model = MultinomialNB().fit(X_train_bow, y_train)

  y = column_or_1d(y, warn=True)


In [174]:
y_pred = spam_detection_model.predict(X_test_bow)

In [175]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [176]:
accuracy_score(y_test, y_pred)

0.9838565022421525

In [179]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.91      0.94       149

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [180]:
confusion_matrix(y_test, y_pred)

array([[962,   4],
       [ 14, 135]])

## TF-IDF Vectorization

In [183]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [184]:
tf_idf_vectorizer = TfidfVectorizer(max_features=2500, ngram_range=(1,2))
X_train_tfidf = tf_idf_vectorizer.fit_transform(X_train)
X_test_tfidf = tf_idf_vectorizer.transform(X_test)

In [185]:
X_train_tfidf[:5]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 24 stored elements and shape (5, 2500)>

In [186]:
tfidf_model = MultinomialNB().fit(X_train_tfidf, y_train)

  y = column_or_1d(y, warn=True)


In [193]:
y_pred = tfidf_model.predict(X_test_tfidf)

In [194]:
accuracy_score(y_test, y_pred)

0.9766816143497757

In [195]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [196]:
confusion_matrix(y_test, y_pred)

array([[966,   0],
       [ 26, 123]])