In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re, pickle

from  sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectKBest, chi2

In [3]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [5]:
df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True)

In [6]:
df.columns

Index(['v1', 'v2'], dtype='object')

In [7]:
df.rename(columns={
    'v1': 'label',
    'v2': 'message'}, inplace=True)

In [8]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [10]:
df['target'] = df['label'].map({'ham':0, 'spam': 1})
df

Unnamed: 0,label,message,target
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
 2   target   5572 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 130.7+ KB


In [12]:
df.describe()

# this might not give much info to us since we are having str data

Unnamed: 0,target
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [13]:
print("Average message length for label ham ", df[df['target']==0]['message'].str.len().mean())

Average message length for label ham  71.02362694300518


In [14]:
print("Average message length for label spam: ", df[df['target']==1]['message'].str.len().mean())

Average message length for label spam:  138.8661311914324


In [15]:
def cleanText(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]","",text)
    return text

df['cleaned'] = df['message'].apply(cleanText)

df.head()

Unnamed: 0,label,message,target,cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",0,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,0,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,0,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",0,nah i dont think he goes to usf he lives aroun...


In [16]:
x = df['cleaned']
y = df['target']

In [17]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size=0.2)

In [18]:
vectorizer = CountVectorizer(binary=True, stop_words="english")
xtrainVect = vectorizer.fit_transform(xtrain)
xtestVect = vectorizer.transform(xtest)

### feature selection

In [19]:
chi2Score, pValue = chi2(xtrainVect, ytrain)
featureNames = vectorizer.get_feature_names_out()
importantFeatures = sorted(list(zip(chi2Score, featureNames)), reverse=True)[:20]

print("Top spam indicative words: ")
for score, word in importantFeatures:
    print(word, score)

Top spam indicative words: 
txt 687.3431783246343
claim 554.9770491803279
free 545.2562425554145
mobile 492.9897126566581
prize 435.1524590163935
won 365.78032786885257
stop 320.665121710109
reply 310.52875442812416
urgent 300.88405654821514
text 272.1407905760617
16 271.1819672131148
guaranteed 258.56885245901634
win 252.31874698237596
service 241.64876220346275
contact 239.9851222998413
nokia 230.16848855986584
cash 228.70329789352428
500 227.03606557377051
tone 214.4229508196721
customer 209.2583076103585


In [20]:
bnb = BernoulliNB()
bnb.fit(xtrainVect, ytrain)
ypred = bnb.predict(xtestVect)

print("Accuracy: ", accuracy_score(ytest, ypred))
print("Confusion matrix: ", confusion_matrix(ytest, ypred))
print(classification_report(ytest,ypred))

Accuracy:  0.9713004484304932
Confusion matrix:  [[976   2]
 [ 30 107]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       978
           1       0.98      0.78      0.87       137

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [21]:
params = {'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]}
grid = GridSearchCV(BernoulliNB(), param_grid=params, cv=5, scoring='accuracy', n_jobs=1)
grid.fit(xtrainVect, ytrain)


print("Best parameter: ", grid.best_params_)
print("Best CV Accuracy: ", grid.best_score_)

Best parameter:  {'alpha': 0.1}
Best CV Accuracy:  0.9807033966974176


In [22]:
bestModel = grid.best_estimator_
ypredBest = bestModel.predict(xtestVect)
print("Test accuracy after tuning: ", accuracy_score(ytest, ypredBest))

Test accuracy after tuning:  0.9838565022421525


## TF-IDF instead of Count Vectorizer

TF-IDF = Term Frequency – Inverse Document Frequency

- CountVectorizer just counts words (binary or frequency).

- TF-IDF adds importance weighting → boosts unique discriminative words, downplays common ones.

- Usually improves model performance (esp. with Naive Bayes, SVM, Logistic Regression).


Example (SMS Spam Dataset)

Word “free” appears a lot in spam, but not much in ham.

- High TF in spam messages

- Moderate IDF (since not all docs contain it)

- High TF-IDF → important word for spam detection

Word “the” appears everywhere.

- High TF, but very low IDF

- Low TF-IDF → useless for classification


In [23]:
tfidf = TfidfVectorizer(binary=True, stop_words="english", ngram_range=(1,2), max_features=5000)

xtrainTD = tfidf.fit_transform(xtrain)
xtestTD = tfidf.transform(xtest)

In [25]:
bnbTD = BernoulliNB(alpha=grid.best_params_['alpha'])
bnbTD.fit(xtrainTD, ytrain)
ypredTD = bnbTD.predict(xtestTD)

print("TF-IDF accuracy", accuracy_score(ytest, ypredTD))

TF-IDF accuracy 0.979372197309417
