In [1]:
import pandas as pd

In [4]:
messages=pd.read_csv('SMSSpamCollection', sep='\t',names=['label','message'])

In [5]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
## Data Cleaning

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mahewash\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [16]:
ps=PorterStemmer()
corpus=[]
for i in range(len(messages)):
    review=re.sub('[^a-zA-Z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review=' '.join(review)
    corpus.append(review)

In [18]:
## creating Bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000)
X=cv.fit_transform(corpus).toarray()

In [23]:
X.shape

(5572, 5000)

In [52]:
y=pd.get_dummies(messages['label'],drop_first=True)
y=y.values.flatten()
y.shape


(5572,)

In [53]:
## Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=42)



In [54]:
## Spam detection model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

classifier=MultinomialNB()
classifier.fit(X_train,y_train)

y_pred=classifier.predict(X_test)
report=classification_report(y_test,y_pred)
accuracy=accuracy_score(y_test,y_pred)
confusion_matrix=confusion_matrix(y_test,y_pred)
print('Classification report :',report)
print('Accuracy: ', accuracy)
print('confusion matrix :', confusion_matrix)

Classification report :               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1207
           1       0.99      0.77      0.87       186

    accuracy                           0.97      1393
   macro avg       0.98      0.89      0.93      1393
weighted avg       0.97      0.97      0.97      1393

Accuracy:  0.9691313711414213
confusion matrix : [[1206    1]
 [  42  144]]


In [55]:
## Implementation using TF-IDF model

from sklearn.feature_extraction.text import TfidfVectorizer
cv_tfidf=TfidfVectorizer()
X_tfidf=cv_tfidf.fit_transform(corpus).toarray()


In [56]:
X_tfidf.shape

(5572, 6296)

In [59]:
y_tfidf=pd.get_dummies(messages['label'],drop_first=True)
y_tfidf=y_tfidf.values.flatten()

In [62]:
## Train test split

Xt_train,Xt_test,yt_train,yt_test=train_test_split(X_tfidf,y_tfidf,test_size=0.25,random_state=0)



In [63]:
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
clas=MultinomialNB()
clas.fit(Xt_train,yt_train)

yt_pred=clas.predict(Xt_test)
report_tfidf=classification_report(yt_test,yt_pred)
accuracy_tfidf=accuracy_score(yt_test,yt_pred)
confusion_matrix_tfidf=confusion_matrix(y_test,y_pred)
print('Classification report :',report_tfidf)
print('Accuracy: ', accuracy_tfidf)
print('confusion matrix :', confusion_matrix_tfidf)
cv=cross_val_score()

Classification report :               precision    recall  f1-score   support

           0       0.97      1.00      0.98      1208
           1       1.00      0.78      0.88       185

    accuracy                           0.97      1393
   macro avg       0.98      0.89      0.93      1393
weighted avg       0.97      0.97      0.97      1393

Accuracy:  0.9712849964106246
confusion matrix : [[1206    1]
 [  42  144]]


###### Observation


We can observe that the accuracy is slightly improved using TF-IDF vectorizer as compared to using Bag of words.
