In [54]:
import numpy as np 
import pandas as pd

In [55]:
messages = pd.read_csv('SMSSpamCollection',sep = '\t',names=['label','message'])
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [56]:
messages.shape

(5572, 2)

In [57]:
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem import  PorterStemmer,WordNetLemmatizer

# Text Preprocessing

In [58]:
ps = PorterStemmer()
wordnet = WordNetLemmatizer()
stopwords = stopwords.words('English')
corpus = []
for i in range(messages.shape[0]):
    review = re.sub(r'[^a-zA-Z]',' ',messages.iloc[i,1])
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [59]:
messages['Preprocessed_message'] = corpus
messages.head()

Unnamed: 0,label,message,Preprocessed_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though


In [60]:
len(corpus)

5572

In [61]:
from sklearn.feature_extraction.text import  TfidfVectorizer

In [62]:
cv = TfidfVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

In [63]:
X.shape

(5572, 5000)

In [64]:
y = pd.get_dummies(messages.label)
y = y.spam.values
y.shape

(5572,)

In [65]:
from sklearn.model_selection import  train_test_split 

In [66]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0,stratify=y)
print('Shape of training data:',X_train.shape)
print('Shape of testing data:',X_test.shape)

Shape of training data: (4457, 5000)
Shape of testing data: (1115, 5000)


In [67]:
from sklearn.naive_bayes import  MultinomialNB

In [68]:
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train,y_train)

MultinomialNB()

In [69]:
y_pred = spam_detect_model.predict(X_test)

In [70]:
spam_detect_model.score(X_test,y_test)

0.9704035874439462

In [71]:
from sklearn.metrics import  classification_report,confusion_matrix

In [72]:
confusion_matrix(y_test,y_pred)

array([[966,   0],
       [ 33, 116]], dtype=int64)

In [73]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.78      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

