In [1]:
import pandas as pd

In [2]:
# Load the tab-separated SMS file; names= supplies the column labels, so the
# first line of the file is read as data rather than as a header.
# NOTE(review): default CSV quoting is in effect — if any message contains an
# unbalanced double quote, adjacent rows may be merged. Confirm the row count
# against the dataset's documentation (consider quoting=csv.QUOTE_NONE).
df = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
)

In [3]:
# Preview the first rows to check that 'label' and 'message' parsed as expected.
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(5572, 2)

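### Data Cleaning

Normalize each message for bag-of-words modelling: keep letters only, lowercase, tokenize, drop English stopwords, and stem the remaining words.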

In [8]:
import re 
import nltk 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
# Fetch every NLTK resource the cleaning loop relies on:
#   - 'punkt' backs nltk.word_tokenize
#   - 'stopwords' backs stopwords.words('english'); without it a fresh
#     environment raises LookupError when the corpus is first accessed.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab'
# for word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91773\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [9]:
# Single Porter stemmer instance, reused for every token in the cleaning loop.
ps = PorterStemmer()

In [10]:
# Build the cleaned corpus: one space-joined string of stemmed tokens per message.

# Build the stopword set once. The original rebuilt it from the corpus reader
# for every single token, which dominated the run time of this cell.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the values directly instead of df['message'][i], which is a
# label lookup and only works while the index is exactly 0..n-1.
for message in df['message']:
    # Replace every non-letter with a space, then lowercase so the stopword
    # comparison below is case-insensitive.
    letters_only = re.sub('[^a-zA-Z]', ' ', message).lower()
    tokens = nltk.word_tokenize(letters_only)
    # Filter stopwords on the raw token, then stem what remains.
    stems = [ps.stem(word) for word in tokens if word not in stop_words]
    corpus.append(' '.join(stems))

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Bag-of-words vectorizer; max_features caps the vocabulary at 5000 terms.
cv = CountVectorizer(max_features = 5000)

In [13]:
# Learn the vocabulary from the cleaned corpus and materialize the term counts
# as a dense array (one row per message, one column per vocabulary term).
# NOTE(review): the vocabulary is fit on every message before the train/test
# split further down, so test-set terms influence the feature space — consider
# fitting on the training portion only.
# NOTE(review): .toarray() densifies what is presumably a very sparse matrix;
# confirm the dense form is actually needed downstream.
X = cv.fit_transform(corpus).toarray()

In [14]:
# (messages, vocabulary terms) of the count matrix.
X.shape

(5572, 5000)

In [15]:
# One-hot encode the label column: one indicator column per distinct label.
# dtype is pinned so the indicators stay 0/1 integers on every pandas version;
# pandas 2.0 changed the default from uint8 to bool, which would silently turn
# the targets into True/False.
Y = pd.get_dummies(df['label'], dtype='uint8')

In [16]:
# Keep only the 'spam' indicator as the target (1 = spam, 0 = otherwise).
# Selecting by label instead of position 1 avoids silently picking the wrong
# column if the set of labels (and therefore the column order) ever changes.
Y = Y['spam']

In [18]:
# Spot-check the first few target values.
Y.head()

0    0
1    0
2    1
3    0
4    0
Name: spam, dtype: uint8

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=123,
)

In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
# Fit a multinomial Naive Bayes classifier on the training counts; fit()
# returns the fitted estimator, which is what gets bound to the name.
spam_detect_model = MultinomialNB().fit(X=X_train, y=Y_train)

In [23]:
# Predict a label for every held-out message.
Y_pred = spam_detect_model.predict(X=X_test)

In [24]:
# Pair the true test labels with the model's predictions for side-by-side display.
a = dict(Actual=Y_test, Predicted=Y_pred)

In [25]:
# Show the last 50 test rows, actual label next to predicted label.
pd.DataFrame(data=a).tail(n=50)

Unnamed: 0,Actual,Predicted
1003,0,0
343,0,0
4615,0,0
4318,0,0
574,0,0
4717,0,0
5313,0,0
5257,0,0
2154,0,0
3929,0,0


In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [27]:
# Confusion matrix on the held-out set: rows are the true classes, columns the
# predicted classes.
confusion_matrix(y_true=Y_test, y_pred=Y_pred)

array([[944,  18],
       [  8, 145]], dtype=int64)

In [28]:
# Fraction of held-out messages whose predicted label matches the true label.
accuracy_score(y_true=Y_test, y_pred=Y_pred)

0.9766816143497757