In [139]:
import nltk
import pandas as pd
import numpy as np

In [140]:
data = pd.read_csv('SMSSpamCollection',sep='\t',names=['label','message'])
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [141]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [142]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import re

In [143]:
portstem = PorterStemmer()

In [144]:
data_words = []
for i in data['message']:
    i = re.sub('[^a-zA-Z]',' ',i)
    words = word_tokenize(i)
    final_word = []
    for j in words:
        if (j.lower() not in stopwords):
            final_word.append(portstem.stem(j.lower()))
    final_word = ' '.join(final_word)
    data_words.append(final_word)
data['message'] = data_words
data.head()

Unnamed: 0,label,message
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though


In [145]:
#applying bag of words model 
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500,ngram_range=(1,2),binary=True)

In [146]:
X = (cv.fit_transform(data['message']).toarray())

In [147]:
cv.vocabulary_

{'go': 807,
 'point': 1603,
 'crazi': 452,
 'avail': 120,
 'bugi': 229,
 'great': 859,
 'world': 2432,
 'la': 1089,
 'cine': 345,
 'got': 849,
 'wat': 2340,
 'ok': 1472,
 'lar': 1099,
 'joke': 1058,
 'wif': 2394,
 'free': 726,
 'entri': 617,
 'wkli': 2419,
 'comp': 401,
 'win': 2399,
 'cup': 468,
 'final': 689,
 'st': 1960,
 'may': 1260,
 'text': 2077,
 'receiv': 1705,
 'question': 1668,
 'std': 1973,
 'txt': 2198,
 'rate': 1681,
 'appli': 83,
 'free entri': 731,
 'entri wkli': 619,
 'std txt': 1974,
 'txt rate': 2205,
 'rate appli': 1683,
 'dun': 579,
 'say': 1798,
 'earli': 588,
 'alreadi': 54,
 'nah': 1384,
 'think': 2101,
 'goe': 827,
 'usf': 2276,
 'live': 1167,
 'around': 96,
 'though': 2110,
 'freemsg': 740,
 'hey': 925,
 'darl': 489,
 'week': 2363,
 'word': 2425,
 'back': 138,
 'like': 1152,
 'fun': 760,
 'still': 1975,
 'tb': 2050,
 'xxx': 2465,
 'send': 1828,
 'even': 626,
 'brother': 218,
 'speak': 1942,
 'treat': 2173,
 'per': 1538,
 'request': 1739,
 'set': 1845,
 'callert

In [148]:
#Output feature
target = data['label']

In [149]:
data['label'] = target.str.replace('ham','0').str.replace('spam','1').astype(int)
data.head()

Unnamed: 0,label,message
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


In [150]:
column = {
    'label':'spam',
    'message':'message'
}
data = data.rename(columns=column)

In [151]:
y = data['spam']

In [152]:
#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [153]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train,y_train)
y_pred = model.predict(X_test)

In [155]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1460
           1       0.97      0.92      0.94       212

    accuracy                           0.99      1672
   macro avg       0.98      0.96      0.97      1672
weighted avg       0.98      0.99      0.98      1672

