In [1]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv('SMSSpamCollection', sep='\t', names=['label','messages'])

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label       5572 non-null object
messages    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB


In [6]:
data.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [7]:
data.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
#Natural Language Processing Toolkit -- Package in Python specifically for NLP operations
import nltk

#Download corpus for stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prashantn/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [19]:
import string
from nltk.corpus import stopwords

def textPreprocessing(feature):
    #Remove punctuations
    removePunctuations = [character for character in feature if character not in string.punctuation]
    #Convert Characters to Sentences
    sentencesWithoutPunctuations = ''.join(removePunctuations)
    #Convert Sentences into words
    words = sentencesWithoutPunctuations.split(" ")
    #Remove stopwords
    removeStopwords = [word for word in words if word.lower() not in stopwords.words('english')]
    #Return the final words
    
    return removeStopwords

In [21]:
#Create Bag of Words

from sklearn.feature_extraction.text import CountVectorizer
wordVector = CountVectorizer(analyzer=textPreprocessing)
finalWordVector = wordVector.fit(data['messages'])

In [24]:
%%time
bagOfWords = wordVector.transform(data['messages'])

CPU times: user 8.84 s, sys: 1.77 s, total: 10.6 s
Wall time: 10.7 s


In [25]:
bagOfWords

<5572x11427 sparse matrix of type '<class 'numpy.int64'>'
	with 51591 stored elements in Compressed Sparse Row format>

In [26]:
#Apply TFIDF on BOW

from sklearn.feature_extraction.text import TfidfTransformer
tfIdfObject = TfidfTransformer().fit(bagOfWords)

In [28]:
featureArray = tfIdfObject.transform(bagOfWords)

In [29]:
featureArray

<5572x11427 sparse matrix of type '<class 'numpy.float64'>'
	with 51591 stored elements in Compressed Sparse Row format>

In [36]:
#Implement Classifier using Naive Bayes Algo
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(featureArray,data['label'])

In [37]:
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
model.score(featureArray,data['label'])

0.9791816223977028

In [39]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(data['label'], model.predict(featureArray))
cm

array([[4825,    0],
       [ 116,  631]])

In [None]:
#I am not fine if Ham goes to Spam folder !!!

In [43]:
#Deployment Example

inputSMS = input("Enter SMS to detect:")
preprocessOutput = textPreprocessing(inputSMS)
bow = finalWordVector.transform(preprocessOutput)
feature = tfIdfObject.transform(bow)
pred = model.predict(feature)
print(pred[0])

Enter SMS to detect:Lottery Win Guaranteed
ham
