In [396]:
#import the Natural Language Toolkit module
import nltk

#import pandas module and pandas functions
import pandas as pd
from pandas import Series, DataFrame

In [397]:
#we are importing the SMS spam collection dataset as a datafram
sms_dataset = pd.read_csv("SMS_dataset.csv", encoding="latin-1")
print(type(sms_dataset))

#this shows the top 5 rows of our dataframe as well as headers and index
sms_dataset.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [398]:
#this shows the last 5 rows of our dataframe as well as headers and index
sms_dataset.tail()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,
5571,ham,Rofl. Its true to its name,,,


In [399]:
#as you probably noticed, we have three columns at the end
#called "Unnamed: 2" - "Unnamed: 4" that do not contain important data
#and so we will clean up the dataframe further by removing those columns
sms_dataset.drop(["Unnamed: 2","Unnamed: 3","Unnamed: 4"], axis = 1, inplace = True)
#the default axis is 0, which is the index column

sms_dataset.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [400]:
#next, let's rename "v1" and "v2" headers to something more meaningful
sms_dataset = sms_dataset.rename(columns = {"v1":"sms_type","v2":"message"})
sms_dataset.head()

Unnamed: 0,sms_type,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [401]:
print(len(sms_dataset.index), "\n")
#so we have 5572 SMS messages in this dataframe

print(sms_dataset.loc[0]["message"])
print(type(sms_dataset.loc[0]["message"]))
print(sms_dataset.loc[5571]["sms_type"])
print(type(sms_dataset.loc[5571]["sms_type"]))
#the data in both sms_type and message columns are strings

5572 

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
<class 'str'>
ham
<class 'str'>


In [402]:
from nltk.corpus import stopwords

#stop words are some of the most common words used in a language
#in our case, the English language
#and they will be removed from the data before data processing
stop_words = set(stopwords.words('english'))

stop_words.update(["u"]) 
#"u" was added to the list of stopwords because
#it shows up so often in casual text messages,
#likely as often as "you" shows up in more formal communications

print(stop_words)

{'most', 'against', 'where', 'his', 'will', 'they', 'each', 'didn', 'through', 'some', 'same', 'yourselves', 'be', 'its', 'wasn', 'down', 'once', 'does', 'been', 'at', 'mustn', 'up', 'am', 'and', 'such', 'won', 'd', 'hadn', 'themselves', 'y', 'what', 'so', 'into', 'an', 'did', 'more', 'weren', 'ain', 'here', 'few', 'her', 'who', 'herself', 'to', 'he', 'being', 'him', 'nor', 'own', 'll', 'have', 'until', 'were', 'before', 'yourself', 'a', 'it', 've', 'ours', 'doing', 'then', 'hasn', 'the', 'doesn', 'which', 'during', 'for', 'this', 'o', 'out', 'my', 'whom', 'other', 'hers', 'don', 'while', 'isn', 'ma', 'couldn', 'ourselves', 'under', 'are', 'further', 'any', 'or', 'off', 'after', 'do', 'how', 're', 'in', 'but', 'these', 'we', 'too', 'their', 'again', 'below', 'haven', 'there', 'should', 'about', 'shouldn', 'you', 'i', 'had', 'over', 'wouldn', 'of', 'very', 'than', 'not', 'having', 'both', 'himself', 'by', 'has', 'our', 'with', 'when', 'if', 'all', 'm', 'is', 'why', 'no', 'only', 'from',

In [403]:
#to further pre-process our data, we will use the following

from nltk.tokenize import sent_tokenize, word_tokenize
#according to nltk documentation (http://www.nltk.org/api/nltk.tokenize.html)
#word_tokenize function should be applied to one sentence at a time
#however, this might have been fixed to work on a multi-sentence argument
#if https://github.com/nltk/nltk_book/issues/33 is true
#but since I haven't found official documentation (yet)
#we will run sent_tokenize before running word_tokenize on all sentences

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
#lemmatizer function finds the lemma of a word argument
#according to https://simple.wikipedia.org/wiki/Lemma_(linguistics) is 
#a word which stands at the head of a definition in a dictionary

#this function has a default part of speech parameter of noun
print(lemmatizer.lemmatize("mice"))
print(lemmatizer.lemmatize("clocks"))
print(lemmatizer.lemmatize("jumping", "v"))
print(lemmatizer.lemmatize("fastest", pos="a"))

mouse
clock
jump
fast


In [404]:
#for i from index 0 to index 5571
for i in range(len(sms_dataset.index)):
    #create a list of word tokens by
    
    #tokenizing each string in the "message" column into sentences
    #which returns a list of sentence strings
    
    #and then tokeninzing each sentence into words
    #which returns a list of word strings
    
    #next we take each word and find their lemmas
    #unfortunately, we will have to assume that all words are nouns
    word_tokens = [lemmatizer.lemmatize(word.lower()) 
                   for sent in sent_tokenize(sms_dataset.loc[i]["message"])
                   for word in word_tokenize(sent)]
    #then from the word_tokens list we take elements that are not stopwords
    #and add them to a new list
    without_stop = [w for w in word_tokens if w not in stop_words]
    #we convert this new list of words into a string
    without_stop_str = " ".join(without_stop)
    #and replace all messages with lemmatized versions of the original
    #that do not include stopwords
    sms_dataset.loc[i]["message"] = without_stop_str

sms_dataset.head(10)

Unnamed: 0,sms_type,message
0,ham,"go jurong point , crazy.. available bugis n gr..."
1,ham,ok lar ... joking wif oni ...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,dun say early hor ... c already say ...
4,ham,"nah n't think go usf , life around though"
5,spam,freemsg hey darling 's 3 week 's word back ! '...
6,ham,even brother like speak . treat like aid patent .
7,ham,per request 'melle melle ( oru minnaminunginte...
8,spam,winner ! ! valued network customer selected re...
9,spam,mobile 11 month ? r entitled update latest col...


In [405]:
sms_dataset.tail()

Unnamed: 0,sms_type,message
5567,spam,2nd time tried 2 contact u. å£750 pound prize ...
5568,ham,ì_ b going esplanade fr home ?
5569,ham,"pity , * wa mood . ... suggestion ?"
5570,ham,guy bitching acted like 'd interested buying s...
5571,ham,rofl . true name


In [406]:
#now following the template set in the python programming nltk tutorial
#we format all the information in the dataframe into a list of tuples

#this shows how you access strings in the sms_type and message columns
for row in sms_dataset.iterrows():
    print(row[1][0])
    print(row[1][1], "\n")
    break

all_messages = []

for row in sms_dataset.iterrows():
    #we had to tokenize strings into lists,
    #convert lists into strings
    #and finally tokenize the cleaned-up string again
    #because pandas dataframes will not accept lists
    all_messages.append((word_tokenize(row[1][1]),row[1][0]))

print(all_messages[:6])

ham
go jurong point , crazy.. available bugis n great world la e buffet ... cine got amore wat ... 

[(['go', 'jurong', 'point', ',', 'crazy..', 'available', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'got', 'amore', 'wat', '...'], 'ham'), (['ok', 'lar', '...', 'joking', 'wif', 'oni', '...'], 'ham'), (['free', 'entry', '2', 'wkly', 'comp', 'win', 'fa', 'cup', 'final', 'tkts', '21st', 'may', '2005', '.', 'text', 'fa', '87121', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', '&', 'c', "'s", 'apply', '08452810075over18', "'s"], 'spam'), (['dun', 'say', 'early', 'hor', '...', 'c', 'already', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'go', 'usf', ',', 'life', 'around', 'though'], 'ham'), (['freemsg', 'hey', 'darling', "'s", '3', 'week', "'s", 'word', 'back', '!', "'d", 'like', 'fun', 'still', '?', 'tb', 'ok', '!', 'xxx', 'std', 'chgs', 'send', ',', 'å£1.50', 'rcv'], 'spam')]


In [416]:
#in case there was some pattern already set in the SMS dataset
#such as all spam type messages living the first half
#we are going to shuffle all the messages so that the different types
#are better distributed throughout the all_messages list

import random

random.shuffle(all_messages)
print(all_messages[0])
print(all_messages[5571])

(["'s", 'said', "'s", 'bad', 'dat', 'e', 'gal', 'know', '...', 'wat', '?'], 'ham')
(['course', '!', "n't", 'tease', '...', 'know', 'simply', 'must', 'see', '!', '*grins*', '...', 'keep', 'posted', 'prey', '...', '*loving', 'smile*', '*devouring', 'kiss*'], 'ham')


In [408]:
#now we will compile all words from all messages into a list
#in order to extract the most common ones to use as features
#that can help identify either spam or non-spam (ham)

all_words = []
#again, this is index 0 to index 5571
for i in range(len(all_messages)):
    #accessing the list of words for message with index i
    for j in range(len(all_messages[i][0])):
        #adding word from position j to the all_words list
        all_words.append(all_messages[i][0][j])
        
print(len(all_words))
#this just shows you that we have a lot of duplicate words
print(len(set(all_words)))

words_freq = nltk.FreqDist(all_words)
print(type(words_freq))
print(words_freq.most_common(25))

68090
9181
<class 'nltk.probability.FreqDist'>
[('.', 4933), (',', 1872), ('?', 1541), ('!', 1381), ('...', 1233), ('&', 916), (';', 764), (':', 717), ('call', 600), (')', 494), ("'s", 492), ('2', 485), ('get', 396), ("'m", 395), ('ur', 381), ("n't", 361), ('gt', 318), ('lt', 316), ('go', 303), ("''", 293), ('#', 287), ('free', 275), ('know', 269), ('4', 268), ('ok', 251)]


In [409]:
#taking only the 3000 most common words of 9181 from the FreqDist
top_3000_dict = dict(words_freq.most_common(3000)).keys()
#we create a feature list that will identify patterns in spam vs ham
word_features = list(top_3000_dict)

In [410]:
#based off of pythonprogramming tutorial code,
#we create a function using dictionary comprehension
#that will tell use which of these 3000 most common words
#exist in each of the messages

def find_features(document):
    words = set(document)
    return {w: (True if w in words else False) for w in word_features}
    

#original pythonprogramming tutorial code
# def find_features(document):
#     words = set(document)
#     features = {}
#     for w in word_features:
#         features[w] = (w in words)
#     return features

In [411]:
#implementing the find_features function defined above on all messages
featuresets = [(find_features(msg), type) for (msg, type) in all_messages]

print(featuresets[0], "\n") #is the same as
#print(find_features(all_messages[0][0]))




In [412]:
#set that we'll train our classifier with
training_set = featuresets[:4000]

#set that we'll test against
#message #4000 to #5572
testing_set = featuresets[4000:]

In [413]:
#Naive Bayes Classifier is popular machine learning algorithm for text classification
#reminder that this is supervised machine learning
#because we are feeding the computer already classified data before
#testing it on another set of data
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [414]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 98.85496183206108


In [415]:
classifier.show_most_informative_features(15)
#shows ratio of occurences in spam to ham for the most valuable word features

Most Informative Features
                landline = True             spam : ham    =    121.0 : 1.0
                 voucher = True             spam : ham    =     99.8 : 1.0
                   nokia = True             spam : ham    =     82.8 : 1.0
                      uk = True             spam : ham    =     74.3 : 1.0
                delivery = True             spam : ham    =     65.8 : 1.0
                   await = True             spam : ham    =     65.8 : 1.0
                 attempt = True             spam : ham    =     65.8 : 1.0
                   apply = True             spam : ham    =     65.0 : 1.0
                 service = True             spam : ham    =     63.1 : 1.0
                     txt = True             spam : ham    =     62.8 : 1.0
                    rate = True             spam : ham    =     62.4 : 1.0
                   video = True             spam : ham    =     59.9 : 1.0
                  camera = True             spam : ham    =     57.3 : 1.0

In [417]:
#finally, you can save your classifiers with the pickle module
import pickle

with open("naivebayes.pickle","wb") as f:
    pickle.dump(classifier, f)
    
with open("naivebayes.pickle","rb") as f:
    classifier2 = pickle.load(f)

In [418]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier2, testing_set))*100)

Classifier accuracy percent: 98.85496183206108


* https://github.com/nltk/nltk_book/issues/338
* http://stackoverflow.com/questions/19373296/consequences-of-abusing-nltks-word-tokenizesent
* http://stackoverflow.com/questions/35345761/python-re-split-vs-nltk-word-tokenize-and-sent-tokenize
* http://www.nltk.org/api/nltk.tokenize.html
* http://www.nltk.org/book/ch03.html