In [1]:
# import the required libraries
import string
from functools import lru_cache

import pandas as pd
from nltk.corpus import stopwords

In [2]:
# load the tab-separated spam collection, supplying the two column names ourselves
column_names = ['response', 'messages']
df_spam_data_collection = pd.read_csv('SpamCollection', sep='\t', names=column_names)

In [3]:
# peek at the first rows to confirm the 'response' and 'messages' columns loaded as expected
df_spam_data_collection.head()

Unnamed: 0,response,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# summary statistics (count, unique, top, freq) for the response and messages columns
df_spam_data_collection.describe()

Unnamed: 0,response,messages
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
# same summary statistics, but computed separately for each response class
(
    df_spam_data_collection
    .groupby('response')
    .describe()
)

Unnamed: 0_level_0,messages,messages,messages,messages
Unnamed: 0_level_1,count,unique,top,freq
response,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [6]:
# record each message's length in a new 'messages_length' column
df_spam_data_collection['messages_length'] = df_spam_data_collection['messages'].map(len)

In [7]:
# view the first five records of the dataframe, now including the messages_length column
df_spam_data_collection.head()

Unnamed: 0,response,messages,messages_length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [8]:
# helpers that turn a message into a list of words without punctuation or stopwords
@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built only once.

    ``stopwords.words('english')`` builds the whole word list anew on each
    call, so the result is cached here and stored as a set to make the
    per-word membership test a constant-time lookup.
    """
    return frozenset(stopwords.words('english'))


def message_text_process(msg):
    """Strip punctuation from ``msg`` and drop English stopwords.

    Parameters
    ----------
    msg : str
        Message text to clean.

    Returns
    -------
    list of str
        The whitespace-separated words of ``msg`` with punctuation characters
        removed and stopwords filtered out. Words keep their original casing;
        only the stopword comparison is case-insensitive.
    """
    # drop every punctuation character, then join the rest back into one string
    no_punctuations = ''.join(char for char in msg if char not in string.punctuation)

    # now remove stopwords from the message
    english_stopwords = _english_stopwords()
    return [word for word in no_punctuations.split() if word.lower() not in english_stopwords]


In [9]:
# verify that the function works on a sample sentence
sample_tokens = message_text_process("Hi my name is muzzamil!")
print(sample_tokens)
# fail loudly on a wrong result instead of reporting success unconditionally
assert sample_tokens == ['Hi', 'name', 'muzzamil'], "message_text_process returned unexpected tokens"
print("Its Working")

['Hi', 'name', 'muzzamil']
Its Working


In [10]:
# let's apply this gunction on messages
df_spam_data_collection['messages'] = df_spam_data_collection['messages'].apply(message_text_process)

In [11]:
# view the head of messages after applying preprocessing function
df_spam_data_collection['messages'].head()

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: messages, dtype: object

In [12]:
#start text processing with vectorizer 
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
# build the bag-of-words vocabulary, using our own preprocessing function as the analyzer
# NOTE: message_text_process is written for raw text; given an already-tokenised list it
# joins the tokens into one long string, so 'messages' should hold plain strings here
bag_of_words_transformer = (
    CountVectorizer(analyzer=message_text_process)
    .fit(df_spam_data_collection['messages'])
)

In [14]:
# report how many distinct terms the fitted vectorizer holds in its vocabulary_ attribute
vocabulary_size = len(bag_of_words_transformer.vocabulary_)
print(vocabulary_size)

5114


In [15]:
# convert every message into its bag-of-words representation and print the result
messages_bagofwords = bag_of_words_transformer.transform(
    df_spam_data_collection['messages']
)
print(messages_bagofwords)

  (0, 802)	1
  (1, 2060)	1
  (2, 729)	1
  (3, 2839)	1
  (4, 1825)	1
  (5, 719)	1
  (6, 638)	1
  (7, 4366)	1
  (8, 2953)	1
  (9, 4221)	1
  (10, 1386)	1
  (11, 2328)	1
  (12, 2788)	1
  (13, 1472)	1
  (14, 448)	1
  (15, 3124)	1
  (16, 1955)	1
  (17, 615)	1
  (18, 695)	1
  (19, 622)	1
  (20, 4571)	1
  (21, 1483)	1
  (22, 5105)	1
  (23, 110)	1
  (24, 685)	1
  :	:
  (5547, 3556)	1
  (5548, 4800)	1
  (5549, 4058)	1
  (5550, 429)	1
  (5551, 3064)	1
  (5552, 4500)	1
  (5553, 963)	1
  (5554, 3040)	1
  (5555, 3214)	1
  (5556, 3270)	1
  (5557, 4192)	1
  (5558, 2458)	1
  (5559, 3393)	1
  (5560, 202)	1
  (5561, 773)	1
  (5562, 2068)	1
  (5563, 222)	1
  (5564, 3682)	1
  (5565, 1291)	1
  (5566, 2267)	1
  (5567, 46)	1
  (5568, 5096)	1
  (5569, 2187)	1
  (5570, 3901)	1
  (5571, 2306)	1


In [16]:
# fit a TF-IDF transformer on the bag-of-words matrix
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = (
    TfidfTransformer()
    .fit(messages_bagofwords)
)

In [17]:
# apply the fitted TF-IDF transformer to the bag-of-words matrix and print the resulting shape
messages_tfidf = tfidf.transform(messages_bagofwords)
tfidf_shape = messages_tfidf.shape
print(tfidf_shape)

(5572, 5114)


In [18]:
# train a multinomial naive Bayes spam detector on the TF-IDF features, with 'response' as the label
from sklearn.naive_bayes import MultinomialNB

spam_labels = df_spam_data_collection['response']
spam_detect_model = MultinomialNB().fit(messages_tfidf, spam_labels)

In [19]:
# push message #5 (index 4) through the same steps used for training: bag of words first, then TF-IDF
message = df_spam_data_collection.loc[4, 'messages']
bag_of_words_messages = bag_of_words_transformer.transform([message])
tfidf_message = tfidf.transform(bag_of_words_messages)

In [20]:
# show the stored response of message #5 next to the model's prediction for it
actual_response = df_spam_data_collection['response'][4]
print('Actual or Expected Message', actual_response)
predicted_response = spam_detect_model.predict(tfidf_message)[0]
print('Pridicted Message', predicted_response)

Actual or Expected Message ham
Pridicted Message ham
