# Analysing Spam Collection Data

***

In [97]:
#Get the spam data collection 
import pandas as pd
import string
from nltk.corpus import stopwords   

In [98]:
# Load the tab-separated spam collection. Explicit column names are supplied,
# so the first line of the file is read as data rather than as a header.
df_spamCollection = pd.read_csv(
    'SpamCollection',
    names=['label', 'email'],
    sep='\t',
)

In [99]:
# Preview the first five rows to confirm the two columns were parsed as intended.
df_spamCollection.head(n=5)

Unnamed: 0,label,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [100]:
# Per-column summary; for the text columns this reports count / unique / top / freq.
df_spamCollection.describe(include=None)


Unnamed: 0,label,email
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [101]:
# A notebook cell renders only its final expression, so each summary is passed
# to display() explicitly; otherwise the per-label summary is computed and
# silently discarded.
display(df_spamCollection.groupby('label').describe())
display(df_spamCollection.groupby('email').describe())

Unnamed: 0_level_0,label,label,label,label
Unnamed: 0_level_1,count,unique,top,freq
email,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
&lt;#&gt; in mca. But not conform.,1,1,ham,1
&lt;#&gt; mins but i had to stop somewhere first.,1,1,ham,1
&lt;DECIMAL&gt; m but its not a common car here so its better to buy from china or asia. Or if i find it less expensive. I.ll holla,1,1,ham,1
and picking them up from various points,1,1,ham,1
"came to look at the flat, seems ok, in his 50s? * Is away alot wiv work. Got woman coming at 6.30 too.",1,1,ham,1
...,...,...,...,...
Ü thk of wat to eat tonight.,1,1,ham,1
Ü v ma fan...,1,1,ham,1
Ü wait 4 me in sch i finish ard 5..,1,1,ham,1
… and don‘t worry we‘ll have finished by march … ish!,1,1,ham,1


In [102]:

# Message length in characters, computed with the vectorised string accessor
# instead of a per-row Python call to len(). Missing values, if any, become NaN
# rather than raising.
df_spamCollection['length'] = df_spamCollection['email'].str.len()

In [103]:
# Confirm that the new 'length' column now appears alongside label and email.
df_spamCollection.head(n=5)

Unnamed: 0,label,email,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [104]:
# Tokeniser used as the CountVectorizer analyzer: drops punctuation and stopwords.
def message_text_process(message, stop_words=None):
    """Strip punctuation from a message and return its non-stopword tokens.

    Parameters
    ----------
    message : str
        Raw message text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stopword
        list is used. Pass a pre-built set to avoid reloading the list on
        every call.

    Returns
    -------
    list of str
        Whitespace-separated tokens of the punctuation-free message, in their
        original casing, excluding any token whose lower-cased form is a
        stopword.
    """
    if stop_words is None:
        # Build the lookup once per call; evaluating stopwords.words('english')
        # inside the comprehension repeated the work for every single word and
        # searched a list instead of a set.
        stop_words = frozenset(stopwords.words('english'))

    # Delete every character listed in string.punctuation in a single pass.
    no_punctuation = message.translate(str.maketrans('', '', string.punctuation))

    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [112]:
# Try the tokeniser on the first five emails to eyeball its output.
(
    df_spamCollection['email']
    .head(5)
    .apply(message_text_process)
)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: email, dtype: object

In [120]:
#start text processing with vectorizer 
from sklearn.feature_extraction.text import CountVectorizer

In [121]:
# Bag of words: tokenise every email with message_text_process and learn the
# vocabulary from the full email column.
bag_of_words_transformer = (
    CountVectorizer(analyzer=message_text_process)
    .fit(df_spamCollection['email'])
)

In [138]:
# Report how many distinct tokens were learned (stored in vocabulary_), then
# convert every email into its token-count representation.
print(len(bag_of_words_transformer.vocabulary_))
message_bag_of_words = bag_of_words_transformer.transform(
    df_spamCollection['email']
)

11425


In [141]:
# Fit a TF-IDF weighting on the bag-of-words counts produced above.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = (
    TfidfTransformer()
    .fit(message_bag_of_words)
)


In [142]:
# Apply the fitted TF-IDF weighting to the counts and print the resulting shape.
message_tfidf = tfidf_transformer.transform(message_bag_of_words)
tfidf_shape = message_tfidf.shape
print(tfidf_shape)

(5572, 11425)


In [149]:
# Train a multinomial naive Bayes classifier on the TF-IDF features, using the
# 'label' column as the target.
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(
    X=message_tfidf,
    y=df_spamCollection['label'],
)

In [152]:
# Sanity-check the model on a single message by comparing its predicted label
# with the true label stored in the data set. Message #2 is used here; change
# message_index to inspect another row (e.g. 5).
# NOTE: this row was part of the data the model was fitted on, so a match here
# says nothing about performance on unseen messages.
message_index = 2
message = df_spamCollection['email'][message_index]
bag_of_words_for_message = bag_of_words_transformer.transform([message])
tfidf = tfidf_transformer.transform(bag_of_words_for_message)

print('predicted', spam_detect_model.predict(tfidf)[0])
# The expected value is the ground-truth label, not a second call to predict().
print('expected', df_spamCollection['label'][message_index])

predicted spam
expected spam
