In [1]:
import pandas as pd, numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("spam.csv", encoding = "latin-1")
data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1, inplace = True)
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
data.rename(columns = {"v1": "label", "v2": "text"}, inplace = True)
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data Analysis

In [4]:
data.describe()

Unnamed: 0,label,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
data.groupby("label").describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [6]:
# make a new column to detect how long the text messages are
data["length"] = data["text"].apply(len)
data.head()

Unnamed: 0,label,text,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [7]:
data.length.describe()

count    5572.000000
mean       80.118808
std        59.690841
min         2.000000
25%        36.000000
50%        61.000000
75%       121.000000
max       910.000000
Name: length, dtype: float64

In [8]:
# let's see what text is present 
data[data["length"]==910]["text"].iloc[0]

"For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."

### Text Pre-processing

In [9]:
from nltk.corpus import stopwords
import string

def text_pre_process(message):
    remove_punc =[char for char in message if char not in string.punctuation]
    remove_punc=''.join(remove_punc)
    return [word for word in remove_punc.split() if word.lower() not in stopwords.words('english')]

In [10]:
# tokenize the text
data["text"].head(5).apply(text_pre_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: text, dtype: object

### Vectorization

Now, we have the text as list of tokens (also known as lemmas) and need to convert each of those texts into a vector for that we can consider SciKit Learn's algorithm models.

Convert each text, represented as a list of tokens (lemmas) into a vector that machine learning models can understand.

Let's do it in three steps using the bag-of-words model:

Count a number of times a word occurs in each message (Known as term frequency)

Weigh the counts, so that frequent tokens get lower weight (inverse document frequency)

Normalize the vectors to unit length, to abstract from the original text length (L2 norm)

In [11]:
#  take one text and get its Bag-of-Words counts as a vector
from sklearn.feature_extraction.text import CountVectorizer

bag_of_words = CountVectorizer(analyzer=text_pre_process).fit(data['text'])
print(len(bag_of_words.vocabulary_))

11304


In [12]:
text_4 = data["text"][3]
print(text_4)

U dun say so early hor... U c already then say...


In [13]:
# vector representation
bag_of_words_4 = bag_of_words.transform([text_4])
print(bag_of_words_4)
print(bag_of_words_4.shape)

  (0, 3996)	2
  (0, 4551)	1
  (0, 5179)	1
  (0, 6118)	1
  (0, 6136)	1
  (0, 7091)	1
  (0, 9445)	2
(1, 11304)


In [14]:
print(bag_of_words.get_feature_names()[3996])

U


In [15]:
# use transform on Bag-of-Words transformed object and transform the entire datafrmae  
text_bag_of_words = bag_of_words.transform(data["text"])

#### Let's get some insight into TF-IDF

Term Frequency(TF):  measures how frequently a term occurs in a document. Since every document is different in length, it is possible that a term would appear much more times in long documents than shorter ones. Thus, the term frequency is often divided by the document length (aka. the total number of terms in the document) as a way of normalization:
     TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
     
Inverse Document Frequency(IDF): measures how important a term is. While computing TF, all terms are considered equally important. However it is known that certain terms, such as "is", "of", and "that", may appear a lot of times but have little importance. Thus we need to weigh down the frequent terms while scale up the rare ones, by computing the following:
     IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer

tf_idf=TfidfTransformer().fit(text_bag_of_words)
tf_idf_4 = tf_idf.transform(bag_of_words_4)
print(tf_idf_4)

  (0, 9445)	0.5391515179363724
  (0, 7091)	0.438375519235493
  (0, 6136)	0.3183143011307023
  (0, 6118)	0.2991551295536476
  (0, 5179)	0.2969195675145299
  (0, 4551)	0.26585776633899993
  (0, 3996)	0.40924708612713756


In [17]:
text_tf_idf=tf_idf.transform(text_bag_of_words)
print(text_tf_idf.shape)

(5572, 11304)


#### Train a model using Naive Bayes classifier

In [18]:
from sklearn.naive_bayes import MultinomialNB

spam_detect_model = MultinomialNB().fit(text_tf_idf,data["label"])

In [19]:
print("predicted label is",spam_detect_model.predict(tf_idf_4)[0])
print("expected label is",data.label[3])

predicted label is ham
expected label is ham


#### Let's see how well the model performs on the overall dataset

In [20]:
predict_all = spam_detect_model.predict(text_tf_idf)
print(predict_all)

['ham' 'ham' 'spam' ... 'ham' 'ham' 'ham']


In [21]:
from sklearn.model_selection import train_test_split

train_txt, test_txt, train_label, test_label = train_test_split(data["text"], data["label"], test_size=0.2)

In [22]:
# use of pipeline allows to store pipeline of workflow and also to set up all the transformations that we will be doing on data for future use
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
   ( 'bow',CountVectorizer(analyzer=text_pre_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',MultinomialNB()),
])

In [23]:
pipeline.fit(train_txt, train_label)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function text_pre_process at 0x000002793ABAD400>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproc...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [24]:
predictions = pipeline.predict(test_txt)

In [25]:
from sklearn.metrics import classification_report

print(classification_report(predictions,test_label))

              precision    recall  f1-score   support

         ham       1.00      0.95      0.98       994
        spam       0.72      1.00      0.83       121

   micro avg       0.96      0.96      0.96      1115
   macro avg       0.86      0.98      0.90      1115
weighted avg       0.97      0.96      0.96      1115

