### Reading SMS

In [2]:
messages = [line for line in open('SMSSpamCollection')]

print(len(messages), '\n')

for message_no, message in enumerate(messages[:3]):
    print(message_no, message)

5574 

0 ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...

1 ham	Ok lar... Joking wif u oni...

2 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's




### Converting into Pandas 

In [3]:
import numpy as np
import pandas as pd
import csv
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

In [4]:
# Quoting - Remove those words whcich are not part of English
# e.g. German, French etc.

messages = pd.read_csv('SMSSpamCollection', sep = '\t', quoting = csv.QUOTE_NONE, names = ['label', 'message'])
messages.head(5)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
messages.message.values[1]

'Ok lar... Joking wif u oni...'

### Exploring Textual Data

In [6]:
# Group By

print('Aggregate Statistics of Messages : ')
messages.groupby('label').describe()

Aggregate Statistics of Messages : 


Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4827,4518,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [7]:
messages.describe()      # Describe behaves according to Data-Types, we have here is textutal data

Unnamed: 0,label,message
count,5574,5574
unique,2,5171
top,ham,"Sorry, I'll call later"
freq,4827,30


#### Length of Messages

In [8]:
print('Length of initial few Messages : ')

messages['Length'] = messages['message'].map(lambda text: len(text))
messages.head()

Length of initial few Messages : 


Unnamed: 0,label,message,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [9]:
print(messages.Length.describe())

count    5574.000000
mean       80.478292
std        59.848302
min         2.000000
25%        36.000000
50%        62.000000
75%       122.000000
max       910.000000
Name: Length, dtype: float64


In [10]:
print(list(messages.message[messages.Length > 900]))

["For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."]


#### Tokenization 

In [11]:
from textblob import TextBlob

def split_into_tokens(message):
    return TextBlob(message).words

In [12]:
TextBlob('I am into Machine Learning. I\'d love to play cricket').words

WordList(['I', 'am', 'into', 'Machine', 'Learning', 'I', "'d", 'love', 'to', 'play', 'cricket'])

In [13]:
print('Tokenized Messages : \n')
print(messages.message.head().apply(split_into_tokens))

Tokenized Messages : 

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, do, n't, think, he, goes, to, usf, he...
Name: message, dtype: object


#### Part of Speech Tags (POS) 

In [14]:
# Penn Treebank Project for Tagging of words - o/p will be (word, tag)
# Tags = Noun, Vern and Adjective

print(TextBlob('This is for checking tags here').tags)

[('This', 'DT'), ('is', 'VBZ'), ('for', 'IN'), ('checking', 'VBG'), ('tags', 'NNS'), ('here', 'RB')]


In [15]:
"""
Verbs are the Words having tags [VB, VBD, VBG, VBN, VBP, VBZ]
Adjective are the Words having tags [JJ, JJR, JJS]
Nouns are the Words having tags [NN, NNP, NNS]
"""

'\nVerbs are the Words having tags [VB, VBD, VBG, VBN, VBP, VBZ]\nAdjective are the Words having tags [JJ, JJR, JJS]\nNouns are the Words having tags [NN, NNP, NNS]\n'

### Lemmatization 

In [16]:
def split_into_lemmas(message):
    words = TextBlob(message).words
    lemma_form = [word.lemma for word in words]
    return lemma_form

In [17]:
print('After lemmatization, messages are : \n')
print(messages.message.head().apply(split_into_lemmas))

After lemmatization, messages are : 

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, do, n't, think, he, go, to, usf, he, ...
Name: message, dtype: object


### Label Encoding 

#### Vectorization - CountVectorizer - Bag of Words (BOW)

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# General Example available on SKLEARN CountVectorizer page

corpus = [
     'This is the first document.',
     'This document is the second document.',
     'And this is the third one.',
     'Is this the first document?',
]

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names())
print(X.toarray())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [45]:
# Convert SMS's to vectors

print('Data to Vectors - Convert the messages to a matrix of token counts: ')

# Creating object
vectorizer = CountVectorizer(analyzer=split_into_lemmas)

# Creating a sort of Model (Transformer in NLP)
bow_transformer = vectorizer.fit(messages['message'])
#X = vectorizer.fit(messages['message'])

Data to Vectors - Convert the messages to a matrix of token counts: 


In [29]:
# Vocabulary - All unique words in all messages

print(len(bow_transformer.vocabulary_))

11010


In [51]:
# Taking single message into consideration

message4 = messages.message[3]                # Fourth message in the file
                                              # U dun say so early hor... U c already then say...

# Tranform Messages into numerical form - term frequency value
bow4 = bow_transformer.transform([message4])

print(bow4)
print(bow4.shape)


print('\n\nSanity Checking for words occuring 2 times: \n')

print(vectorizer.get_feature_names()[4189])
print(vectorizer.get_feature_names()[9280])

  (0, 4189)	2
  (0, 4762)	1
  (0, 5363)	1
  (0, 6219)	1
  (0, 6243)	1
  (0, 7137)	1
  (0, 9280)	2
  (0, 9589)	1
  (0, 10054)	1
(1, 11010)


Sanity Checking for words occuring 2 times: 

U
say


In [34]:
# Taking all messages into consideration

print('Using BOW transformer, tranform all messages : \n')

# Tranform Messages in numerical form
messages_bow = bow_transformer.transform(messages['message'])

print('Sparse matrix shape : ', messages_bow.shape)
print('Number of Non-Zeros : ', messages_bow.nnz)

# Sparcity is the Ratio of ZEROS
print('Sparcity : %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1])))

Using BOW transformer, tranform all messages : 

Sparse matrix shape :  (5574, 11010)
Number of Non-Zeros :  81623
Sparcity : 0.13%


#### Vectorization - TF IDF

In [40]:
from sklearn.feature_extraction.text import TfidfTransformer

In [52]:
print('TF-IDF fit and transform : ')

# Create object
transformer = TfidfTransformer()

# Creating a sort of Model (Transformer in NLP)
tfidf_transformer = transformer.fit(messages_bow)

TF-IDF fit and transform : 


In [53]:
# Taking single message into consideration
# Fourth message

# Tranform Messages into numerical form - Tf-Idf
tfidf4 = tfidf_transformer.transform(bow4)

print(tfidf4)

  (0, 10054)	0.22510385070095637
  (0, 9589)	0.1955442748962185
  (0, 9280)	0.49597495370832545
  (0, 7137)	0.4269339327922034
  (0, 6243)	0.3100112284407115
  (0, 6219)	0.2913528957227454
  (0, 5363)	0.2860779240943588
  (0, 4762)	0.25892595706356525
  (0, 4189)	0.391088549792437
