In [1]:
#Program 4 - Preprocessing the data

In [2]:
#Importing Data

In [3]:
#importing pandas library
import pandas as pd

#the default max_colwidth is 50 characters; raising it to 100 lets more of each message show in the dataframe preview
pd.set_option('display.max_colwidth',100)

#location of the tab-separated SMS dataset - change this single line to point at your own copy of the file
DATA_PATH = "E:/Training/Certifications - LinkedIn/NLP/Dataset/SMSSpamCollection.tsv"

#importing data
#header=None: the first line of the file is read as data, so the column names are supplied through names
data = pd.read_csv(DATA_PATH, sep='\t', header=None, names=['Label','Text'])

#displaying first 5 rows of the dataframe
data.head()

Unnamed: 0,Label,Text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [4]:
#Removing Punctuation

In [5]:
#importing string package
import string

#string.punctuation is the built-in string of ASCII punctuation characters
punctuations = string.punctuation

#custom helper that strips punctuation from a piece of text
def remove_punctuation(text):
    """Return ``text`` with every character found in ``punctuations`` removed.

    The surviving characters keep their original order and are glued back
    together with an empty separator, so words and spaces stay as they were.
    """
    kept_chars = []
    for symbol in text:
        #skip anything that is a punctuation character
        if symbol in punctuations:
            continue
        kept_chars.append(symbol)
    return "".join(kept_chars)

#running remove_punctuation over every message and storing the result in a new column
data['Text_NoPunctuation'] = data['Text'].apply(remove_punctuation)

#previewing the first 5 rows, now including the punctuation-free column
data.head()

Unnamed: 0,Label,Text,Text_NoPunctuation
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


In [6]:
#Performing Tokenization 

In [7]:
#importing regular expression package
import re

#creating tokenization method (custom)
def tokenization(text):
    """Split ``text`` into word tokens.

    A token is a maximal run of word characters (the regex class ``\\w``);
    any run of other characters acts as a separator. The case of the text
    is left untouched.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance. The list is empty when ``text``
        contains no word characters; it never contains empty strings.
    """
    #raw string: '\W' / '\w' inside a normal literal is an invalid escape sequence and warns on current Python
    #findall on \w+ yields the same tokens as splitting on \W+, but without the empty
    #strings that re.split returns when the text starts or ends with a separator
    tokens = re.findall(r'\w+', text)
    return tokens

#tokenizing every punctuation-free message
#each message is lower-cased first so that tokens differing only in case (e.g. "Free" / "free") come out identical
data['Text_Tokenized'] = data['Text_NoPunctuation'].apply(
    lambda message: tokenization(message.lower())
)

#previewing the first 5 rows, now including the tokenized column
data.head()

Unnamed: 0,Label,Text,Text_NoPunctuation,Text_Tokenized
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"


In [8]:
#Removing Stopwords

In [9]:
#importing nltk library
import nltk

#extracting NLTK's built-in list of stopwords for the English language
#NOTE(review): presumably needs the NLTK 'stopwords' corpus to be available locally (e.g. nltk.download('stopwords')) - confirm
stopwords = nltk.corpus.stopwords.words('english')

#creating a custom method for removing stopwords
def remove_stopwords(tokenized_list):
    """Return the tokens of ``tokenized_list`` that are not in ``stopwords``.

    The relative order of the surviving tokens is preserved and the result
    is always a new list.
    """
    #the comparison is an exact match against the module-level stopwords collection
    return list(filter(lambda token: token not in stopwords, tokenized_list))

#filtering the stopwords out of every token list and storing the result in a new column
data['Text_NoStopwords'] = data['Text_Tokenized'].apply(remove_stopwords)

#previewing the first 5 rows, now including the stopword-free column
data.head()

Unnamed: 0,Label,Text,Text_NoPunctuation,Text_Tokenized,Text_NoStopwords
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"
