In [1]:
#Program 5 - Stemming the data

In [2]:
#Importing Data

In [3]:
#importing required libraries
import os
import pandas as pd

#default colwidth is 50 characters; raise it to 100 so more of each message is visible in a dataframe preview
#(the full option name is used because abbreviated option names may stop resolving in future pandas versions)
pd.set_option('display.max_colwidth',100)

#location of the dataset; set the SMS_SPAM_PATH environment variable to point elsewhere without editing the notebook
DATA_PATH = os.environ.get("SMS_SPAM_PATH", "E:/Training/Certifications - LinkedIn/NLP/Dataset/SMSSpamCollection.tsv")

#importing the tab-separated data into a dataframe; the file is read as header-less and the column names are supplied here
data = pd.read_csv(DATA_PATH, sep='\t', names=['Label','Text'], header=None)

#displaying first 5 rows of the dataframe
data.head()

Unnamed: 0,Label,Text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [4]:
#Removing punctuation, tokenizing and removing stopwords from data

In [5]:
#importing required packages
import string
import re
import nltk

#extracting built-in punctuations and stopwords
punctuation = string.punctuation
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    #the stopwords corpus is not installed on this machine yet; download it once and retry
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

#creating a custom function to remove punctuation, tokenize and remove stopwords from the text
def clean_data(text):
    """Strip punctuation from ``text``, tokenize it and drop stopwords.

    Parameters
    ----------
    text : str
        The message to clean. Case is left untouched, so a caller that wants
        case-insensitive stopword matching should lower-case the text first.

    Returns
    -------
    list of str
        The remaining tokens in their original order. Empty tokens are never
        returned, so empty or punctuation-only text yields ``[]``.
    """
    #sets give constant-time membership tests for every character and token
    punctuation_chars = set(punctuation)
    stop_words = set(stopwords)

    text_nopunctuation = "".join(char for char in text if char not in punctuation_chars)

    #raw string so that \W reaches the regex engine instead of being parsed as a string escape
    text_tokenized = re.split(r'\W+', text_nopunctuation)

    #re.split yields '' when the text is empty or starts/ends with a separator; drop those along with the stopwords
    #NOTE: punctuation is stripped before the stopword check, so "don't" arrives here as "dont"
    text_nostopwords = [word for word in text_tokenized if word and word not in stop_words]
    return text_nostopwords

#lower-casing every message and passing it through clean_data
cleaned_text = data["Text"].apply(lambda message: clean_data(message.lower()))

#storing the token lists next to the original text
data['Cleaned_Text'] = cleaned_text

#previewing the first 5 rows with the new column
data.head()

Unnamed: 0,Label,Text,Cleaned_Text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]"


In [6]:
#Stemming the data

In [7]:
#instantiating nltk's PorterStemmer once at module level; the stemming function below reuses this object for every token
ps = nltk.PorterStemmer()

#creating a function to perform stemming on the cleaned (tokenized) text
def stemming(tokenized_text):
    """Return the stem of every token in ``tokenized_text``, in the same order.

    Each token is passed to ``ps.stem``, where ``ps`` is the module-level
    stemmer object.
    """
    stems = []
    for token in tokenized_text:
        stems.append(ps.stem(token))
    return stems

#stemming every token list of the cleaned column
stemmed_text = data['Cleaned_Text'].apply(stemming)

#storing the stemmed tokens in their own column
data['Stemmed_Text'] = stemmed_text

#previewing the first 5 rows with the stemmed column added
data.head()

Unnamed: 0,Label,Text,Cleaned_Text,Stemmed_Text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,"[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom...","[ive, search, right, word, thank, breather, promis, wont, take, help, grant, fulfil, promis, won..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
2,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,"[even, brother, like, speak, treat, like, aids, patent]","[even, brother, like, speak, treat, like, aid, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,"[date, sunday]","[date, sunday]"
