# NLP Basics: Reading in text data & why do we need to clean the text?

### Read in semi-structured text data

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read in the raw text
rawData = open("/content/drive/MyDrive/NLP/Ch01/01_03/Start/SMSSpamCollection.tsv").read()

# Print the raw data
rawData[0:1000]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aids patent.\nham\tI HAVE A DATE ON SUNDAY WITH WILL!!\nham\tAs per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune\nspam\tWINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.\nspam\tHad your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera fo

In [5]:
print(len(rawData))

476846


In [6]:
parsedData = rawData.replace('\t', '\n').split('\n')

In [8]:
parsedData[0:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [10]:
labelList = parsedData[0::2]
textList = parsedData[1::2]

In [12]:
print(labelList[0:5])
print(textList[0:5])

['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


In [13]:
import pandas as pd

In [14]:
corpus = pd.DataFrame({'label':labelList, 'bodylist': textList})

ValueError: ignored

In [15]:
print(len(labelList))

5571


In [16]:
print(len(textList))

5570


In [18]:
print(labelList[-5:])

['ham', 'ham', 'ham', 'ham', '']


In [19]:
corpus = pd.DataFrame({'label':labelList[:-1], 'bodylist': textList})

In [20]:
len(corpus)

5570

In [21]:
corpus

Unnamed: 0,label,bodylist
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
...,...,...
5565,spam,This is the 2nd time we have tried 2 contact u...
5566,ham,Will ü b going to esplanade fr home?
5567,ham,"Pity, * was in mood for that. So...any other s..."
5568,ham,The guy did some bitching but I acted like i'd...


In [22]:
dataset = pd.read_csv('/content/drive/MyDrive/NLP/Ch01/01_03/Start/SMSSpamCollection.tsv', sep="\t", header=None)

In [23]:
dataset.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [27]:
dataset.columns = ['label', 'body_text']

In [28]:
dataset.shape

(5568, 2)

In [31]:
dataset.isnull().sum()

label        0
body_text    0
dtype: int64

In [34]:
print("dataset has {} rows and {} columns". format(len(dataset), len(dataset.columns)))

dataset has 5568 rows and 2 columns


In [37]:
print("Out of {} rows, {} are spam, {} are ham".format(len(dataset), len(dataset[dataset['label']=='spam']), len(dataset[dataset['label']=='ham'])))

Out of 5568 rows, 746 are spam, 4822 are ham


In [41]:
print("Number of {} null in labels".format(dataset['label'].isnull().sum()))

Number of 0 null in labels


In [42]:
print("Number of {} null in bodytext".format(dataset['body_text'].isnull().sum()))

Number of 0 null in bodytext


# Regular Expression

In [43]:
import re

re_test = 'This is a made up string to test 2 different regex methods'
re_test_messy = 'This      is a made up     string to test 2    different regex methods'
re_test_messy1 = 'This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods'

In [44]:
re.split('\s',re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [45]:
re.split('\s',re_test_messy)

['This',
 '',
 '',
 '',
 '',
 '',
 'is',
 'a',
 'made',
 'up',
 '',
 '',
 '',
 '',
 'string',
 'to',
 'test',
 '2',
 '',
 '',
 '',
 'different',
 'regex',
 'methods']

In [50]:
re.split('\s+',re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [51]:
re.split('\s+',re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods']

In [52]:
re.split('\W+',re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [59]:
re.findall('\S+',re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [55]:
re.findall('\S+',re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [56]:
re.findall('\S+',re_test_messy1)

['This-is-a-made/up.string*to>>>>test----2""""""different~regex-methods']

In [57]:
re.findall('\w+',re_test_messy1)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

## Regular Exp replacement

In [60]:
pep8_test = 'I try to follow PEP8 guidelines'
pep7_test = 'I try to follow PEP7 guidelines'
peep8_test = 'I try to follow PEEP8 guidelines'

In [62]:
re.findall('[a-z]+', pep8_test)

['try', 'to', 'follow', 'guidelines']

In [63]:
re.findall('[A-Z]+', pep8_test)

['I', 'PEP']

In [64]:
re.findall('[A-Z]+[0-9]+', pep8_test)

['PEP8']

In [68]:
re.sub('[A-Z]+[0-9]+', 'Peep 8 Python styleguide', peep8_test)

'I try to follow Peep 8 Python styleguide guidelines'

# Remove Punctuation

In [71]:
pd.set_option('display.max_colwidth', 100)
dataset = pd.read_csv('/content/drive/MyDrive/NLP/Ch01/01_03/Start/SMSSpamCollection.tsv', sep="\t", header=None)
dataset.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [76]:
dataset.columns = ['label', 'body_text']

In [72]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [88]:
def remove_punc(text):
    text_nopun = "".join([char for char in text if char not in string.punctuation])
    return text_nopun




In [89]:
dataset['Clean_text'] = dataset['body_text'].apply(lambda x: remove_punc(x))
dataset.head()

Unnamed: 0,label,body_text,Clean_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL


# Tokenization

In [90]:
import re

def tokenization(text):
    token = re.split("\W+", text)
    return token

In [91]:
dataset['tokenized_text'] = dataset['Clean_text']. apply(lambda x: tokenization(x.lower()))

In [92]:
dataset.head()

Unnamed: 0,label,body_text,Clean_text,tokenized_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]"


# Remove stopwords

In [97]:
import nltk

nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [100]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

dataset['text_nostop'] = dataset['tokenized_text'].apply(lambda x: remove_stopwords(x))

dataset.head()

Unnamed: 0,label,body_text,Clean_text,tokenized_text,text_nostop
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...,Ive been searching for the right words to thank you for this breather I promise i wont take your...,"[ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...","[ive, searching, right, words, thank, breather, promise, wont, take, help, granted, fulfil, prom..."
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
2,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.,Even my brother is not like to speak with me They treat me like aids patent,"[even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]","[even, brother, like, speak, treat, like, aids, patent]"
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,I HAVE A DATE ON SUNDAY WITH WILL,"[i, have, a, date, on, sunday, with, will]","[date, sunday]"
