#  Text Preprocessing

In [4]:
# download amazon reviews dataset from kaggle
import pandas as pd

df = pd.read_csv("./data/amazon_reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


#### Lowercasing
- uniformity
- no extra complexity due to uppercase and lowercase of same words

In [6]:
df['reviewText'][3].lower()

"this think has worked out great.had a diff. bran 64gb card and if went south after 3 months.this one has held up pretty well since i had my s3, now on my note3.*** update 3/21/14i've had this for a few months and have had zero issue's since it was transferred from my s3 to my note3 and into a note2. this card is reliable and solid!cheers!"

In [None]:
df['review'] = df['review'].str.lower()

#### HTML Tags Removal
- in case of web scraped data, html tags are present
- not required for many applications in NLP

In [7]:
sample_text = """<a href="/wiki/Wikipedia:Purpose" title="Wikipedia:Purpose">Wikipedia's purpose</a> is to benefit readers by presenting information on all branches of <a href="/wiki/Knowledge" title="Knowledge">knowledge</a>. 
Hosted by the <a href="/wiki/Wikipedia:Wikimedia_Foundation" title="Wikipedia:Wikimedia Foundation">Wikimedia Foundation</a>, 
it consists of <a href="/wiki/Help:Editing" title="Help:Editing">freely editable</a> content, whose articles also have 
numerous links to guide readers towards more information."""

In [9]:
import re

def remove_html(text):
    return re.sub('<.*?>', '', text)

remove_html(sample_text)

"Wikipedia's purpose is to benefit readers by presenting information on all branches of knowledge. \nHosted by the Wikimedia Foundation, \nit consists of freely editable content, whose articles also have \nnumerous links to guide readers towards more information."

In [None]:
df['review'] = df['review'].apply(removehtml) # apply on whole column

#### Renove URLs

In [13]:
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

remove_url("https://www.dummyewebsite.com is just a fake website name")

' is just a fake website name'

#### Remove Punctuations
- extra tokens of punctuations or sometimes punctuations are also part of tokenized words thus adding extra words
- most of the time punctuations are not required

In [21]:
%%time 

# slow execution
import string, time
exclude = string.punctuation

def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text

df['reviewText'] = df['reviewText'].astype(str).apply(remove_punc)

CPU times: user 53.1 ms, sys: 0 ns, total: 53.1 ms
Wall time: 51.8 ms


In [22]:
%%time

# faster
def remove_punc_fast(text):
    return text.translate(str.maketrans('', '', exclude))

df['reviewText'] = df['reviewText'].astype(str).apply(remove_punc)

CPU times: user 39.1 ms, sys: 2.9 ms, total: 42 ms
Wall time: 40.6 ms


#### Chat Word Treatment
- Get list of short forms and their expansions
- create a dict of slang and its expansion
- replace short slang with its expansion 

In [26]:
# create dict of slang
chat_words = """AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
TFW = That feeling when. TFW internet slang often goes in a caption to an image.
MFW = My face when
MRW = My reaction when
IFYP = I feel your pain
LOL = Laughing out loud
TNTL = Trying not to laugh
JK = Just kidding
IDC = I don’t care
ILY = I love you
IMU = I miss you
ADIH = Another day in hell
IDC = I don’t care
ZZZ = Sleeping, bored, tired
WYWH = Wish you were here
TIME = Tears in my eyes
BAE = Before anyone else
FIMH = Forever in my heart
BSAAW = Big smile and a wink
BWL = Bursting with laughter
LMAO = Laughing my a** off
BFF = Best friends forever
CSL = Can’t stop laughing"""

word_pairs = chat_words.split("\n")

chat_dict = {}
for pair in word_pairs:
    key, value = pair.split('=')
    chat_dict[key] = value
# chat_dict

In [30]:
def chat_conversion(text):
    new_text = []
    for w in text.split():
        if w.upper() in chat_dict:
            new_text.append(chat_dict[w.upper()])
        else:
            new_text.append(w)
    return ' '.join(new_text)

chat_conversion("rofl how this happened with you, lol !")

'Rolling On The Floor Laughing how this happened with you, Laughing Out Loud !'

#### Spelling Correction

In [31]:
from textblob import TextBlob

def correct_spell(text):
    text_blob = TextBlob(text)
    return text_blob.correct().string

correct_spell('certain conditionas duriing severall ggeneration')

'certain conditions during several generation'

#### Remove stop words
- nltk has list of stop words
- stop words **need not be removed for POS tagging**

In [33]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/op/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [41]:
from nltk.corpus import stopwords
stopwords.words('english')

def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return ' '.join(x)

remove_stopwords(df['reviewText'][5])

'Its mini storage It doesnt  anything else    supposed  I purchased   add additional storage   Microsoft Surface Pro tablet   come  64  128 GB It    supposed   SanDisk   long standing reputation  speaks  '

#### Handling Emoji

Two approaches to handle based on requirement:
- remove emoji
- replace with their name word

In [46]:
# remove
import re

def remove_emoji(text):
    emoji_pattern = re.compile("["
                              u"\U0001F600-\U0001F64F"   #emoticons
                              u"\U0001F300-\U0001F5FF"   #symbols and pictographs
                              u"\U0001F680-\U0001F6FF"   #transport and map symbols
                              u"\U00012702-\U000127B0"   #flags
                              u"\U000124C2-\U0001F251"
                              "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

remove_emoji('Python is 🔥')

'Python is '

In [None]:
!pip install emoji

In [43]:
# replace
import emoji

print(emoji.demojize('Python is 🔥'))

Python is :fire:


### Tokenisation

##### basic using split

In [48]:
paragraph = """Consistency is a crucial aspect of success in many areas of life, including personal growth, professional development, and relationships. Consistency means performing actions and making decisions in a predictable and reliable manner, over time and across different situations. By being consistent, we build trust and credibility with others, and we also create a sense of stability and security for ourselves."""

sentences = paragraph.split('.')

words = [sentence.split() for sentence in sentences]
print(words)

[['Consistency', 'is', 'a', 'crucial', 'aspect', 'of', 'success', 'in', 'many', 'areas', 'of', 'life,', 'including', 'personal', 'growth,', 'professional', 'development,', 'and', 'relationships'], ['Consistency', 'means', 'performing', 'actions', 'and', 'making', 'decisions', 'in', 'a', 'predictable', 'and', 'reliable', 'manner,', 'over', 'time', 'and', 'across', 'different', 'situations'], ['By', 'being', 'consistent,', 'we', 'build', 'trust', 'and', 'credibility', 'with', 'others,', 'and', 'we', 'also', 'create', 'a', 'sense', 'of', 'stability', 'and', 'security', 'for', 'ourselves'], []]


##### using regEx

In [50]:
sentences = re.compile('[ .!?]').split(paragraph)
print(sentences)

['Consistency', 'is', 'a', 'crucial', 'aspect', 'of', 'success', 'in', 'many', 'areas', 'of', 'life,', 'including', 'personal', 'growth,', 'professional', 'development,', 'and', 'relationships', '', 'Consistency', 'means', 'performing', 'actions', 'and', 'making', 'decisions', 'in', 'a', 'predictable', 'and', 'reliable', 'manner,', 'over', 'time', 'and', 'across', 'different', 'situations', '', 'By', 'being', 'consistent,', 'we', 'build', 'trust', 'and', 'credibility', 'with', 'others,', 'and', 'we', 'also', 'create', 'a', 'sense', 'of', 'stability', 'and', 'security', 'for', 'ourselves', '']


##### using NLTK

In [52]:
from nltk.tokenize import word_tokenize, sent_tokenize

para1 = """Consistency also helps us to build habits and routines that support our well-being and success. By consistently practicing healthy habits, such as exercise, meditation, and healthy eating, we can improve our physical and mental health and increase our productivity and creativity."""

sentences = sent_tokenize(para1)
print(sentences)

for sent in sentences:
    print(word_tokenize(sent))

['Consistency also helps us to build habits and routines that support our well-being and success.', 'By consistently practicing healthy habits, such as exercise, meditation, and healthy eating, we can improve our physical and mental health and increase our productivity and creativity.']
['Consistency', 'also', 'helps', 'us', 'to', 'build', 'habits', 'and', 'routines', 'that', 'support', 'our', 'well-being', 'and', 'success', '.']
['By', 'consistently', 'practicing', 'healthy', 'habits', ',', 'such', 'as', 'exercise', ',', 'meditation', ',', 'and', 'healthy', 'eating', ',', 'we', 'can', 'improve', 'our', 'physical', 'and', 'mental', 'health', 'and', 'increase', 'our', 'productivity', 'and', 'creativity', '.']


##### using spaCy

In [None]:
!pip install spacy

In [None]:
!python3 -m spacy download en_core_web_sm

In [59]:
import spacy

nlp = spacy.load('en_core_web_sm')

sent1 = "Natural Language Processing (NLP) is a field of computer science that focuses on the interaction between computers and human language."
sent2 = "Python is a popular programming language for NLP, with libraries such as NLTK, Spacy, and Gensim."
sent3 = "NLP tasks include text classification, sentiment analysis, and named entity recognition."
sent4 = "In NLP, text data is preprocessed using techniques such as tokenization, stopword removal, and stemming."

doc1 = nlp(sent1)
doc2 = nlp(sent2)
doc3 = nlp(sent3)
doc4 = nlp(sent4)

for token in doc2:
    print(token)

Python
is
a
popular
programming
language
for
NLP
,
with
libraries
such
as
NLTK
,
Spacy
,
and
Gensim
.


## Stemming and Lemmatization

##### Inflection
- Modification of a word to express diferent grammaticalcategories such as tense case, voice, aspect, person, number, gender and mood
    - Ex: walk, walking, walked

#### Stemming
- Reducing inflection in words to their root forms such mapping group of words to the same stem even if **the stem itself may not be a valid word in the language**.
- extracting root words
- used in Information Retrieval systems - search engines

- Different stemming algos:
    - **Porter Stemmer**:
        - Used in English text
    - **Snowball Stemmer**:
        - Used in other languages

##### Porter Stemmer

In [62]:
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

sample = "Walk walked walking undoable"

stem_words(sample)

'walk walk walk undoabl'

#### Lemmatization
- Same as stemming but the stem is always a valid language word
- Slow process
- If user is to be shown output, consider lemmatization else stemming
- Root word is called 'lemma'. **Lemma** is the canonical form, dict form or citation form of a set of words.
- works by searching in dictionary and finding the exact lemma.

In [64]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/op/nltk_data...


True

In [69]:
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

sentence = "He was running for 5 kilometers, he got tired and lied down on the ground for 20 minutes"
punctuations = "?.!,;"
sentence_words = nltk.word_tokenize(sentence)

for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)

print("{0:20}{1:20}".format("Word", "Lemma"))
print("------"*5)
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word, pos='v')))

Word                Lemma               
------------------------------
He                  He                  
was                 be                  
running             run                 
for                 for                 
5                   5                   
kilometers          kilometers          
he                  he                  
got                 get                 
tired               tire                
and                 and                 
lied                lie                 
down                down                
on                  on                  
the                 the                 
ground              grind               
for                 for                 
20                  20                  
minutes             minutes             
