# 1. Basic methods of Natural Language Processing (NLP)

# Tokenization, Lemmatization and Stop Word Removal 

In [12]:
# importing the libraries
import pandas as pd
import nltk
import sklearn
import gzip

def parse(path):
  """Yield one record per line of a gzip-compressed file.

  Every line of the file is passed to ``eval`` and the resulting object is
  yielded (presumably a dict per review -- verify against the data file).

  Args:
    path: Location of the gzip file to read.

  Yields:
    The object obtained by evaluating each line, in file order.
  """
  # The context manager closes the file as soon as the generator is
  # exhausted, closed early, or fails part-way, so no handle is leaked.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY: eval() executes whatever code a line contains. Only run this
      # on files from a trusted source; if the file is strict JSON or plain
      # literals, json.loads / ast.literal_eval would be safer alternatives.
      yield eval(l)

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame.

  Records are numbered from 0 in the order they are yielded; that number
  becomes the row index and each record supplies one row.

  Args:
    path: File location forwarded unchanged to ``parse``.

  Returns:
    A pandas DataFrame with one row per parsed record.
  """
  # Key each record by its 0-based position so orient='index' maps keys to rows.
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the gzipped review file into a DataFrame (one row per parsed record)
df = getDF('reviews_Cell_Phones_and_Accessories_5.json.gz')

# Take the first 5 rows as a small working sample
see = df.head(5)
print(see)

       reviewerID        asin      reviewerName helpful  \
0  A30TL5EWN6DFXT  120401325X         christina  [0, 0]   
1   ASY55RVNIL0UD  120401325X          emily l.  [0, 0]   
2  A2TMXE2AFO7ONB  120401325X             Erica  [0, 0]   
3   AWJ0WZQYMYFQ4  120401325X                JM  [4, 4]   
4   ATX7CZYFXI1KW  120401325X  patrice m rogoza  [2, 3]   

                                          reviewText  overall  \
0  They look good and stick good! I just don't li...      4.0   
1  These stickers work like the review says they ...      5.0   
2  These are awesome and make my phone look so st...      5.0   
3  Item arrived in great time and was in perfect ...      4.0   
4  awesome! stays on, and looks great. can be use...      5.0   

                                     summary  unixReviewTime   reviewTime  
0                                 Looks Good      1400630400  05 21, 2014  
1                      Really great product.      1389657600  01 14, 2014  
2                         

In [13]:
# Display only the "reviewText" column of the sample (as a one-column DataFrame)
see[["reviewText"]]

Unnamed: 0,reviewText
0,They look good and stick good! I just don't li...
1,These stickers work like the review says they ...
2,These are awesome and make my phone look so st...
3,Item arrived in great time and was in perfect ...
4,"awesome! stays on, and looks great. can be use..."


In [14]:
# Take the review text of row 0 as the example sentence used by the cells below
ls_txt = see["reviewText"][0]
ls_txt

"They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I just won't buy a product like this again"

In [None]:
# Basic text analysis consists of the following steps:
# 1: Tokenization
# 2: Lemmatization
# 3: Stop Word Removal

In [26]:
# Tokenization
# The process of breaking text into smaller chunks (tokens).

# Create a whitespace tokenizer: it splits on white space only, so punctuation
# stays attached to the neighbouring word (e.g. 'good!' and "don't").
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokens = tokenizer.tokenize(ls_txt)
print(tokens)
# Number of tokens produced
len(tokens)

['They', 'look', 'good', 'and', 'stick', 'good!', 'I', 'just', "don't", 'like', 'the', 'rounded', 'shape', 'because', 'I', 'was', 'always', 'bumping', 'it', 'and', 'Siri', 'kept', 'popping', 'up', 'and', 'it', 'was', 'irritating.', 'I', 'just', "won't", 'buy', 'a', 'product', 'like', 'this', 'again']


37

In [27]:
# Sometimes we need to tokenize on punctuation as well as on white space.
# This can be done with TreebankWordTokenizer. For example, the word "don't" is split into 'do' and "n't".
# NOTE: the tokenizer instance is bound to `tokenize` here, not `tokenizer` as in the other cells.

tokenize=nltk.tokenize.TreebankWordTokenizer()
tokens = tokenize.tokenize(ls_txt)
print(tokens)
# Number of tokens produced
len(tokens)

['They', 'look', 'good', 'and', 'stick', 'good', '!', 'I', 'just', 'do', "n't", 'like', 'the', 'rounded', 'shape', 'because', 'I', 'was', 'always', 'bumping', 'it', 'and', 'Siri', 'kept', 'popping', 'up', 'and', 'it', 'was', 'irritating.', 'I', 'just', 'wo', "n't", 'buy', 'a', 'product', 'like', 'this', 'again']


40

In [28]:
# WordPunctTokenizer emits punctuation such as , . ' and " as individual tokens,
# so "don't" becomes 'don', "'", 't'.

tokenizer=nltk.tokenize.WordPunctTokenizer()
tokens = tokenizer.tokenize(ls_txt)
print(tokens)
# Number of tokens produced
len(tokens)

['They', 'look', 'good', 'and', 'stick', 'good', '!', 'I', 'just', 'don', "'", 't', 'like', 'the', 'rounded', 'shape', 'because', 'I', 'was', 'always', 'bumping', 'it', 'and', 'Siri', 'kept', 'popping', 'up', 'and', 'it', 'was', 'irritating', '.', 'I', 'just', 'won', "'", 't', 'buy', 'a', 'product', 'like', 'this', 'again']


43

In [None]:
# After breaking the text into individual words (tokens), we want to ensure that each of them is converted into its base form.
# This base form is called the lemma. The conversion can be done using lemmatization or stemming. The key difference between them is described below.

# Stemming: It reduces different forms of a word by chopping off suffixes. It often produces truncated strings that are not real words.

# Lemmatization: It converts different forms of the same word to a common base by considering the morphological context of the text.
# For instance, 'study' and 'studying' are treated in the same way and are both converted to 'study'.

In [71]:
# Normalisation: stemming and lemmatization
# Next we want to ensure that the tokens in the text are normalised,
# meaning that 'study' and 'studying' are treated in the same way.
# This can be achieved with either stemming or lemmatization.

# Stemming: chops off suffixes (Porter stemming method). Its disadvantage is that it can produce non-words.
# Lemmatization: takes usage and morphology into account and returns the base
# (dictionary) form of the word, which is known as the lemma.
# For lemmatization we use the WordNetLemmatizer found in the NLTK library.

# Re-tokenize the example with TreebankWordTokenizer; these tokens are the input
# for the stemming and lemmatization cells below.
tokenizer= nltk.tokenize.TreebankWordTokenizer()
tokens=tokenizer.tokenize(ls_txt)
print(tokens)
# Number of tokens produced
len(tokens)

['They', 'look', 'good', 'and', 'stick', 'good', '!', 'I', 'just', 'do', "n't", 'like', 'the', 'rounded', 'shape', 'because', 'I', 'was', 'always', 'bumping', 'it', 'and', 'Siri', 'kept', 'popping', 'up', 'and', 'it', 'was', 'irritating.', 'I', 'just', 'wo', "n't", 'buy', 'a', 'product', 'like', 'this', 'again']


40

# use Stemming

In [63]:
# Run every token through the Porter stemmer and glue the stems back into one string
stemmer = nltk.stem.PorterStemmer()
" ".join(map(stemmer.stem, tokens))

"they look good and stick good ! I just do n't like the round shape becaus I wa alway bump it and siri kept pop up and it wa irritating. I just wo n't buy a product like thi again"

In [64]:
# We can see the following things:
# 'rounded' has been converted to 'round'
# 'bumping' has been converted to 'bump'
# In general, 'sses' endings are converted to 'ss'. Example: caresses --> caress
# 'ies' --> 'i'. Example: studies --> studi
# 's' --> singular form. Example: cats --> cat

# use Lemmatizer

In [65]:
# Now apply the lemmatizer and compare its output with the stemmer's.
# NOTE(review): WordNetLemmatizer presumably needs the 'wordnet' corpus; on a fresh
# environment the download below may have to be uncommented -- TODO confirm.
# nltk.download('wordnet')
# print(ls_txt)
lemma = nltk.stem.WordNetLemmatizer()
" ".join(lemma.lemmatize(i,'v') for i in tokens)
# The extra argument 'v' (verb) explicitly specifies the part of speech (POS).
# If the POS is not specified, WordNetLemmatizer treats every word as a noun, so
# different inflected forms of the same verb are not reduced to a single lemma.

"They look good and stick good ! I just do n't like the round shape because I be always bump it and Siri keep pop up and it be irritating. I just wo n't buy a product like this again"

# Explaining the extra argument 'v' in WordNetLemmatizer()

In [66]:
# Without a POS argument
" ".join(lemma.lemmatize(i) for i in ['run','running','ran'])
# As the output shows, without the POS the verb forms are not lemmatized properly

'run running ran'

In [67]:
# With the POS given as 'v' (verb), all three forms are reduced to the same lemma
" ".join(lemma.lemmatize(i,'v') for i in ['run','running','ran'])

'run run run'

# StopWord Removal

In [68]:
# After breaking up and normalizing the text, we also need to remove articles, determiners, prepositions, etc.
# In most situations it is necessary to remove them, as they add little meaning to the analysis.
# The process of removing them is known as stop word removal.
# NLTK ships a ready-made list of stop words (nltk.corpus.stopwords) that can be used to clean text.
# Moreover, we can create our own stop word list and use it to clean the text, as shown below:

In [84]:
# nltk is already imported in the first cell; the repeated import is redundant but harmless
import nltk
# Fetch the NLTK 'stopwords' corpus (reported as already up-to-date when it is present locally)
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Drunkbear\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:
# Stop word removal
# Sentences often contain words that carry no additional information,
# such as 'a', 'is', 'the', etc.
# They are removed before any further text analysis.

# English stop words from NLTK, held in a set
stop = set(stopwords.words('english'))
print(stop, '\n')

# Lemmatize every token as a verb and rebuild the sentence
sentnc = " ".join([lemma.lemmatize(token, 'v') for token in tokens])
print(sentnc, '\n')

# Lowercase the sentence, split it on white space and keep the non-stop words
clen_sentnc = [
    word
    for word in sentnc.lower().split()
    if word not in stop
]
print(clen_sentnc)
len(clen_sentnc)

{'his', 'themselves', 'theirs', "shouldn't", 'mustn', 'under', 'aren', 'ours', "mustn't", "should've", 'at', 'did', 'as', 'from', 'yours', 'who', 'when', 'a', 'she', 'by', 'more', 'won', 'again', 'once', 'all', 'very', 'will', 'hers', 'on', 'you', 'are', 'have', 'hadn', 'during', 'between', 'wouldn', 'wasn', 'after', 'both', 'where', 'your', "don't", 'until', 'too', 're', 'y', 'haven', 'here', 'don', 'such', 'd', 'having', "that'll", "hasn't", "weren't", 'while', 'him', 'because', 'he', 'am', 'is', 'be', 'i', 'which', 'didn', 'the', 'down', 'out', "aren't", "needn't", 'had', 'same', "couldn't", 'o', 'ma', 'm', 'shouldn', 'to', 'most', "won't", 'over', 's', 'were', 'no', 'should', 'can', "hadn't", 'my', 'that', 'further', 'was', 'each', 'me', 'these', 'ain', 'but', 'being', 'has', 'nor', 'with', "mightn't", 'than', 'how', 'myself', 'shan', 'through', 'some', 'before', "wasn't", 'off', 'any', 'himself', 'it', 'them', 'couldn', 'above', 'hasn', 'we', 'there', "you've", 'been', 'doesn', 't

20

In [94]:
# A custom stop word list can be built to filter out additional words.
# For instance, suppose the token 'wo' has to be removed from clen_sentnc.
# Copy the 'stop' set into a list and put the extra word at the end.
stop_new = [*stop, 'wo']

# Check that 'wo' is now the last entry of the new stop list
stop_new[-1]

'wo'

In [97]:
# Filter the cleaned tokens against the extended stop list, which drops 'wo'
ls_new = [word for word in clen_sentnc if word not in stop_new]
print(ls_new)

['look', 'good', 'stick', 'good', '!', "n't", 'like', 'round', 'shape', 'always', 'bump', 'siri', 'keep', 'pop', 'irritating.', "n't", 'buy', 'product', 'like']


# Bag of Words using Term Frequency (TF and ngrams)

In [98]:
# Let's look at the number of times each word appears in the list.
# ngrams and FreqDist are imported from nltk for word-frequency work, but they are not
# used in this cell: the counts below come from pandas' value_counts.
from nltk import ngrams, FreqDist

# Wrap the cleaned tokens in a Series; normalize=False gives raw counts rather than proportions
s1 = pd.Series(ls_new)
print(s1.value_counts(normalize=False))

n't            2
like           2
good           2
bump           1
always         1
round          1
irritating.    1
look           1
keep           1
shape          1
stick          1
pop            1
buy            1
!              1
product        1
siri           1
dtype: int64


# (to be continued................)