In [34]:
# Importing the libraries

import nltk
import numpy as np
import sklearn
import pandas as pd

# Sample text shared by every tokenization / normalisation example below.
# Written as adjacent string pieces, one per sentence, for readability;
# the resulting value is a single string.
text = (
    "I don't like to study . "
    "All the concepts can't be learned through studying . "
    "I have studied all life but haven't got any success"
)
print(text)

I don't like to study . All the concepts can't be learned through studying . I have studied all life but haven't got any success


In [3]:
# Any basic text analysis has the following processes
# 1: Tokenization
# 2:Lemmatization
# 3:Stop Word Removal

In [35]:
# Tokenization
# The process of breaking text into smaller chunks (tokens).

# Create a whitespace tokenizer: it splits on whitespace only, so
# contractions such as "don't" stay in one piece.
tokenizer = nltk.tokenize.WhitespaceTokenizer()
tokenizer.tokenize(text)

['I',
 "don't",
 'like',
 'to',
 'study',
 '.',
 'All',
 'the',
 'concepts',
 "can't",
 'be',
 'learned',
 'through',
 'studying',
 '.',
 'I',
 'have',
 'studied',
 'all',
 'life',
 'but',
 "haven't",
 'got',
 'any',
 'success']

In [36]:
# TreebankWordTokenizer splits contractions into a word plus its clitic
# ("don't" -> "do", "n't"), so the negation survives as its own token,
# which is sometimes more meaningful.
tokenize = nltk.tokenize.TreebankWordTokenizer()
tokenize.tokenize(text)

['I',
 'do',
 "n't",
 'like',
 'to',
 'study',
 '.',
 'All',
 'the',
 'concepts',
 'ca',
 "n't",
 'be',
 'learned',
 'through',
 'studying',
 '.',
 'I',
 'have',
 'studied',
 'all',
 'life',
 'but',
 'have',
 "n't",
 'got',
 'any',
 'success']

In [37]:
# WordPunctTokenizer emits punctuation such as , . ' and " as individual
# tokens, so "don't" becomes "don", "'", "t".
tokenizer = nltk.tokenize.WordPunctTokenizer()
tokenizer.tokenize(text)

['I',
 'don',
 "'",
 't',
 'like',
 'to',
 'study',
 '.',
 'All',
 'the',
 'concepts',
 'can',
 "'",
 't',
 'be',
 'learned',
 'through',
 'studying',
 '.',
 'I',
 'have',
 'studied',
 'all',
 'life',
 'but',
 'haven',
 "'",
 't',
 'got',
 'any',
 'success']

In [17]:
# Lemmatization
# Next we want to ensure that tokens in the text are normalised
# Meaning study and studying are treated in the same way
# This can be achieved by using either stemming or lemmatization

#Stemming: Chops off suffixes. Uses the Porter stemming method. Disadvantage is that it results in non-words
#Lemmatization refers to doing things keeping usage and morphology in mind
#It returns the base or dictionary form of the word which is known as lemma
#For lemmatization we use WordNetLemmatizer found in NLTK library

In [38]:
# Stemming example
# Tokenize the text first (Treebank tokenizer); the stemmer and the
# lemmatizer in the following cells work on this token list.
tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)
print(tokens)


['I', 'do', "n't", 'like', 'to', 'study', '.', 'All', 'the', 'concepts', 'ca', "n't", 'be', 'learned', 'through', 'studying', '.', 'I', 'have', 'studied', 'all', 'life', 'but', 'have', "n't", 'got', 'any', 'success']


In [39]:
# Apply the Porter stemmer to every token and show the result as one sentence
stemmer = nltk.stem.PorterStemmer()
" ".join(map(stemmer.stem, tokens))

"I do n't like to studi . all the concept ca n't be learn through studi . I have studi all life but have n't got ani success"

In [22]:
# We can see the following things
# learned has been converted to learn
# studying has been converted to studi
# In general sses forms get converted to ss: Example caresses-->caress
# ies --> i : Example studies-->studi
# s-->singular form: Example cats-->cat

In [41]:
# Use the WordNet lemmatizer and compare its output with the stemmer's.
# The extra argument 'v' (verb) explicitly specifies the Part of Speech (POS).
# If the POS is not specified, WordNetLemmatizer assumes every token is a
# noun, and the various inflected forms of the same root word then return
# different results.
lemma = nltk.stem.WordNetLemmatizer()
" ".join(lemma.lemmatize(token, pos='v') for token in tokens)

"I do n't like to study . All the concepts ca n't be learn through study . I have study all life but have n't get any success"

In [45]:
# Not supplying POS
# In the absence of a POS the lemmatization is not done properly:
# 'running' and 'ran' come back unchanged.
" ".join(map(lemma.lemmatize, ['run', 'running', 'ran']))

'run running ran'

In [47]:
# Supplying POS
# With the verb POS all three forms are reduced to the same lemma.
" ".join(lemma.lemmatize(word, pos='v') for word in ('run', 'running', 'ran'))

'run run run'

In [52]:
# StopWord Removal
# Sentences often contain words that do not provide any additional
# information (a, is, the, etc.).
# Before any text analysis, these have to be removed as well.

from nltk.corpus import stopwords
stop = set(stopwords.words('english'))  # a set, so membership tests are cheap

# Lemmatization (sentence kept in its original casing)
sentnc = " ".join(lemma.lemmatize(i, 'v') for i in tokens)

# Removing stop words.
# Each token is lower-cased BEFORE it is lemmatized: WordNet lookups are
# case-sensitive, so a capitalised inflected token (e.g. "Studied") would
# otherwise pass through unlemmatized and merely be lower-cased afterwards.
# Working on the token list directly also avoids the join -> split round trip.
clen_sentnc = [
    word
    for word in (lemma.lemmatize(tok.lower(), 'v') for tok in tokens)
    if word not in stop
]
print(clen_sentnc)

["n't", 'like', 'study', '.', 'concepts', 'ca', "n't", 'learn', 'study', '.', 'study', 'life', "n't", 'get', 'success']


In [96]:
# A custom stop-word list can also be created to remove additional words.
# For instance, say the word 'get' has to be removed from clen_sentnc.
# Build the new list from the 'stop' collection plus the extra word.
stop_new = [*stop, 'get']

# Check that 'get' is the last entry of the new stop list
stop_new[-1]

'get'

In [102]:
# Apply the extended stop list: this drops 'get' from the cleaned tokens
[word for word in clen_sentnc if word not in stop_new]

["n't",
 'like',
 'study',
 '.',
 'concepts',
 'ca',
 "n't",
 'learn',
 'study',
 '.',
 'study',
 'life',
 "n't",
 'success']