In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
# Single-sentence sample fed to sent_tokenize and word_tokenize in the next two cells.
text = "Natural language processing (NLP) is a field of science"

In [3]:
# Sentence tokenization; the recorded output is a one-element list holding the whole sample.
sent_tokenize(text)

['Natural language processing (NLP) is a field of science']

In [4]:
# Word tokenization; in the recorded output the parentheses around NLP come back as separate tokens.
word_tokenize(text)

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'field',
 'of',
 'science']

In [8]:
# Rebind `text` to a two-sentence sample; the bare name on the last line echoes it as the cell output.
text = "Natural language processing (NLP) is a field of computer science. Artificial intelligence and computational linguistics are also related with NLP."
text

'Natural language processing (NLP) is a field of computer science. Artificial intelligence and computational linguistics are also related with NLP.'

In [11]:
# Tokenize into sentences once, print how many were found, then display the
# list itself as the cell output (previously the text was tokenized twice).
sentences = sent_tokenize(text)
print(len(sentences))
sentences

2


['Natural language processing (NLP) is a field of computer science.',
 'Artificial intelligence and computational linguistics are also related with NLP.']

In [12]:
# Tokenize into words once, print the token count, then display the tokens
# as the cell output (previously the text was tokenized twice).
word_tokens = word_tokenize(text)
print(len(word_tokens))
word_tokens

24


['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'field',
 'of',
 'computer',
 'science',
 '.',
 'Artificial',
 'intelligence',
 'and',
 'computational',
 'linguistics',
 'are',
 'also',
 'related',
 'with',
 'NLP',
 '.']

In [17]:
# Sample with a possessive ("NLP's") and a contraction ("can't"), used by the
# following cells to compare how each tokenizer treats apostrophes.
text = "NLP's of the world can't be decoded"

In [18]:
# Default NLTK word tokenizer; the recorded output splits "NLP's" into NLP / 's and "can't" into ca / n't.
word_tokenize(text)

['NLP', "'s", 'of', 'the', 'world', 'ca', "n't", 'be', 'decoded']

In [19]:
# Treebank tokenizer; for this sample the recorded output is identical to
# word_tokenize's (contraction split as ca / n't, possessive as 's).
from nltk.tokenize import TreebankWordTokenizer 
# NOTE: `tokenizer` is rebound by each of the tokenizer cells that follow.
tokenizer = TreebankWordTokenizer() 
tokenizer.tokenize(text)

['NLP', "'s", 'of', 'the', 'world', 'ca', "n't", 'be', 'decoded']

In [20]:
# WordPunct tokenizer; the recorded output shows each apostrophe emitted as
# its own token, so "can't" becomes can / ' / t.
from nltk.tokenize import WordPunctTokenizer 
tokenizer = WordPunctTokenizer() 
tokenizer.tokenize(text)

['NLP', "'", 's', 'of', 'the', 'world', 'can', "'", 't', 'be', 'decoded']

In [21]:
from nltk.tokenize import RegexpTokenizer

# Raw string so that `\w` reaches the regex engine as written instead of being
# an invalid string escape (DeprecationWarning; SyntaxWarning on Python 3.12+).
# The pattern matches runs of word characters and apostrophes, so the recorded
# output keeps "NLP's" and "can't" as single tokens.
tokenizer = RegexpTokenizer(r"[\w']+")
tokenizer.tokenize(text)

["NLP's", 'of', 'the', 'world', "can't", 'be', 'decoded']

In [26]:
# Plain split on single spaces; for this sample the recorded output matches
# the RegexpTokenizer result above.
text.split(' ')

["NLP's", 'of', 'the', 'world', "can't", 'be', 'decoded']

In [1]:
# Use spaCy

In [2]:
# Import spaCy and load the 'en_core_web_sm' English model; the resulting
# `nlp` callable is what later cells apply to raw strings.
import spacy
nlp = spacy.load('en_core_web_sm')



    Only loading the 'en' tokenizer.



In [3]:
# Sample with surrounding double quotes, a contraction and a dotted abbreviation.
# The inner apostrophe is escaped because the literal itself is single-quoted.
mystring = '"We\'re moving to L.A.!"'
print(mystring)

"We're moving to L.A.!"


In [4]:
# Run the spaCy pipeline on the sample and print one token per line. In the
# recorded output "L.A." stays a single token and "We're" is split into We / 're.
doc = nlp(mystring)
for token in doc:
    print(token)

"
We
're
moving
to
L.A.
!
"


# Keras text_to_word_sequence

In [6]:
from keras.preprocessing.text import text_to_word_sequence

In [8]:
# Tokenize the same sample with Keras. The recorded output is lowercased, has
# the quotes and "!" removed, and breaks "L.A." into 'l' and 'a'.
tokens = text_to_word_sequence(mystring)
tokens

["we're", 'moving', 'to', 'l', 'a']

# ![image.png](attachment:image.png)

# STEMMING

    Stemming is the process of reducing inflected words to their root form, mapping a group of related words to the same stem even if the stem itself is not a valid word in the language.

In [21]:
# Import the toolkit and the Porter stemmer. The class is imported by name
# rather than with a wildcard, so it is clear where `PorterStemmer` comes from
# and no unrelated names are pulled into the notebook namespace.
import nltk
from nltk.stem.porter import PorterStemmer

# Shared Porter stemmer instance; a later comparison cell reuses `p_stemmer`.
p_stemmer = PorterStemmer()
words = ['run','runner','running','ran','runs','easily','fairly']
# Print each word next to its stem. Stems need not be real words
# (the recorded output has 'easili' and 'fairli').
for word in words:
    print(word+' --> '+p_stemmer.stem(word))

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fairli


In [24]:
from nltk.stem.snowball import SnowballStemmer

# SnowballStemmer has to be told which language's rules to use.
s_stemmer = SnowballStemmer(language='english')

# Same list as the Porter demo plus a two-word entry. 'case studies' is handed
# to stem() as one string, and the recorded output changes only its ending
# ('case studi').
words = ['run','runner','running','ran','runs','easily','fairly', 'case studies']
for word in words:
    stem = s_stemmer.stem(word)
    print(f"{word} --> {stem}")

run --> run
runner --> runner
running --> run
ran --> ran
runs --> run
easily --> easili
fairly --> fair
case studies --> case studi


In [25]:
# Single word on which the two stemmers disagree (compared in the next cell).
words = ['consolingly']

In [26]:
# Stem the current `words` list with both stemmers, printing a header line
# before each stemmer's results. The Snowball English stemmer is the one
# labelled 'Porter2' here.
for label, stemmer in (('Porter Stemmer:', p_stemmer), ('Porter2 Stemmer:', s_stemmer)):
    print(label)
    for word in words:
        print(word+' --> '+stemmer.stem(word))

Porter Stemmer:
consolingly --> consolingli
Porter2 Stemmer:
consolingly --> consol


# LEMMATIZATION

In [34]:
import nltk
# Download the 'wordnet' package (presumably needed by the WordNet lemmatizer
# used below — confirm). The call's return value, True in the recorded output,
# is displayed as the cell result.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smattoo5\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [35]:
import nltk
from nltk.stem import WordNetLemmatizer

In [36]:
# Create the WordNet lemmatizer; the later loop over `words` reuses this instance.
lemmatizer = WordNetLemmatizer()
# Lemmatize a single word (the recorded output maps "bats" to "bat").
print(lemmatizer.lemmatize("bats"))

bat


In [38]:
# Run the stemming word list through the WordNet lemmatizer, passing only the
# word itself. In the recorded output only 'runs' changes; 'running', 'ran'
# and the adverbs come back unchanged.
# NOTE(review): 'consolingly ' ends with a space (visible in the recorded
# output); this looks unintentional — confirm before relying on that row.
words = ['run','runner','running','ran','runs','easily','fairly', 'case studies', 'consolingly ']
for word in words:
    lemma = lemmatizer.lemmatize(word)
    print(f"{word} --> {lemma}")

run --> run
runner --> runner
running --> running
ran --> ran
runs --> run
easily --> easily
fairly --> fairly
case studies --> case studies
consolingly  --> consolingly 


In [39]:
import spacy
# Load the English pipeline by its full package name, matching the earlier
# spacy.load('en_core_web_sm') call. The bare 'en' shortcut only resolves where
# a shortcut link exists and is not supported by spaCy v3.
# The parser and NER components are disabled because only lemmas are read here.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
sentence = "The striped bats are hanging on their feet for best"
# Run the sentence through the loaded pipeline object `nlp`
doc = nlp(sentence)
# Join each token's lemma with single spaces; this string is the cell output.
" ".join(token.lemma_ for token in doc)

'the striped bat be hang on -PRON- foot for good'

In [40]:
from textblob import TextBlob, Word
# Wrap a single word in a TextBlob `Word` and lemmatize it; the result is the
# cell output ('stripe' in the recorded run).
word = 'stripes'
w = Word(word)
w.lemmatize()

'stripe'

In [42]:
# Repeat the word-list lemmatization with TextBlob's Word wrapper. The recorded
# output is the same as the NLTK WordNetLemmatizer run on this list.
# NOTE(review): 'consolingly ' ends with a space (visible in the recorded
# output); this looks unintentional — confirm.
words = ['run','runner','running','ran','runs','easily','fairly', 'case studies', 'consolingly ']
for word in words:
    w = Word(word)
    lemma = w.lemmatize()
    print(f"{word} --> {lemma}")

run --> run
runner --> runner
running --> running
ran --> ran
runs --> run
easily --> easily
fairly --> fairly
case studies --> case studies
consolingly  --> consolingly 
