In [1]:
corpus = 'Consider a scenario where we’re working with a collection of social media posts to detect news events. Social media text is very different from the language we’d see in, say, newspapers. A word can be spelled in different ways, including in shortened forms, a phone number can be written in different formats (e.g., with and without hyphens), names are sometimes in lowercase, and so on. When we’re working on developing NLP tools to work with such data, it’s useful to reach a canonical representation of text that captures all these variations into one representation. This is known as text normalization. Some common steps for text normalization are to convert all text to lowercase or uppercase, convert digits to text (e.g., 9 to nine), expand abbreviations, and so on. A simple way to incorporate text normalization can be found in Spacy’s source code [35], which is a dictionary showing different spellings of a preset collection of words mapped to a single spelling. We’ll see more examples of text normalization in Chapter 8.'


In [2]:
corpus_lower = corpus.lower()
print(corpus_lower)

import re
corpus_lower = re.sub(r'\d+','', corpus_lower)
import string
corpus_lower = corpus_lower.translate(str.maketrans('', '', string.punctuation))


consider a scenario where we’re working with a collection of social media posts to detect news events. social media text is very different from the language we’d see in, say, newspapers. a word can be spelled in different ways, including in shortened forms, a phone number can be written in different formats (e.g., with and without hyphens), names are sometimes in lowercase, and so on. when we’re working on developing nlp tools to work with such data, it’s useful to reach a canonical representation of text that captures all these variations into one representation. this is known as text normalization. some common steps for text normalization are to convert all text to lowercase or uppercase, convert digits to text (e.g., 9 to nine), expand abbreviations, and so on. a simple way to incorporate text normalization can be found in spacy’s source code [35], which is a dictionary showing different spellings of a preset collection of words mapped to a single spelling. we’ll see more examples o

In [3]:
from nltk import word_tokenize
import re
from nltk.corpus import stopwords
stop_words_nltk = set(stopwords.words('english'))

tokens = set(word_tokenize(corpus_lower))
print(tokens)

{'variations', 'as', 'posts', 'written', 'see', 'reach', 'digits', 'words', 'it', 'expand', 'media', 'formats', 'convert', 'all', 'with', 'events', 'newspapers', 'word', 'abbreviations', 'sometimes', 'language', 'be', 'normalization', 'that', 'd', 'useful', 'more', 'and', 'is', 'detect', 'these', 'found', 'steps', 'shortened', 'some', 'into', 'data', 'lowercase', 'way', 'collection', 'mapped', 'phone', 'names', 'working', 'simple', 'scenario', 'uppercase', 'spacy', 'which', 'captures', 'different', 'this', 'spellings', 'we', 'showing', 'say', 's', 'common', 'on', 'without', 'canonical', 'tools', 're', 'very', 'code', 'forms', 'single', 'spelled', 'when', 'such', 'dictionary', 'or', '’', 'can', 'eg', 'social', 'text', 'a', 'spelling', 'hyphens', 'examples', 'to', 'representation', 'so', 'number', 'from', 'consider', 'for', 'ways', 'incorporate', 'the', 'chapter', 'news', 'll', 'source', 'preset', 'work', 'known', 'including', 'nlp', 'where', 'one', 'nine', 'of', 'in', 'developing', 'are

In [4]:
tokens_no_stop = tokens - stop_words_nltk
print(tokens_no_stop)

{'showing', 'useful', 'variations', 'posts', 'written', 'say', 'spelling', 'see', 'hyphens', 'detect', 'reach', 'digits', 'words', 'found', 'common', 'examples', 'steps', 'shortened', 'representation', 'number', 'without', 'data', 'expand', 'canonical', 'media', 'lowercase', 'tools', 'formats', 'consider', 'way', 'collection', 'convert', 'ways', 'mapped', 'incorporate', 'code', 'forms', 'chapter', 'news', 'single', 'events', 'newspapers', 'phone', 'spelled', 'names', 'working', 'word', 'abbreviations', 'simple', 'scenario', 'source', 'preset', 'work', 'uppercase', 'dictionary', 'known', 'including', 'sometimes', 'nlp', '’', 'one', 'spacy', 'nine', 'captures', 'language', 'different', 'eg', 'social', 'text', 'normalization', 'developing', 'spellings'}


In [11]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')
stemmer= PorterStemmer()

print("Before Stemming:")
print(tokens_no_stop)

print("After Stemming:")
for word in tokens_no_stop:
    print(stemmer.stem(word),end=" ")

[nltk_data] Downloading package wordnet to /Users/piotr/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Before Stemming:
{'convert', 'examples', 'written', 'one', 'steps', 'scenario', 'nine', 'uppercase', 'reach', 'dictionary', 'representation', 'language', 'simple', 'spacy', 'detect', 'phone', 'names', 'expand', 'media', 'shortened', 'without', 'captures', 'work', 'collection', 'news', 'working', 'spelled', 'including', 'common', 'ways', 'chapter', 'mapped', 'different', 'developing', 'useful', 'normalization', 'single', 'posts', 'tools', 'consider', 'forms', 'code', 'found', 'eg', 'data', 'digits', 'hyphens', 'text', 'see', 'known', 'events', 'words', 'number', 'sometimes', 'showing', 'newspapers', 'formats', 'preset', 'say', 'spelling', 'spellings', 'canonical', '’', 'source', 'nlp', 'way', 'social', 'variations', 'word', 'incorporate', 'abbreviations', 'lowercase'}
After Stemming:
convert exampl written one step scenario nine uppercas reach dictionari represent la

In [5]:
from nltk.stem import WordNetLemmatizer
#from nltk.tokenize import word_tokenize
# nltk.download('wordnet')
lemmatizer=WordNetLemmatizer()

for word in tokens_no_stop:
    print(lemmatizer.lemmatize(word),end=" ")



['showing', 'useful', 'variation', 'post', 'written', 'say', 'spelling', 'see', 'hyphen', 'detect', 'reach', 'digit', 'word', 'found', 'common', 'example', 'step', 'shortened', 'representation', 'number', 'without', 'data', 'expand', 'canonical', 'medium', 'lowercase', 'tool', 'format', 'consider', 'way', 'collection', 'convert', 'way', 'mapped', 'incorporate', 'code', 'form', 'chapter', 'news', 'single', 'event', 'newspaper', 'phone', 'spelled', 'name', 'working', 'word', 'abbreviation', 'simple', 'scenario', 'source', 'preset', 'work', 'uppercase', 'dictionary', 'known', 'including', 'sometimes', 'nlp', '’', 'one', 'spacy', 'nine', 'capture', 'language', 'different', 'eg', 'social', 'text', 'normalization', 'developing', 'spelling']


[('showing', 'VBG'),
 ('useful', 'JJ'),
 ('variation', 'NN'),
 ('post', 'NN'),
 ('written', 'VBN'),
 ('say', 'VBP'),
 ('spelling', 'VBG'),
 ('see', 'NN'),
 ('hyphen', 'NN'),
 ('detect', 'VB'),
 ('reach', 'NN'),
 ('digit', 'NN'),
 ('word', 'NN'),
 ('found', 'VBD'),
 ('common', 'JJ'),
 ('example', 'NN'),
 ('step', 'NN'),
 ('shortened', 'VBD'),
 ('representation', 'NN'),
 ('number', 'NN'),
 ('without', 'IN'),
 ('data', 'NNS'),
 ('expand', 'RB'),
 ('canonical', 'JJ'),
 ('medium', 'NN'),
 ('lowercase', 'NN'),
 ('tool', 'NN'),
 ('format', 'NN'),
 ('consider', 'VB'),
 ('way', 'NN'),
 ('collection', 'NN'),
 ('convert', 'VBP'),
 ('way', 'NN'),
 ('mapped', 'JJ'),
 ('incorporate', 'NN'),
 ('code', 'NN'),
 ('form', 'NN'),
 ('chapter', 'NN'),
 ('news', 'NN'),
 ('single', 'JJ'),
 ('event', 'NN'),
 ('newspaper', 'NN'),
 ('phone', 'NN'),
 ('spelled', 'VBD'),
 ('name', 'NN'),
 ('working', 'VBG'),
 ('word', 'NN'),
 ('abbreviation', 'NN'),
 ('simple', 'NN'),
 ('scenario', 'NN'),
 ('source', 'NN'),
 ('pre