## Text Normalization : SpaCy vs NLTK 

### SpaCy

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

def spacy_fun(text):
    spacy_form = nlp(text)
    
    print('\nSpaCy Text Normalization:\n')
    
    #Tokenization and Lemmatization in spacy
    lem_word = []
    for i in spacy_form:
        lem_word.append(i.lemma_)
        
    print('Tokenization and Lemmatization in Spacy:\n')
    print(lem_word)
    
    #Removing stop words in the text
    stop_word = []
    for word in lem_word:
        vocab_word = nlp.vocab[word]
        if vocab_word.is_stop == False:
            stop_word.append(word)
    print('\nStop word regularization: \n')
    print(stop_word)
    
    #Removing punctuations in the text
    punctuation = '#?!-,.;:–/—'
    for word in stop_word:
        if word in punctuation:
            stop_word.remove(word)
    print("\nAfter removing punctuations\n")
    print(stop_word)

### NLTK libraries

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import *

ps = PorterStemmer()
lemma = WordNetLemmatizer()

def nlkt_fun(text):
    
    print('\nNLTK Text Normalization:\n')
    
    #Tokenizing the raw text
    word_token = word_tokenize(text)
    print('Tokenzing the raw text:\n')
    print(word_token)
    
    #Stemming the data
    word_stem = []
    for word in word_token:
        word_stem.append(ps.stem(word))
    print('\nStemming the data:\n')
    print(word_stem)
    
    #Lemmatizing the data
    word_lemma =[]
    for word2 in word_stem:
        word_lemma.append(lemma.lemmatize(word2))
    print('\nLemmatizing the data:\n')
    print(word_lemma)
    
    #Removing stop words from the list
    word_stop = []
    nltk_stop_words = set(stopwords.words('english'))
    for word3 in word_lemma:
        if word3 not in nltk_stop_words:
            word_stop.append(word3)
            
    print('\nRemoving stop words from the list:\n')
    print(word_stop)
    
    #Removing the punctuations from the data
    punctuation = '#?!-,.;:–/—'
    for word4 in word_stop:
        if word4 in punctuation:
            word_stop.remove(word4)
    print('\nRemoving the punctuations from the data:\n')
    print(word_stop)

### Testing

In [3]:
tweet1 = "#Millions of small businesses – especially those owned by women and people of color – are struggling to keep their doors open. Today, our Administration announced key changes to the Paycheck Protection Program that will help get relief to more small businesses across the country."
tweet2 = "Texas — If you’re without heat, head to http://tdem.texas.gov/warm to find a warming shelter near you or call 211 for additional assistance."
tweet3 = 'Our 7-day daily average of 1.7 million doses administered compares to an average of 892k the week before President Biden took office. That’s almost double in just four weeks. Kamala harris'

In [4]:
%%time
spacy_fun(tweet1)


SpaCy Text Normalization:

Tokenization and Lemmatization in Spacy:

['#', 'million', 'of', 'small', 'business', '–', 'especially', 'those', 'own', 'by', 'woman', 'and', 'people', 'of', 'color', '–', 'be', 'struggle', 'to', 'keep', 'their', 'door', 'open', '.', 'today', ',', 'our', 'Administration', 'announce', 'key', 'change', 'to', 'the', 'Paycheck', 'Protection', 'Program', 'that', 'will', 'help', 'get', 'relief', 'to', 'more', 'small', 'business', 'across', 'the', 'country', '.']

Stop word regularization: 

['#', 'million', 'small', 'business', '–', 'especially', 'woman', 'people', 'color', '–', 'struggle', 'door', 'open', '.', 'today', ',', 'Administration', 'announce', 'key', 'change', 'Paycheck', 'Protection', 'Program', 'help', 'relief', 'small', 'business', 'country', '.']

After removing punctuations

['million', 'small', 'business', 'especially', 'woman', 'people', 'color', 'struggle', 'door', 'open', 'today', 'Administration', 'announce', 'key', 'change', 'Paycheck', 'Pro

In [5]:
%%time
nlkt_fun(tweet1)


NLTK Text Normalization:

Tokenzing the raw text:

['#', 'Millions', 'of', 'small', 'businesses', '–', 'especially', 'those', 'owned', 'by', 'women', 'and', 'people', 'of', 'color', '–', 'are', 'struggling', 'to', 'keep', 'their', 'doors', 'open', '.', 'Today', ',', 'our', 'Administration', 'announced', 'key', 'changes', 'to', 'the', 'Paycheck', 'Protection', 'Program', 'that', 'will', 'help', 'get', 'relief', 'to', 'more', 'small', 'businesses', 'across', 'the', 'country', '.']

Stemming the data:

['#', 'million', 'of', 'small', 'busi', '–', 'especi', 'those', 'own', 'by', 'women', 'and', 'peopl', 'of', 'color', '–', 'are', 'struggl', 'to', 'keep', 'their', 'door', 'open', '.', 'today', ',', 'our', 'administr', 'announc', 'key', 'chang', 'to', 'the', 'paycheck', 'protect', 'program', 'that', 'will', 'help', 'get', 'relief', 'to', 'more', 'small', 'busi', 'across', 'the', 'countri', '.']

Lemmatizing the data:

['#', 'million', 'of', 'small', 'busi', '–', 'especi', 'those', 'own', 'b

In [6]:
%%time
spacy_fun(tweet2)


SpaCy Text Normalization:

Tokenization and Lemmatization in Spacy:

['Texas', '—', 'if', 'you', '’re', 'without', 'heat', ',', 'head', 'to', 'http://tdem.texas.gov/warm', 'to', 'find', 'a', 'warming', 'shelter', 'near', 'you', 'or', 'call', '211', 'for', 'additional', 'assistance', '.']

Stop word regularization: 

['Texas', '—', 'heat', ',', 'head', 'http://tdem.texas.gov/warm', 'find', 'warming', 'shelter', 'near', '211', 'additional', 'assistance', '.']

After removing punctuations

['Texas', 'heat', 'head', 'http://tdem.texas.gov/warm', 'find', 'warming', 'shelter', 'near', '211', 'additional', 'assistance']
Wall time: 19.9 ms


In [7]:
%%time
nlkt_fun(tweet2)


NLTK Text Normalization:

Tokenzing the raw text:

['Texas', '—', 'If', 'you', '’', 're', 'without', 'heat', ',', 'head', 'to', 'http', ':', '//tdem.texas.gov/warm', 'to', 'find', 'a', 'warming', 'shelter', 'near', 'you', 'or', 'call', '211', 'for', 'additional', 'assistance', '.']

Stemming the data:

['texa', '—', 'If', 'you', '’', 're', 'without', 'heat', ',', 'head', 'to', 'http', ':', '//tdem.texas.gov/warm', 'to', 'find', 'a', 'warm', 'shelter', 'near', 'you', 'or', 'call', '211', 'for', 'addit', 'assist', '.']

Lemmatizing the data:

['texa', '—', 'If', 'you', '’', 're', 'without', 'heat', ',', 'head', 'to', 'http', ':', '//tdem.texas.gov/warm', 'to', 'find', 'a', 'warm', 'shelter', 'near', 'you', 'or', 'call', '211', 'for', 'addit', 'assist', '.']

Removing stop words from the list:

['texa', '—', 'If', '’', 'without', 'heat', ',', 'head', 'http', ':', '//tdem.texas.gov/warm', 'find', 'warm', 'shelter', 'near', 'call', '211', 'addit', 'assist', '.']

Removing the punctuations 

In [8]:
%%time
spacy_fun(tweet3)


SpaCy Text Normalization:

Tokenization and Lemmatization in Spacy:

['our', '7', '-', 'day', 'daily', 'average', 'of', '1.7', 'million', 'dose', 'administer', 'compare', 'to', 'an', 'average', 'of', '892k', 'the', 'week', 'before', 'President', 'Biden', 'take', 'office', '.', 'that', '’', 'almost', 'double', 'in', 'just', 'four', 'week', '.', 'Kamala', 'harris']

Stop word regularization: 

['7', '-', 'day', 'daily', 'average', '1.7', 'million', 'dose', 'administer', 'compare', 'average', '892k', 'week', 'President', 'Biden', 'office', '.', '’', 'double', 'week', '.', 'Kamala', 'harris']

After removing punctuations

['7', 'day', 'daily', 'average', '1.7', 'million', 'dose', 'administer', 'compare', 'average', '892k', 'week', 'President', 'Biden', 'office', '’', 'double', 'week', 'Kamala', 'harris']
Wall time: 24.9 ms


In [9]:
%%time
nlkt_fun(tweet3)


NLTK Text Normalization:

Tokenzing the raw text:

['Our', '7-day', 'daily', 'average', 'of', '1.7', 'million', 'doses', 'administered', 'compares', 'to', 'an', 'average', 'of', '892k', 'the', 'week', 'before', 'President', 'Biden', 'took', 'office', '.', 'That', '’', 's', 'almost', 'double', 'in', 'just', 'four', 'weeks', '.', 'Kamala', 'harris']

Stemming the data:

['our', '7-day', 'daili', 'averag', 'of', '1.7', 'million', 'dose', 'administ', 'compar', 'to', 'an', 'averag', 'of', '892k', 'the', 'week', 'befor', 'presid', 'biden', 'took', 'offic', '.', 'that', '’', 's', 'almost', 'doubl', 'in', 'just', 'four', 'week', '.', 'kamala', 'harri']

Lemmatizing the data:

['our', '7-day', 'daili', 'averag', 'of', '1.7', 'million', 'dose', 'administ', 'compar', 'to', 'an', 'averag', 'of', '892k', 'the', 'week', 'befor', 'presid', 'biden', 'took', 'offic', '.', 'that', '’', 's', 'almost', 'doubl', 'in', 'just', 'four', 'week', '.', 'kamala', 'harri']

Removing stop words from the list:

['7

In [10]:
%%time
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_sm')

Example_Sentence = "Today, America is officially back in the Paris Climate Agreement. Let’s get to work.."

# Create a Doc object
doc = nlp(Example_Sentence)

# Print each token separately
for token in doc:
    print(token.text, token.pos_, token.dep_, token.lemma_)

Today NOUN npadvmod today
, PUNCT punct ,
America PROPN nsubj America
is AUX ROOT be
officially ADV advmod officially
back ADV advmod back
in ADP prep in
the DET det the
Paris PROPN compound Paris
Climate PROPN compound Climate
Agreement PROPN pobj Agreement
. PUNCT punct .
Let VERB ROOT let
’s PRON nsubj ’s
get VERB ccomp get
to ADP prep to
work NOUN pobj work
.. PUNCT punct ..
Wall time: 748 ms


In [11]:
print(nlp.Defaults.stop_words)
print(len(nlp.Defaults.stop_words))

{'about', 'together', 'part', 'please', 'often', 'there', '’d', 'him', 'may', 'that', 'using', 'though', 'thereby', 'thence', 'bottom', 'for', 'back', 'hereafter', 'at', 'is', 'take', 'than', 'yours', "'m", 'much', 'as', 'not', 'whereas', 'will', 'therefore', 'hereupon', 'serious', 'me', 'former', 'hereby', 'meanwhile', 'beforehand', 'its', 'nobody', 'they', 'move', 'an', 'been', 'myself', 'herself', 'either', 'after', 'anything', 'name', 'whither', 'i', 'well', 'too', 'cannot', 'since', 'else', '’s', 'twelve', 'while', 'third', 'least', 'also', 'moreover', 'those', 'whereupon', 'nowhere', 'could', 'nothing', 'almost', 'across', 'although', 'sometime', 'per', '‘re', 'nine', 'regarding', 'ourselves', 'thus', 'many', 'less', 'first', 'ever', 'of', 'behind', 'in', 'whereby', 'do', 'on', 'most', 'before', 'wherein', 'anywhere', 'wherever', 'this', 'a', 'once', "'s", 'amount', 'seem', '’re', 'can', 'made', 'these', 'hers', '‘ll', 'any', 'being', 'doing', 'again', 'anyone', 'own', 'perhaps',