# Preprocessing using NLTK and Spacy

## NLTK

In [1]:
import nltk
import re
import string

In [4]:
def lower_text(text):
    return text.lower()

input_str = "Weather is too Cloudy.Possiblity of Rain is High,Today!!"
lower_text(input_str)

'weather is too cloudy.possiblity of rain is high,today!!'

In [7]:
def remove_num(text):
    result = re.sub(r'\d+', '', text)
    return result 
input_s = "You bought 6 candies from shop, and 4 candies are in home."
remove_num(input_s)

'You bought  candies from shop, and  candies are in home.'

### Number to word

In [11]:
# import the library 
import inflect 
q = inflect.engine() 
  
# convert number into text 
def convert_num(text): 
    # split strings into list of texts 
    temp_string = text.split() 
    # initialise empty list 
    new_str = [] 
  
    for word in temp_string: 
        # if text is a digit, convert the digit 
        # to numbers and append into the new_str list 
        if word.isdigit(): 
            temp = q.number_to_words(word) 
            new_str.append(temp) 
  
        # append the texts as it is 
        else: 
            new_str.append(word) 
  
    # join the texts of new_str to form a string 
    temp_str = ' '.join(new_str) 
    return temp_str 
  
input_str = 'You bought 6 candies from shop, and 4 candies are in home.'
convert_num(input_str)

'You bought six candies from shop, and four candies are in home.'

In [2]:
def rem_punct(text): 
    translator = str.maketrans('', '', string.punctuation) 
    return text.translate(translator) 
  
input_str = "Hey, Are you excited??, After a week, we will be in Shimla!!!"
rem_punct(input_str)

'Hey Are you excited After a week we will be in Shimla'

In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#nltk.download('stopwords')
#nltk.download('punkt')
def rem_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
    
ex_text = "Data is the new oil. A.I is the last invention"
rem_stopwords(ex_text)

['Data', 'new', 'oil', '.', 'A.I', 'last', 'invention']

In [10]:
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
stem1 = PorterStemmer()
def s_word(text):
    word_tokens = word_tokenize(text)
    stems = [stem1.stem(word) for word in word_tokens]
    return stems

text = 'Data is the new revolution in the World, in a day one individual would generate terabytes of data.'
s_word(text)

['data',
 'is',
 'the',
 'new',
 'revolut',
 'in',
 'the',
 'world',
 ',',
 'in',
 'a',
 'day',
 'one',
 'individu',
 'would',
 'gener',
 'terabyt',
 'of',
 'data',
 '.']

In [13]:
from nltk.stem import wordnet
from nltk.tokenize import word_tokenize
#nltk.download('wordnet')
lemma = wordnet.WordNetLemmatizer()
def lemm_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemma.lemmatize(word, pos = 'v') for word in word_tokens]
    return lemmas

text = 'Data is the new revolution in the World, in a day one individual would generate terabytes of data.'
lemm_word(text)

['Data',
 'be',
 'the',
 'new',
 'revolution',
 'in',
 'the',
 'World',
 ',',
 'in',
 'a',
 'day',
 'one',
 'individual',
 'would',
 'generate',
 'terabytes',
 'of',
 'data',
 '.']

In [14]:
from nltk.tokenize import word_tokenize 
from nltk import pos_tag 
#nltk.download('averaged_perceptron_tagger')
  
# convert text into word_tokens with their tags 
def pos_tagg(text): 
    word_tokens = word_tokenize(text) 
    return pos_tag(word_tokens) 
  
pos_tagg('Are you afraid of something?') 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Nabeel_Ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


[('Are', 'NNP'),
 ('you', 'PRP'),
 ('afraid', 'IN'),
 ('of', 'IN'),
 ('something', 'NN'),
 ('?', '.')]

In [15]:
from nltk.tokenize import word_tokenize  
from nltk import pos_tag 
  
# here we define chunking function with text and regular 
# expressions representing grammar as parameter 
def chunking(text, grammar): 
    word_tokens = word_tokenize(text) 
  
    # label words with pos 
    word_pos = pos_tag(word_tokens) 
  
    # create chunk parser using grammar 
    chunkParser = nltk.RegexpParser(grammar) 
  
    # test it on the list of word tokens with tagged pos 
    tree = chunkParser.parse(word_pos) 
      
    for subtree in tree.subtrees(): 
        print(subtree) 
    #tree.draw() 
      
sentence = 'the little red parrot is flying in the sky'
grammar = "NP: {<DT>?<JJ>*<NN>}"
chunking(sentence, grammar) 

(S
  (NP the/DT little/JJ red/JJ parrot/NN)
  is/VBZ
  flying/VBG
  in/IN
  (NP the/DT sky/NN))
(NP the/DT little/JJ red/JJ parrot/NN)
(NP the/DT sky/NN)


In [16]:
from nltk.tokenize import word_tokenize 
from nltk import pos_tag, ne_chunk 
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
  
def ner(text): 
    # tokenize the text 
    word_tokens = word_tokenize(text) 
  
    # pos tagging of words 
    word_pos = pos_tag(word_tokens) 
  
    # tree of word entities 
    print(ne_chunk(word_pos)) 
  
text = 'Brain Lara scored the highest 400 runs in a test match which played in between WI and England.'
ner(text) 

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Nabeel_Ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Nabeel_Ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


(S
  (PERSON Brain/NNP)
  (PERSON Lara/NNP)
  scored/VBD
  the/DT
  highest/JJS
  400/CD
  runs/NNS
  in/IN
  a/DT
  test/NN
  match/NN
  which/WDT
  played/VBD
  in/IN
  between/IN
  (ORGANIZATION WI/NNP)
  and/CC
  (GPE England/NNP)
  ./.)


### find out all the domain name from given list

In [27]:
emails = ["xyz@gmail.com",
"abc@yahoo.com",
"xyz@hotmail.com",
"abc@ineuron.ai",
"xyz@outlook.com"]
def domain_extract(text):
    res = re.findall(r'@[\w.]+', text)
    print(res)

for email in emails:
    domain_extract(email)

['@gmail.com']
['@yahoo.com']
['@hotmail.com']
['@ineuron.ai']
['@outlook.com']


### Create one python program in which you have to lowercase the sentence first and than delete digits from the following sentence.

In [42]:
x = "In India, 184 people got affected with Corona virus and 4 are died."
def solution(text):
    lower = text.lower().split()
    output = [w for w in lower if w.isalpha() ]
    print(output)
    
solution(x)

['in', 'people', 'got', 'affected', 'with', 'corona', 'virus', 'and', 'are']


### Do stemming, lemmatization and tokenization from the following sentence.

In [46]:
sent = 'I hope that, when I have built up my savings, I will be able to travel to Hawai.'
from nltk.tokenize import word_tokenize
from nltk.stem import wordnet #for lemmatiztion
from nltk.stem.porter import PorterStemmer #for stemming

##stemming
stem2 = stem1 = PorterStemmer()

##lemmatization
#nltk.download('wordnet')
lemma = wordnet.WordNetLemmatizer()

def lemma_stemm(text):
    word_tokens = word_tokenize(text)
    stems = [stem1.stem(word) for word in word_tokens]
    lemmas = [lemma.lemmatize(word, pos = 'v') for word in word_tokens]
    return stems, lemmas
lemma_stemm(sent)
    

(['i',
  'hope',
  'that',
  ',',
  'when',
  'i',
  'have',
  'built',
  'up',
  'my',
  'save',
  ',',
  'i',
  'will',
  'be',
  'abl',
  'to',
  'travel',
  'to',
  'hawai',
  '.'],
 ['I',
  'hope',
  'that',
  ',',
  'when',
  'I',
  'have',
  'build',
  'up',
  'my',
  'save',
  ',',
  'I',
  'will',
  'be',
  'able',
  'to',
  'travel',
  'to',
  'Hawai',
  '.'])

### Create one python program from the following sentence.

"I love NLP, not you"

output : ['I', 'l', 'N', 'n', 'y']

In [51]:
a = "I love NLP, not you"
from nltk.tokenize import word_tokenize

def answer(text):
    tokens = word_tokenize(text)
    out = [w[0] for w in tokens if w.isalpha()]
    return out

answer(a)

['I', 'l', 'N', 'n', 'y']

## Spacy

In [52]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.pos_, token.dep_)



Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup VERB dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


In [53]:
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [54]:
doc = nlp("Coronavirus: Delhi resident tests positive for coronavirus, total 31 people infected in India")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Coronavirus Coronavirus PROPN NNP nsubj Xxxxx True False
: : PUNCT : punct : False False
Delhi Delhi PROPN NNP compound Xxxxx True False
resident resident NOUN NN compound xxxx True False
tests test NOUN NNS appos xxxx True False
positive positive ADJ JJ amod xxxx True False
for for ADP IN prep xxx True True
coronavirus coronavirus NOUN NN pobj xxxx True False
, , PUNCT , punct , False False
total total ADJ JJ ROOT xxxx True False
31 31 NUM CD nummod dd False False
people people NOUN NNS dobj xxxx True False
infected infect VERB VBN acl xxxx True False
in in ADP IN prep xx True True
India India PROPN NNP pobj Xxxxx True False


In [55]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Google, Apple crack down on fake coronavirus apps")
displacy.serve(doc, style="dep")




Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [56]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Coronavirus: Delhi resident tests positive for coronavirus, total 31 people infected in India")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Delhi 13 18 GPE
31 66 68 CARDINAL
India 88 93 GPE


In [57]:
import spacy
from spacy import displacy

text = "Coronavirus: Delhi resident tests positive for coronavirus, total 31 people infected in India"

nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
displacy.serve(doc, style="ent")
# https://spacy.io/api/annotation#named-entities


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [59]:
# !python -m spacy download en_core_web_md
import spacy.cli
spacy.cli.download("en_core_web_md")
import en_core_web_md
nlp = en_core_web_md.load()

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [60]:
import spacy

nlp = spacy.load("en_core_web_md")
tokens = nlp("lion bear apple banana fadsfdshds")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
# Vector norm: The L2 norm of the token’s vector (the square root of the sum of the values squared)
# has vector: Does the token have a vector representation?
# OOV: Out-of-vocabulary

lion True 6.6788154 False
bear True 7.2436275 False
apple True 6.895898 False
banana True 6.895898 False
fadsfdshds False 0.0 True


In [61]:
import spacy

nlp = spacy.load("en_core_web_md")  # make sure to use larger model!
tokens = nlp("lion bear cow apple mango spinach")

for token11 in tokens:
    for token13 in tokens:
        print(token11.text, token13.text, token11.similarity(token13))

lion lion 1.0
lion bear 0.42525938153266907
lion cow 0.5135680437088013
lion apple 0.25588127970695496
lion mango 0.3112117648124695
lion spinach 0.2844249904155731
bear lion 0.42525938153266907
bear bear 1.0
bear cow 0.5596494078636169
bear apple 0.2901766002178192
bear mango 0.18352437019348145
bear spinach 0.11630471050739288
cow lion 0.5135680437088013
cow bear 0.5596494078636169
cow cow 1.0
cow apple 0.3741353750228882
cow mango 0.359701007604599
cow spinach 0.3181520104408264
apple lion 0.25588127970695496
apple bear 0.2901766002178192
apple cow 0.3741353750228882
apple apple 1.0
apple mango 0.5986488461494446
apple spinach 0.6040376424789429
mango lion 0.3112117648124695
mango bear 0.18352437019348145
mango cow 0.359701007604599
mango apple 0.5986488461494446
mango mango 1.0
mango spinach 0.7843544483184814
spinach lion 0.2844249904155731
spinach bear 0.11630471050739288
spinach cow 0.3181520104408264
spinach apple 0.6040376424789429
spinach mango 0.7843544483184814
spinach spin

# Keras

In [5]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [6]:
text = 'The quick brown fox jumped over the lazy dog.'
# tokenize the document
result = text_to_word_sequence(text)
print(result)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


In [3]:
import tensorflow as tf
print(tensorflow.__version__)

2.11.0


In [8]:
from keras.preprocessing.text import Tokenizer
# define 5 documents
docs = ['Well done!','Good work','Great effort','nice work','Excellent!']
# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)

<keras.preprocessing.text.Tokenizer object at 0x0000021A30443D30>


In [9]:
# summarize what was learned
print(t.word_counts)
print(t.document_count)
print(t.word_index)
print(t.word_docs)

OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])
5
{'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}
defaultdict(<class 'int'>, {'done': 1, 'well': 1, 'good': 1, 'work': 2, 'great': 1, 'effort': 1, 'nice': 1, 'excellent': 1})


In [10]:
from keras.preprocessing.text import Tokenizer
# define 5 documents
docs = ['Well done!','Good work','Great effort','nice work','Excellent!']
# create the tokenizer
t = Tokenizer()
# fit the tokenizer on the documents
t.fit_on_texts(docs)
# summarize what was learned
print(t.word_counts)
print(t.document_count)
print(t.word_index)
print(t.word_docs)
# integer encode documents
encoded_docs = t.texts_to_matrix(docs, mode='count')
print(encoded_docs)

OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])
5
{'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}
defaultdict(<class 'int'>, {'done': 1, 'well': 1, 'good': 1, 'work': 2, 'great': 1, 'effort': 1, 'nice': 1, 'excellent': 1})
[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
