In [60]:
text1 = "Humans, in their youth are very smart."
nltk.word_tokenize(text1)

['Humans', ',', 'in', 'their', 'youth', 'are', 'very', 'smart', '.']

# Named Entity Recognition using NLTK and SpaCy

## Using NLTK
1. Word Tokenization
2. Sentence Tokenization
3. Stopwords
4. Part of Speech (POS) Tagging
5. Chunking with NLTK
6. Chinking with NLTK
7. Lemmatization
8. Named Entity Recognition - NER

## Using SpaCy
1. CV review

## Tokenize
### Word Tokenize - returns a set of words in a sentence
### Sentence Tokenize - returns chunk of sentence in a document

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

print(sent_tokenize(EXAMPLE_TEXT))
	  


['Hello Mr. Smith, how are you doing today?', 'The weather is great, and Python is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard."]


### Stop words

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example_sent)

filtered_sentence = [w for w in word_tokens if not w in stop_words]

filtered_sentence = []

for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)

['This', 'is', 'a', 'sample', 'sentence', ',', 'showing', 'off', 'the', 'stop', 'words', 'filtration', '.']
['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


###  Stemming 
#### refers to the process of stripping suffixes from words in attempt to normalize them and reduce them to their non-changing portion. 

In [3]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

In [4]:
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]

In [5]:
for w in example_words:
    print(ps.stem(w))

python
python
python
python
pythonli


In [6]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

In [7]:
words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
all
python
have
python
poorli
at
least
onc
.


### POS Tagger: Labels words with their corresponding parts of speech

## NLTK POS tag list:

CC	coordinating conjunction

CD	cardinal digit

DT	determiner

EX	existential there (like: "there is" ... think of it like "there exists")

FW	foreign word

IN	preposition/subordinating conjunction

JJ	adjective	'big'

JJR	adjective, comparative	'bigger'

JJS	adjective, superlative	'biggest'

LS	list marker	1)

MD	modal	could, will

NN	noun, singular 'desk'

NNS	noun plural	'desks'

NNP	proper noun, singular	'Harrison'

NNPS	proper noun, plural	'Americans'

PDT	predeterminer	'all the kids'

POS	possessive ending	parent\'s

PRP	personal pronoun	I, he, she

PRP$	possessive pronoun	my, his, hers

RB	adverb	very, silently,

RBR	adverb, comparative	better

RBS	adverb, superlative	best

RP	particle	give up

TO	to	go 'to' the store.

UH	interjection	errrrrrrrm

VB	verb, base form	take

VBD	verb, past tense	took

VBG	verb, gerund/present participle	taking

VBN	verb, past participle	taken

VBP	verb, sing. present, non-3d	take

VBZ	verb, 3rd person sing. present	takes

WDT	wh-determiner	which

WP	wh-pronoun	who, what

WP$	possessive wh-pronoun	whose

WRB	wh-abverb	where, when



In [8]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()


[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

## Chunking

In [9]:
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)
            
            print(chunked)
            for subtree in chunked.subtrees(filter=lambda t: t.label() == 'Chunk'):
                print(subtree)

            chunked.draw()

    except Exception as e:
        print(str(e))

process_content()


(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  g

## Chinking

In [10]:
def process_content():
    try:
        for i in tokenized[:2]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)

            chunkGram = r"""Chunk: {<.*>+}
                                    }<VB.?|IN|DT|TO>+{"""

            chunkParser = nltk.RegexpParser(chunkGram)
            chunked = chunkParser.parse(tagged)

            chunked.draw()

    except Exception as e:
        print(str(e))

process_content()


## Lemmatizer
#### lemmatizer is stemming but it takes the context of the word into account

In [11]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

print(lemmatizer.lemmatize("cats"))
print(lemmatizer.lemmatize("cacti"))
print(lemmatizer.lemmatize("geese"))
print(lemmatizer.lemmatize("rocks"))
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))
print(lemmatizer.lemmatize("best", pos="a"))
print(lemmatizer.lemmatize("run"))
print(lemmatizer.lemmatize("run",'v'))

cat
cactus
goose
rock
python
good
best
run
run


### NER using NLTK

In [12]:
def process_content():
    try:
        for i in tokenized[:3]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=False)
            namedEnt.draw()
    except Exception as e:
        print(str(e))


process_content()

## Using SpaCy

In [34]:
import spacy
from spacy import displacy
import random
#!pip install PyPDF2
import PyPDF2
import requests
import json
nlp = spacy.load("en_core_web_sm")

In [14]:
doc1 = nlp('John lives in New York, where is Mr.Abhishek its already 11PM, he likes burger so he went to KFC')
for i in doc1.ents:
  print(i.text,'-',i.label_)

John - PERSON
New York - GPE
Abhishek - PERSON
11PM - CARDINAL


In [15]:
for token in doc1:
     print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

John John PROPN NNP nsubj Xxxx True False
lives live VERB VBZ ccomp xxxx True False
in in ADP IN prep xx True True
New New PROPN NNP compound Xxx True False
York York PROPN NNP pobj Xxxx True False
, , PUNCT , punct , False False
where where SCONJ WRB advmod xxxx True True
is be AUX VBZ relcl xx True True
Mr. Mr. PROPN NNP compound Xx. False False
Abhishek Abhishek PROPN NNP nsubj Xxxxx True False
its its PRON PRP$ poss xxx True True
already already ADV RB advmod xxxx True True
11PM 11pm NUM CD appos ddXX False False
, , PUNCT , punct , False False
he he PRON PRP nsubj xx True True
likes like VERB VBZ ROOT xxxx True False
burger burger NOUN NN dobj xxxx True False
so so SCONJ IN mark xx True True
he he PRON PRP nsubj xx True True
went go VERB VBD advcl xxxx True False
to to ADP IN prep xx True True
KFC KFC PROPN NNP pobj XXX True False


In [16]:
spacy.explain("CARDINAL")

'Numerals that do not fall under another type'

In [17]:
displacy.render(doc1, style='ent', jupyter=True)

In [18]:
displacy.render(nlp(str(doc1[0:])), style='dep', jupyter = True, options = {'distance': 120})

## Text Preprocessing

import sys,fitz

fname = '/kaggle/input/dataset-for-resume-information-retrieval/Alice Clark CV.pdf'

doc= fitz.open(fname)

alice_cv=""

for page in doc:

  alice_cv = alice_cv + str(page.getText())

print(alice_cv)


In [57]:
file1 = open("text1.txt").read()
file2 = open("text2.txt").read()
file3 = open("text3.txt").read()
file4 = open("text4.txt").read()
file5 = open("text5.txt").read()
myFile1 = nlp(file1)
myFile2 = nlp(file2)
myFile3 = nlp(file3)
myFile4 = nlp(file4)
myFile5 = nlp(file5)

In [58]:
displacy.render(myFile1, style='ent', jupyter=True)
#displacy.render(myFile1, style='ent')

In [59]:
displacy.render(myFile2, style='ent', jupyter=True)

In [60]:
displacy.render(myFile3, style='ent', jupyter=True)

In [61]:
displacy.render(myFile4, style='ent', jupyter=True)

In [62]:
displacy.render(myFile5, style='ent', jupyter=True)

In [41]:
displacy.render(nlp(str(doc_file[0:])), style='dep', jupyter = True, options = {'distance': 120})

In [28]:
fFileObj = open("shortstories.pdf", 'rb')
pdfReader = PyPDF2.PdfFileReader(fFileObj)
pageObj = pdfReader.getPage(0)
print("Total Pages : {} ".format(pdfReader.numPages))

resume = pageObj.extractText()

Total Pages : 5 


## Custom Name Entity Recognition on PDF Resume using NLP and spacy

In [19]:
class EntityGenerator(object):
    
    _slots__ = ['text']
    
    def __init__(self, text=None):
        self.text = text
        
    def get(self):
        """
        Return a Json
        """
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(self.text)
        text = [ent.text for ent in doc.ents]
        entity = [ent.label_ for ent in doc.ents]
    
        from collections import Counter
        import json

        data = Counter(zip(entity))
        unique_entity = list(data.keys())
        unique_entity = [x[0] for x in unique_entity]

        d = {}
        for val in unique_entity:
            d[val] = []

        for key,val in dict(zip(text, entity)).items():
            if val in unique_entity:
                d[val].append(key)
        return d

In [20]:
class Resume(object):
    def __init__(self, filename=None):
        self.filename = filename
        
    def get(self):
        """
        
        """
        fFileObj = open(self.filename, 'rb')
        pdfReader = PyPDF2.PdfFileReader(fFileObj)
        pageObj = pdfReader.getPage(0)
        print("Total Pages : {} ".format(pdfReader.numPages))

        resume = pageObj.extractText()
        return resume
        

In [21]:
resume = Resume(filename="shortstories.pdf")
response_news = resume.get()

Total Pages : 5 


In [24]:
displacy.render(resume, style='ent', jupyter=True)

TypeError: 'Resume' object is not iterable

In [22]:
helper = EntityGenerator(text=response_news)
response = helper.get()
print(json.dumps(response , indent=3))

{}


In [None]:
train_data = [('sentence with entity to be recognised', {'entities': [(21, 25, 'entityLabel')]}), 
              ('sentence with entity to be recognised', {'entities': [(21, 25, 'entityLabel')]}), 
              ('sentence with entity to be recognised', {'entities': [(21, 25, 'entityLabel')]})]

In [None]:
def train_spacy(data,iterations):
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp


prdnlp = train_spacy(TRAIN_DATA, 20)

# Save our trained Model
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)

#Test your text
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)