In [None]:
!pip install --user pip setuptools wheel
!pip install --user spacy
python -m spacy download en_core_web_sm

In [1]:
import spacy 
import spacy.displacy as displacy
import pandas as pd 

In [2]:
nlp = spacy.load('en_core_web_sm')


In [None]:
text = """
Facebook is an American online social media and social networking service owned by Meta Platforms. Founded in 2004 by Mark Zuckerberg with fellow Harvard College students and roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes, its name comes from the face book directories often given to American university students. Membership was initially limited to Harvard students, gradually expanding to other North American universities and, since 2006, anyone over 13 years old. As of 2020, Facebook claimed 2.8 billion monthly active users,[2] and ranked seventh in global internet usage.[7] It was the most downloaded mobile app of the 2010s.
"""

In [None]:
doc = nlp(text)
print(doc)


Facebook is an American online social media and social networking service owned by Meta Platforms. Founded in 2004 by Mark Zuckerberg with fellow Harvard College students and roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz, and Chris Hughes, its name comes from the face book directories often given to American university students. Membership was initially limited to Harvard students, gradually expanding to other North American universities and, since 2006, anyone over 13 years old. As of 2020, Facebook claimed 2.8 billion monthly active users,[2] and ranked seventh in global internet usage.[7] It was the most downloaded mobile app of the 2010s.



In [None]:
for token in doc:
  print(token.text, token.pos_, token.tag_, token.dep_, token.lemma_, token.is_stop)


 SPACE _SP  
 False
Facebook PROPN NNP nsubj Facebook False
is AUX VBZ ROOT be True
an DET DT det an True
American ADJ JJ amod american False
online ADJ JJ amod online False
social ADJ JJ amod social False
media NOUN NNS attr medium False
and CCONJ CC cc and True
social ADJ JJ amod social False
networking NOUN NN compound networking False
service NOUN NN conj service False
owned VERB VBN acl own False
by ADP IN agent by True
Meta PROPN NNP compound Meta False
Platforms PROPN NNPS pobj Platforms False
. PUNCT . punct . False
Founded VERB VBN advcl found False
in ADP IN prep in True
2004 NUM CD pobj 2004 False
by ADP IN agent by True
Mark PROPN NNP compound Mark False
Zuckerberg PROPN NNP pobj Zuckerberg False
with ADP IN prep with True
fellow ADJ JJ compound fellow False
Harvard PROPN NNP compound Harvard False
College PROPN NNP compound College False
students NOUN NNS pobj student False
and CCONJ CC cc and True
roommates VERB VBZ conj roommate False
Eduardo PROPN NNP compound Eduardo 

In [None]:
text_details = pd.DataFrame([(token.text, token.pos_, token.tag_, token.dep_, token.lemma_, token.is_stop) for token in doc], 
                            columns = ['Text', 'Pos', 'tag', 'dep', 'lemm', 'stop_bool'])

In [None]:
text_details

Unnamed: 0,Text,Pos,tag,dep,lemm,stop_bool
0,\n,SPACE,_SP,,\n,False
1,Facebook,PROPN,NNP,nsubj,Facebook,False
2,is,AUX,VBZ,ROOT,be,True
3,an,DET,DT,det,an,True
4,American,ADJ,JJ,amod,american,False
...,...,...,...,...,...,...
111,of,ADP,IN,prep,of,True
112,the,DET,DT,det,the,True
113,2010s,NUM,CD,pobj,2010s,False
114,.,PUNCT,.,punct,.,False


In [None]:
entity_details = pd.DataFrame([(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents] , 
                              columns = ['Text', 'Start', 'end', 'label'])

In [None]:
entity_details

Unnamed: 0,Text,Start,end,label
0,American,16,24,NORP
1,Meta Platforms,84,98,ORG
2,2004,111,115,DATE
3,Mark Zuckerberg,119,134,PERSON
4,Harvard College,147,162,ORG
5,Eduardo Saverin,186,201,PERSON
6,Andrew McCollum,203,218,PERSON
7,Dustin Moskovitz,220,236,PERSON
8,Chris Hughes,242,254,PERSON
9,American,317,325,NORP


In [None]:
displacy.render(doc, style='dep', jupyter=True)

In [None]:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
text = """
The COVID-19 pandemic in India is a part of the worldwide pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). As of 27 September 2021, according to official figures, India has the second-highest number of confirmed cases in the world (after the United States of America) with 33,678,786 reported cases of COVID-19 infection and the third-highest number of COVID-19 deaths (after the United States and Brazil) at 482,017[4] deaths.[6][7][8] However these figures exhibit severe under-reporting.
"""

In [None]:
displacy.render(nlp(text), style='ent', jupyter=True)

In [7]:
train_dataset = [
         ("Money transfer from my savings account is not working", {"entities": [(0, 14, "ACTIVITY"), (23, 38, 'PRODUCT')]}),
         ("I want to check balance in my savings account", {"entities": [(10, 15, "ACTIVITY"), (30, 45, 'PRODUCT')]}),
         ("I suspect a fraud in my credit card account", {"entities": [(12, 17, "ACTIVITY"), (24, 35, 'PRODUCT')]}),
         ("I am here for opening a new savings account", {"entities": [(14, 21, "ACTIVITY"), (28, 43, 'PRODUCT')]}),
         ("Your mortgage is in delinquent status", {"entities": [(20, 30, "ACTIVITY"), (5, 13, 'PRODUCT')]}),
         ("Your credit card is in past due status", {"entities": [(30, 41, "ACTIVITY"), (5, 16, 'PRODUCT')]}),
         ("My loan account is still not approved and funded", {"entities": [(25, 37, "ACTIVITY"), (3, 15, 'PRODUCT'), (42, 48, "ACTIVITY")]}),
         ("How do I open a new loan account", {"entities": [(9, 13, "ACTIVITY"), (20, 32, 'PRODUCT')]}),
         ("What are the charges on Investment account", {"entities": [(13, 20, "ACTIVITY"), (24, 42, 'PRODUCT')]}),
         ("Can you explain late charges on my credit card", {"entities": [(16, 28, "ACTIVITY"), (35, 49, 'PRODUCT')]}),
         ("I want to open a new loan account", {"entities": [(10, 14, "ACTIVITY"), (21, 33, 'PRODUCT')]}),
         ("Can you help updating payment on my credit card", {"entities": [(22, 29, "ACTIVITY"), (36, 47, 'PRODUCT')]}),
         ("When is the payment due date on my card", {"entities": [(12, 19, "ACTIVITY"), (35, 39, 'PRODUCT')]})
        ]

In [3]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [5]:
ner = nlp.get_pipe('ner')

In [8]:
for _, annotations in train_dataset:
  print(annotations)
  for ent in annotations.get('entities'):
    ner.add_label(ent[2])

{'entities': [(0, 14, 'ACTIVITY'), (23, 38, 'PRODUCT')]}
{'entities': [(10, 15, 'ACTIVITY'), (30, 45, 'PRODUCT')]}
{'entities': [(12, 17, 'ACTIVITY'), (24, 35, 'PRODUCT')]}
{'entities': [(14, 21, 'ACTIVITY'), (28, 43, 'PRODUCT')]}
{'entities': [(20, 30, 'ACTIVITY'), (5, 13, 'PRODUCT')]}
{'entities': [(30, 41, 'ACTIVITY'), (5, 16, 'PRODUCT')]}
{'entities': [(25, 37, 'ACTIVITY'), (3, 15, 'PRODUCT'), (42, 48, 'ACTIVITY')]}
{'entities': [(9, 13, 'ACTIVITY'), (20, 32, 'PRODUCT')]}
{'entities': [(13, 20, 'ACTIVITY'), (24, 42, 'PRODUCT')]}
{'entities': [(16, 28, 'ACTIVITY'), (35, 49, 'PRODUCT')]}
{'entities': [(10, 14, 'ACTIVITY'), (21, 33, 'PRODUCT')]}
{'entities': [(22, 29, 'ACTIVITY'), (36, 47, 'PRODUCT')]}
{'entities': [(12, 19, 'ACTIVITY'), (35, 39, 'PRODUCT')]}


In [9]:
import random
import numpy as np
from spacy.util import minibatch, compounding


In [11]:
disable_pipe = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*disable_pipe):
  optimizer = nlp.resume_training()
  for iteration in range(0, 100):
    random.shuffle(train_dataset)
    losses = {}
    for batch in spacy.util.minibatch(train_dataset, size=compounding(1,16, 1.001)):
      for text, annotations in batch:
        nlp.update([text], [annotations], drop=0.5, losses=losses, sgd= optimizer)
        print("losses {}".format(losses))

losses {'ner': 8.88503921125422}
losses {'ner': 20.891683002177544}
losses {'ner': 30.18812134952615}
losses {'ner': 38.80134700748469}
losses {'ner': 50.09378820814187}
losses {'ner': 60.90382737078137}
losses {'ner': 70.30730529286332}
losses {'ner': 81.06249503774764}
losses {'ner': 87.46021834966409}
losses {'ner': 96.24490115997064}
losses {'ner': 104.92089327235193}
losses {'ner': 110.31069802621933}
losses {'ner': 120.95240508894905}
losses {'ner': 11.232492223381996}
losses {'ner': 21.15143851423636}
losses {'ner': 27.06145025690298}
losses {'ner': 38.24009658343433}
losses {'ner': 47.28686550540018}
losses {'ner': 51.43353313015117}
losses {'ner': 58.22165807461291}
losses {'ner': 64.18456329693012}
losses {'ner': 69.75351712022834}
losses {'ner': 78.27102706248647}
losses {'ner': 87.8504364783348}
losses {'ner': 97.244621125521}
losses {'ner': 108.09519595420532}
losses {'ner': 7.437473441331974}
losses {'ner': 13.499311512834254}
losses {'ner': 22.559341335294164}
losses {'n

In [12]:
for text, _ in train_dataset:
  doc = nlp(text)
  print("entities", [(i.text, i.label_) for i in doc.ents])

entities [('loan account', 'PRODUCT'), ('funded', 'ACTIVITY')]
entities [('open', 'ACTIVITY'), ('loan account', 'PRODUCT')]
entities [('credit card', 'PRODUCT')]
entities [('savings account', 'PRODUCT')]
entities [('credit card', 'PRODUCT')]
entities [('mortgage', 'PRODUCT'), ('delinquent', 'ACTIVITY')]
entities [('Investment account', 'PRODUCT')]
entities [('savings account', 'PRODUCT')]
entities [('credit card', 'PRODUCT')]
entities [('open', 'ACTIVITY'), ('loan account', 'PRODUCT')]
entities []
entities [('Money transfer', 'ACTIVITY'), ('savings account', 'PRODUCT')]
entities [('credit card', 'PRODUCT')]


In [13]:
from spacy import displacy
doc = nlp("I need to apply for credit card")
displacy.render(nlp(doc.text), style='ent', jupyter=True)

In [15]:
from spacy import displacy
doc = nlp("I want to open a current account for my business")
displacy.render(nlp(doc.text), style='ent', jupyter=True)

In [16]:
from spacy import displacy
doc = nlp("I want to open a demat account for my business")
displacy.render(nlp(doc.text), style='ent', jupyter=True)