## Custom NER training in Spacy

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [4]:
doc = nlp("Australia wants to force Facebook and Google to pay media companies for news")

In [6]:
for ent in doc.ents:   # getting all the entities identified
    print(ent.text, ent.start_char, ent.end_char, ent.label_)    #labels for the entities

Australia 0 9 GPE
Facebook 25 33 ORG
Google 38 44 ORG


In [8]:
doc = nlp("I want to buy a television from the next amazon sale in december")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

amazon 41 47 ORG
december 56 64 DATE


In [9]:
doc = nlp("what is the process to open a new savings account")
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)  # does not recognize any NER - problems with financial/banking domains

In [11]:
ner = nlp.get_pipe('ner')   #NER pipeline

In [23]:
TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(19, 28, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(24,32, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16,19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20,29, "ORG")]}),
              ("Fridge can be ordered in Amazon ", {"entities": [(0,6, "PRODUCT")]}),
              ("I bought a new Washer", {"entities": [(16,22, "PRODUCT")]}),
              ("I bought a old table", {"entities": [(16,21, "PRODUCT")]}),
              ("I bought a fancy dress", {"entities": [(18,23, "PRODUCT")]}),
              ("I rented a camera", {"entities": [(12,18, "PRODUCT")]}),
              ("I rented a tent for our trip", {"entities": [(12,16, "PRODUCT")]}),
              ("I rented a screwdriver from our neighbour", {"entities": [(12,22, "PRODUCT")]}),
              ("I repaired my computer", {"entities": [(15,23, "PRODUCT")]}),
              ("I got my clock fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("I got my truck fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24,27, "ORG")]}),
              ("Flipkart is recognized as leader in market",{"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Swiggy", {"entities": [(24,29, "ORG")]})
              ]

In [12]:
TRAIN_DATA2 = {"classes":["NAME","COMPANY","ROLE","PERCENTAGE","WEEKDAY","MONEY","TIME PERIOD","STATES"],"annotations":[["The share price of PVR rose over 7 percent on Wednesday after the multiplex chain said that it has reduced losses in Q2 despite nil revenue from the core movie exhibition business.",{"entities":[[19,22,"COMPANY"],[33,42,"PERCENTAGE"],[46,55,"WEEKDAY"]]}],["The company managed to get rent waivers from most landlords, CFO Nitin Sood said in an interview to CNBC-TV18. “The big focus for us right now as revenues have been nil is to really reduce our fixed cost and we have managed to do that, ” he added.",{"entities":[[61,64,"ROLE"],[65,75,"NAME"]]}],["Sood further said that they have brought down the fixed cost down by almost 75-80 percent.",{"entities":[[0,4,"NAME"],[76,89,"PERCENTAGE"]]}],["The stock rose as much as 7.6 percent to the day's high of Rs 1,186.85 per share on the BSE.",{"entities":[[26,37,"PERCENTAGE"],[59,70,"PERCENTAGE"],[88,91,"COMPANY"]]}],["Meanwhile, for the September quarter, the company reported a consolidated net loss of Rs 184.06 crore versus a net profit of Rs 47.67 crore in the year ago quarter.",{"entities":[[86,101,"MONEY"],[125,139,"MONEY"]]}],["",{"entities":[]}],["Its total income was at Rs 110.61 crore during the quarter under review against Rs 979.40 crore in the corresponding quarter last fiscal. PVR's total expenses were at Rs 389.37 crore in July-September 2020-21.",{"entities":[[24,39,"MONEY"],[80,95,"MONEY"],[117,137,"TIME PERIOD"],[138,141,"COMPANY"],[167,182,"MONEY"],[186,208,"TIME PERIOD"]]}],["\" Financial performance of the company for Q2, FY 21 was impacted by the continued lockdown announced due to COVID-19 outbreak, which disrupted the company's operations, \" said PVR in a post-earnings statement.",{"entities":[[42,44,"TIME PERIOD"],[46,51,"TIME PERIOD"]]}],["PVR said its results were not comparable as business was impacted due to temporary closures of cinemas.",{"entities":[]}],["",{"entities":[]}],["Under Unlock 5.0 guidelines, the government has permitted cinemas to reopen October 15 onwards with 50 percent capacity. So far , 16 states and UTs, where PVR has a presence, have permitted cinemas to restart operations. Out of total of 831 screens of the company, over 575 have received permission to reopen, it said.",{"entities":[[76,86,"TIME PERIOD"],[100,110,"PERCENTAGE"],[154,157,"COMPANY"]]}],["PVR Ltd Chairman cum Managing Director Ajay Bijli said: \" We are eagerly waiting for the reopening of other states, specifically Maharashtra and Telangana so that business can gradually get back to normal. We are taking all possible precautions so that both our customers and employees feel safe while visiting their favourite cinema . \"",{"entities":[[8,16,"ROLE"],[21,38,"ROLE"],[39,49,"NAME"],[128,139,"ROLE"],[144,153,"ROLE"]]}]]}

In [15]:
TRAIN_DATA2['annotations']

[['The share price of PVR rose over 7 percent on Wednesday after the multiplex chain said that it has reduced losses in Q2 despite nil revenue from the core movie exhibition business.',
  {'entities': [[19, 22, 'COMPANY'],
    [33, 42, 'PERCENTAGE'],
    [46, 55, 'WEEKDAY']]}],
 ['The company managed to get rent waivers from most landlords, CFO Nitin Sood said in an interview to CNBC-TV18. “The big focus for us right now as revenues have been nil is to really reduce our fixed cost and we have managed to do that, ” he added.',
  {'entities': [[61, 64, 'ROLE'], [65, 75, 'NAME']]}],
 ['Sood further said that they have brought down the fixed cost down by almost 75-80 percent.',
  {'entities': [[0, 4, 'NAME'], [76, 89, 'PERCENTAGE']]}],
 ["The stock rose as much as 7.6 percent to the day's high of Rs 1,186.85 per share on the BSE.",
  {'entities': [[26, 37, 'PERCENTAGE'],
    [59, 70, 'PERCENTAGE'],
    [88, 91, 'COMPANY']]}],
 ['Meanwhile, for the September quarter, the company reported a 

In [25]:
for _,annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [16]:
for lab in TRAIN_DATA2['classes']:
    ner.add_label(lab)     #adding custom labels to the ner labels

In [18]:
disable_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']    #other pipes to disbale that we dont want to train
disable_pipes

['tagger', 'parser']

In [19]:
import random
from spacy.util import minibatch, compounding
from pathlib import Path

In [26]:
with nlp.disable_pipes(*disable_pipes):
    optimizer = nlp.resume_training()
    
    for iteration in range(100):
        random.shuffle(TRAIN_DATA)
        losses = {}
        
        batches = minibatch(TRAIN_DATA, size = compounding(1.0,4.0, 1.001))
        for batch in batches:
            text, annotation = zip(*batch)
            nlp.update(
                text,
                annotation,
                drop=0.5,
                losses=losses,
                sgd=optimizer
            )
            print("losses", losses)

losses {'ner': 6.754277605277821}
losses {'ner': 9.292915094359222}
losses {'ner': 11.854148747009214}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


losses {'ner': 17.91314688841161}
losses {'ner': 22.0542842801406}
losses {'ner': 24.021386110603444}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


losses {'ner': 30.88208315588726}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


losses {'ner': 39.269546010272634}
losses {'ner': 46.35796491123928}
losses {'ner': 56.37222568012966}
losses {'ner': 62.66951333794633}
losses {'ner': 63.65959938443169}
losses {'ner': 65.12154739434061}


  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)


losses {'ner': 68.03408072972832}
losses {'ner': 74.40280036712227}
losses {'ner': 77.26427721272454}

  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)
  gold = GoldParse(doc, **gold)



losses {'ner': 81.1310039905382}
losses {'ner': 91.56017824160953}
losses {'ner': 97.39594792473217}
losses {'ner': 4.311568826733719}
losses {'ner': 7.484435189302758}
losses {'ner': 13.128083916448283}
losses {'ner': 18.310468906199375}
losses {'ner': 24.684628120130625}
losses {'ner': 32.77574097628677}
losses {'ner': 36.817865133280506}
losses {'ner': 40.214829706826535}
losses {'ner': 44.670802861436385}
losses {'ner': 49.79972542314269}
losses {'ner': 54.30314596344058}
losses {'ner': 60.35743494678561}
losses {'ner': 67.28128810573642}
losses {'ner': 69.48383943659982}
losses {'ner': 71.8493024090912}
losses {'ner': 77.90087471272813}
losses {'ner': 82.20095656841835}
losses {'ner': 84.99197208696792}
losses {'ner': 94.72119611317108}
losses {'ner': 6.370906876519257}
losses {'ner': 12.849798398211533}
losses {'ner': 17.24508275857488}
losses {'ner': 25.112048101961136}
losses {'ner': 32.15209941694093}
losses {'ner': 35.50701874743254}
losses {'ner': 37.6628119356742}
losses {

losses {'ner': 46.02758161571871}
losses {'ner': 48.99650575450557}
losses {'ner': 53.799139338051255}
losses {'ner': 55.51930814647029}
losses {'ner': 60.85002535247157}
losses {'ner': 67.68817362599918}
losses {'ner': 69.89209084995935}
losses {'ner': 74.19355504044542}
losses {'ner': 74.23866178363149}
losses {'ner': 5.346098181267735}
losses {'ner': 9.618432005343493}
losses {'ner': 11.943687194644212}
losses {'ner': 12.015860384527628}
losses {'ner': 13.174688599318074}
losses {'ner': 16.02383867767469}
losses {'ner': 18.592472599119105}
losses {'ner': 22.621874142860293}
losses {'ner': 28.775933880781054}
losses {'ner': 33.32484610793483}
losses {'ner': 40.13712083677831}
losses {'ner': 45.316642585574414}
losses {'ner': 50.15957608625479}
losses {'ner': 56.22651257917471}
losses {'ner': 58.4121584127862}
losses {'ner': 63.669246267670395}
losses {'ner': 68.21182390689253}
losses {'ner': 73.33660539626479}
losses {'ner': 75.14102343696621}
losses {'ner': 6.697275489568786}
losses

losses {'ner': 24.094509746384574}
losses {'ner': 29.2579471678182}
losses {'ner': 33.54107381695394}
losses {'ner': 36.97307910747804}
losses {'ner': 40.24748494917185}
losses {'ner': 42.86173449897797}
losses {'ner': 48.142635364771195}
losses {'ner': 53.8433045854199}
losses {'ner': 61.308716807268866}
losses {'ner': 63.52760269095779}
losses {'ner': 67.49590394621373}
losses {'ner': 72.3776904533006}
losses {'ner': 76.3842183539964}
losses {'ner': 79.97712556857732}
losses {'ner': 2.0986529825612052}
losses {'ner': 6.5313019621815585}
losses {'ner': 12.945864098116388}
losses {'ner': 16.72405889464683}
losses {'ner': 16.786448802441853}
losses {'ner': 22.261234100550908}
losses {'ner': 27.88243729725157}
losses {'ner': 34.23202798619543}
losses {'ner': 39.468965240862715}
losses {'ner': 42.10472045333093}
losses {'ner': 47.29244694068559}
losses {'ner': 47.29284945981941}
losses {'ner': 50.47546194600305}
losses {'ner': 54.69903381275377}
losses {'ner': 54.76103791543551}
losses {'

losses {'ner': 65.70301766081118}
losses {'ner': 2.9151393214356176}
losses {'ner': 6.080079130080321}
losses {'ner': 6.136442743754856}
losses {'ner': 7.1650132111553475}
losses {'ner': 8.516471241315081}
losses {'ner': 13.029340803487536}
losses {'ner': 17.08308684812331}
losses {'ner': 19.833154317529498}
losses {'ner': 21.762126441891688}
losses {'ner': 24.0966980960579}
losses {'ner': 29.70650883581353}
losses {'ner': 34.73221706230057}
losses {'ner': 39.10763215768002}
losses {'ner': 46.53037301572822}
losses {'ner': 48.964100926706145}
losses {'ner': 51.932828709074805}
losses {'ner': 51.93318049649657}
losses {'ner': 59.47816916967691}
losses {'ner': 63.82058465212453}
losses {'ner': 4.019549745424001}
losses {'ner': 12.820798593862264}
losses {'ner': 14.831112665333194}
losses {'ner': 19.742803064383907}
losses {'ner': 21.236488540516575}
losses {'ner': 25.396036348411258}
losses {'ner': 28.436083374650593}
losses {'ner': 32.800545969265045}
losses {'ner': 40.918466607327105}


losses {'ner': 55.68752779692631}
losses {'ner': 61.93307104157038}
losses {'ner': 66.25859787025973}
losses {'ner': 69.51186477103755}
losses {'ner': 75.23055477896258}
losses {'ner': 78.33215547948629}
losses {'ner': 82.26657757386118}
losses {'ner': 3.416253590087102}
losses {'ner': 8.879699015120671}
losses {'ner': 9.914254852293318}
losses {'ner': 12.413541023342198}
losses {'ner': 15.640010158709249}
losses {'ner': 17.537123975775632}
losses {'ner': 21.667412285569544}
losses {'ner': 24.961556606802578}
losses {'ner': 28.269572936806316}
losses {'ner': 35.15152604213678}
losses {'ner': 41.30591515904986}
losses {'ner': 46.400062754190586}
losses {'ner': 49.575651010073344}
losses {'ner': 56.52842343928627}
losses {'ner': 60.297467208973686}
losses {'ner': 64.46398331467266}
losses {'ner': 69.78261177737626}
losses {'ner': 72.0557502701446}
losses {'ner': 75.29457412275373}
losses {'ner': 0.14367373756249435}
losses {'ner': 4.468391860443262}
losses {'ner': 8.74341487379661}
losse

losses {'ner': 29.266648834236094}
losses {'ner': 31.262622740163806}
losses {'ner': 34.69424605094014}
losses {'ner': 37.292084884635415}
losses {'ner': 41.37093714826392}
losses {'ner': 45.208132987996805}
losses {'ner': 50.50370918881731}
losses {'ner': 55.2700941480382}
losses {'ner': 59.13521382460954}
losses {'ner': 63.390536252657284}
losses {'ner': 63.542534554302975}
losses {'ner': 68.40455176172541}
losses {'ner': 71.83626140145348}
losses {'ner': 77.80094159645505}
losses {'ner': 0.028948612177714494}
losses {'ner': 4.296009026134357}
losses {'ner': 10.593760482633456}
losses {'ner': 18.462809465492114}
losses {'ner': 19.840729864012303}
losses {'ner': 23.85863696194798}
losses {'ner': 29.448636160897802}
losses {'ner': 31.775024170059663}
losses {'ner': 36.8355375723841}
losses {'ner': 39.091479049056986}
losses {'ner': 42.12638518679164}
losses {'ner': 45.070637617229906}
losses {'ner': 47.8973342410075}
losses {'ner': 50.84215191220264}
losses {'ner': 53.71109382276754}
l

losses {'ner': 62.34876826343244}
losses {'ner': 67.11728591508583}
losses {'ner': 5.16470229732626}
losses {'ner': 8.385689561393194}
losses {'ner': 11.502654328177998}
losses {'ner': 11.568462691771531}
losses {'ner': 16.989285469682308}
losses {'ner': 16.989511196843985}
losses {'ner': 18.590811491820666}
losses {'ner': 22.036260248038623}
losses {'ner': 24.997283109340522}
losses {'ner': 28.603619263384196}
losses {'ner': 34.52231269969389}
losses {'ner': 36.78275363197375}
losses {'ner': 42.32327192326832}
losses {'ner': 46.34313461234618}
losses {'ner': 50.642737549524924}
losses {'ner': 56.4881827205235}
losses {'ner': 60.30251100569114}
losses {'ner': 60.30257862962171}
losses {'ner': 67.35870832360669}
losses {'ner': 4.647104382514954}
losses {'ner': 10.497787192463875}
losses {'ner': 12.613596942657182}
losses {'ner': 17.34066565139888}
losses {'ner': 20.1284707350256}
losses {'ner': 29.346226767969327}
losses {'ner': 33.937023056634715}
losses {'ner': 38.23862959594173}
loss

losses {'ner': 39.16793734691518}
losses {'ner': 45.97479350231069}
losses {'ner': 50.54634936621832}
losses {'ner': 53.602226666081236}
losses {'ner': 57.80437379227905}
losses {'ner': 63.85329116033344}
losses {'ner': 66.98790102042366}
losses {'ner': 71.68936669923241}
losses {'ner': 4.795735836029053}
losses {'ner': 10.659313774549446}
losses {'ner': 15.512659883939705}
losses {'ner': 18.25635570927807}
losses {'ner': 21.399850758670567}
losses {'ner': 23.502955467390706}
losses {'ner': 27.219596038095133}
losses {'ner': 28.890948581031935}
losses {'ner': 33.025378039302396}
losses {'ner': 35.506573213460015}
losses {'ner': 38.83248648392977}
losses {'ner': 40.67351358176838}
losses {'ner': 44.522407663923694}
losses {'ner': 48.80841099954734}
losses {'ner': 53.70919481776247}
losses {'ner': 55.842738351806766}
losses {'ner': 60.60114510149317}
losses {'ner': 66.11669441849546}
losses {'ner': 67.91412556859751}
losses {'ner': 1.5973710199905327}
losses {'ner': 6.840226078973501}
lo

In [29]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities []
Entities []
Entities []
Entities [('BMW', 'PRODUCT')]
Entities []
Entities [('Flipkart', 'ORG')]
Entities []
Entities []
Entities [('Max', 'ORG')]
Entities [('ShopClues', 'ORG')]
Entities []
Entities []
Entities [('Fridge', 'PRODUCT')]
Entities []
Entities [('Walmart', 'ORG')]
Entities []
Entities []
Entities [('Flipkart', 'ORG')]
Entities []
