In [1]:
import spacy
from spacy import displacy
import random

In [2]:
# nlp = spacy.load('en')
nlp = spacy.load('en_core_web_lg')
import warnings; warnings.simplefilter('ignore')

In [3]:
len(nlp.vocab)

1340242

In [4]:
doc = nlp("Let's trade an interest rate swap")
for token in doc:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))

Let	0	let	False	False	Xxx	VERB	VB
's	3	-PRON-	False	False	'x	PRON	PRP
trade	6	trade	False	False	xxxx	VERB	VB
an	12	an	False	False	xx	DET	DT
interest	15	interest	False	False	xxxx	NOUN	NN
rate	24	rate	False	False	xxxx	NOUN	NN
swap	29	swap	False	False	xxxx	NOUN	NN


In [5]:
from spacy import displacy 
doc = nlp('I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ. WSJ means the Wall Street Journal')
displacy.render(doc, style='ent', jupyter=True)

In [20]:
a = nlp.get_pipe('ner')

In [21]:
custom_entities = ['FIN_PRODUCT', 'SPREAD', 'ASSET CLASS', 'RFQ', 'TRADE_EXECUTION']
for ent in custom_entities:
    if 'extra_labels' in a.cfg and ent in a.cfg['extra_labels']:
        pass
    else:
        a.add_label(ent)

In [8]:
a.cfg['extra_labels'] = custom_entities
a.cfg

{'beam_width': 1,
 'beam_density': 0.0,
 'beam_update_prob': 1.0,
 'cnn_maxout_pieces': 3,
 'deprecation_fixes': {'vectors_name': 'en_model.vectors'},
 'nr_class': 93,
 'hidden_depth': 1,
 'token_vector_width': 96,
 'hidden_width': 64,
 'maxout_pieces': 2,
 'pretrained_vectors': 'en_model.vectors',
 'bilstm_depth': 0,
 'extra_labels': ['FIN_PRODUCT',
  'SPREAD',
  'ASSET CLASS',
  'RFQ',
  'TRADE_EXECUTION']}

In [9]:
def find_label(text, labels = {}):
    l = []
    for label in labels:
        if label not in text:
            print(f"{label} not found in '{text}'")
        else:
            idx = text.index(label)
            l.append((idx, idx + len(label), labels[label]))
            
    return (text, {'entities': l})

In [10]:
TRAIN_DATA = [
    find_label("We price the swap at 54 bips", {"We price": "RFQ", "swap":"FIN_PRODUCT", "54 bips":"CARDINAL"}),
    find_label("TD offers 1.123 on 10 million of CADUSD", {"offers": "RFQ", "swap":"CADUSD", "1.123":"CARDINAL", '10 million':"CARDINAL", "CADUSD":"FIN_PRODUCT"})
]
TRAIN_DATA

swap not found in 'TD offers 1.123 on 10 million of CADUSD'


[('We price the swap at 54 bips',
  {'entities': [(0, 8, 'RFQ'),
    (13, 17, 'FIN_PRODUCT'),
    (21, 28, 'CARDINAL')]}),
 ('TD offers 1.123 on 10 million of CADUSD',
  {'entities': [(3, 9, 'RFQ'),
    (10, 15, 'CARDINAL'),
    (19, 29, 'CARDINAL'),
    (33, 39, 'FIN_PRODUCT')]})]

In [11]:
# Add entity recognizer to model if it's not in the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
# otherwise, get it, so we can add labels to it
else:
    ner = nlp.get_pipe('ner')

#optimizer = nlp.begin_training()
optimizer = nlp.entity.create_optimizer()
#Use create optimizer to avoid wipping old data

In [12]:
TRAIN_DATA

[('We price the swap at 54 bips',
  {'entities': [(0, 8, 'RFQ'),
    (13, 17, 'FIN_PRODUCT'),
    (21, 28, 'CARDINAL')]}),
 ('TD offers 1.123 on 10 million of CADUSD',
  {'entities': [(3, 9, 'RFQ'),
    (10, 15, 'CARDINAL'),
    (19, 29, 'CARDINAL'),
    (33, 39, 'FIN_PRODUCT')]})]

In [13]:
from spacy.util import minibatch
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(20):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA) #, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print('Losses', losses)

Losses {'ner': 19.73893654346466}
Losses {'ner': 18.226775765419006}
Losses {'ner': 16.824986525542045}
Losses {'ner': 11.79352217912674}
Losses {'ner': 14.629473149776459}
Losses {'ner': 16.670366644859314}
Losses {'ner': 14.710826933383942}
Losses {'ner': 16.95714807510376}
Losses {'ner': 18.974334239959717}
Losses {'ner': 11.14975337049691}
Losses {'ner': 10.596344947814941}
Losses {'ner': 16.879505856893957}
Losses {'ner': 14.769561789111322}
Losses {'ner': 21.876827001571655}
Losses {'ner': 18.80102777481079}
Losses {'ner': 13.055740110576153}
Losses {'ner': 10.074358435347676}
Losses {'ner': 16.169899128377438}
Losses {'ner': 11.406005442142487}
Losses {'ner': 13.716003030538559}


In [14]:
displacy.render(nlp("We price the USDCAD at 54 bips"), style='ent', jupyter=True)

In [15]:
doc = nlp("We price the USDCAD at 54 bips. Do you want to trade that?")

displacy.render(doc, style='ent', jupyter=True)

In [16]:
doc.ents[0].label_

'QUANTITY'

In [23]:
#Example of overriding ents explicitly
from spacy.tokens import Span
label_hash = doc.vocab.strings["RFQ"]
doc.ents = (doc.ents[0], doc.ents[1], doc.ents[2], Span(doc, 8, 13,label_hash))

IndexError: tuple index out of range

In [24]:
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# list(nlp.vocab.strings).index("CURRENCY")

In [25]:
doc.vocab.strings["RFQ"]

18207045277650155119

In [26]:
class CurrencyPairPipeline(object):
    def __init__(self, nlp):
        self.label_hash = nlp.vocab.strings["CURRENCY"]
        self.regex_ = r"[USD|EUR|GBP|JPY|CAD][A-Z|a-z]{3}|[A-Z|a-z]{3}[USD|EUR|GBP|JPY|CAD]"
 
    def __call__(self, doc):
        import re
        new_tokens = doc.ents
        for idx, token in enumerate(doc):
            #Checking if a currency pair, e.g. USDEUR, EURUSD, etc    
            if re.search(self.regex_, token.text):
                #We found a match so need to update the entities
                span = Span(doc, idx, idx+1, label=self.label_hash)
                #Spacy only supports one label per phrase, so need to conditionally replace (e.g. USDEUR may be wrongly labelled as an ORG or GPE)
                new_tokens = tuple([t for t in new_tokens if t.start != idx]) + (span,)
        doc.ents = new_tokens
        return doc

In [27]:
from spacy.tokens import Token
 
ccy_pipeline = CurrencyPairPipeline(nlp)
try:
    nlp.remove_pipe(name='ccy_pipeline')
except:
    print("Couldn't remove pipe")
nlp.add_pipe(ccy_pipeline, name='ccy_pipeline')


Couldn't remove pipe


In [32]:
doc = nlp("Paris is the awesome capital of France. They use the euro. The current USDEUR rate is 1.112 to exchange currencies")
displacy.render(doc, style='ent', jupyter=True)

In [29]:
doc = nlp("EURUSD is more stable than USDGBP at the moment")
displacy.render(doc, style='ent', jupyter=True)

In [30]:
doc = nlp("1.123")
displacy.render(doc, style='ent', jupyter=True)

In [31]:
doc = nlp("We price EURUSD at 1.124. Do you want to trade it?")
displacy.render(doc, style='ent', jupyter=True)