In [4]:
import re

We will see how to use PhraseMatcher component to build domain specific Product Entity Recognition and also add custom attributes to our entity.

In [42]:
#The below list of products are obtained from the Red Hat Product Life
#Cycle API
# curl -vk "https://access.redhat.com/labs/plccapi/lifecycle.json" | jq '.[].name'

In [19]:
text = '''"Red Hat Enterprise Linux"
"Red Hat OpenStack Platform"
"Openshift Container Platform"
"Red Hat CloudForms"
"Red Hat Certificate System"
"Red Hat Directory Server"
"Red Hat Developer Toolset"
"Red Hat Virtualization"
"Red Hat Satellite Server"
"Red Hat JBoss Enterprise Application Platform"
"Red Hat JBoss Web Server"
"Red Hat Developer Studio"
"Red Hat JBoss Fuse Service Works"
"Red Hat JBoss Data Virtualization"
"Red Hat Process Automation Manager"
"Red Hat Decision Manager"
"Red Hat JBoss Operations Network"
"Red Hat Data Grid"
"Red Hat Fuse"
"Red Hat AMQ"
"Red Hat Gluster Storage"
"Red Hat Ceph Storage"
"Red Hat 3scale API Management Platform"
"Red Hat Software Collections"
"Red Hat Container Development Kit"
"Red Hat Single Sign-On"
".Net Core"
"Apache HTTP Server"
"Ansible Tower"
"OpenJDK"
"Red Hat CodeReady Workspaces"'''

In [31]:
text = re.sub("\"", " ", text)
raw_products = text.split('\n')

In [32]:
PRODUCTS = [product.strip() for product in raw_products];products_final

['Red Hat Enterprise Linux',
 'Red Hat OpenStack Platform',
 'Openshift Container Platform',
 'Red Hat CloudForms',
 'Red Hat Certificate System',
 'Red Hat Directory Server',
 'Red Hat Developer Toolset',
 'Red Hat Virtualization',
 'Red Hat Satellite Server',
 'Red Hat JBoss Enterprise Application Platform',
 'Red Hat JBoss Web Server',
 'Red Hat Developer Studio',
 'Red Hat JBoss Fuse Service Works',
 'Red Hat JBoss Data Virtualization',
 'Red Hat Process Automation Manager',
 'Red Hat Decision Manager',
 'Red Hat JBoss Operations Network',
 'Red Hat Data Grid',
 'Red Hat Fuse',
 'Red Hat AMQ',
 'Red Hat Gluster Storage',
 'Red Hat Ceph Storage',
 'Red Hat 3scale API Management Platform',
 'Red Hat Software Collections',
 'Red Hat Container Development Kit',
 'Red Hat Single Sign-On',
 '.Net Core',
 'Apache HTTP Server',
 'Ansible Tower',
 'OpenJDK',
 'Red Hat CodeReady Workspaces']

In [45]:
CANONICALS = {
    'Red Hat Enterprise Linux': 'rhel',
    'Red Hat OpenStack Platform': 'openstack',
    'Openshift Container Platform': 'openshift',
    'Red Hat CloudForms': 'cloudforms',
    'Red Hat Certificate System': 'certificate',
    'Red Hat Directory Server': 'directory_server',
    'Red Hat Developer Toolset': 'developer_toolset',
    'Red Hat Virtualization': 'rhv',
    'Red Hat Satellite Server': 'satallite',
    'Red Hat JBoss Enterprise Application Platform': 'eap',
    'Red Hat JBoss Web Server': 'jboss_web_server',
    'Red Hat Developer Studio': 'rhds',
    'Red Hat JBoss Fuse Service Works': 'fuse_service',
    'Red Hat JBoss Data Virtualization': 'datavirt',
    'Red Hat Process Automation Manager': 'pam',
    'Red Hat Decision Manager': 'rhdm',
    'Red Hat JBoss Operations Network': 'jon',
    'Red Hat Data Grid': 'datagrid',
    'Red Hat Fuse': 'fuse',
    'Red Hat AMQ': 'amq',
    'Red Hat Gluster Storage': 'gluster',
    'Red Hat Ceph Storage': 'ceph',
    'Red Hat 3scale API Management Platform': '3scale',
    'Red Hat Software Collections': 'rhscl',
    'Red Hat Single Sign-On': 'sso',
    '.Net Core': '.net',
    'Apache HTTP Server': 'apache_http',
    'Ansible Tower': 'tower',
    'OpenJDK': 'openjdk',
    'Red Hat CodeReady Workspaces': 'codeready'}

In [27]:
import spacy
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

In [28]:
nlp = spacy.load('en_core_web_lg')

In [46]:
def products_component(doc):
    matcher = PhraseMatcher(nlp.vocab)
    patterns = list(nlp.pipe(PRODUCTS))
    matcher.add('PRODUCT', None, *patterns)
    # Create an entity Span with the label 'PRODUCT' for all matches
    doc.ents = [Span(doc, start, end, label='PRODUCT') 
                 for match_id, start, end in matcher(doc)]
    return doc

#Getter that looks up the span text in the dictionary of product canonical
get_canonical = lambda span: CANONICALS.get(span.text)

#Register the Span extension attribute 'canonical' with the getter get_canonical
Span.set_extension('canonical', getter=get_canonical, force=True)

doc = nlp("Red Hat Enterprise Linux is the flagship product of Red Hat Inc"
         )

print([(ent.text, ent.label_, ent._.canonical)for ent in doc.ents])

[('Red Hat Enterprise Linux', 'ORG', 'rhel'), ('Red Hat Inc', 'ORG', None)]
