## Phrase matcher with Entity Ruler

In [3]:
# imports
import os

import numpy as np
import pandas
import spacy
from pandas import DataFrame
from py2neo import Graph,Subgraph,Node,Relationship,cypher,data
from spacy import displacy
from spacy.kb import KnowledgeBase
from spacy.matcher import Matcher, PhraseMatcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Token, Span

# Graph load
# Connection settings can be overridden through environment variables so that
# real credentials do not have to be stored in the notebook. The fallbacks are
# the original local-development values.
graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "graph"),
)

# Model load, NLP object init, vocab init
nlp = spacy.load("en_core_web_sm")
vocab = nlp.vocab

# Phrase matcher init (matches on the lower-cased token text)
matcher = PhraseMatcher(vocab, attr="LOWER")

## Entity Ruler Creation

In [4]:
from spacy.pipeline import EntityRuler

# entity ruler creation
# The model is loaded again here, so `nlp` now refers to a new pipeline object
# and the ruler is built against that one.
nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp)

# match all classes
cursor = graph.run("match (class:Class) return class.name ")
df = DataFrame(cursor)

# creates a list of class names; an empty query result produces a frame with
# no column 0, so fall back to an empty list instead of raising KeyError
classList = [] if df.empty else list(df[0])
patterns = []

In [5]:
# Build one EntityRuler pattern per class from the Token nodes attached to it.
for tokenClass in classList:
    # The class name is sent as a Cypher parameter rather than concatenated
    # into the query text, so names containing quotes cannot break or alter
    # the query.
    cursor = graph.run(
        "match (t:Token)-[:INSTANCE_OF]->(c:Class {name:$name}) return t.name",
        name=tokenClass,
    )
    df = DataFrame(cursor)
    if df.empty:
        # A class with no tokens yields a frame without column 0.
        continue

    # Lower-case the names to match the LOWER token attribute used below;
    # non-string values (e.g. a missing name property) are skipped.
    tokenList = [t.lower() for t in df[0] if isinstance(t, str)]
    if not tokenList:
        continue

    patterns.append({"label":tokenClass,"pattern":[{"LOWER":{"IN":tokenList}}]})

ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

## Functions

### EntFinder

`entFinder` finds a missing entity in the graph, given a sentence that contains a known entity and a known relationship.

In [12]:
def entFinder(doc, entity):
    """Print the graph node related to ``entity`` through the sentence's root.

    The tokens are indexed by dependency label to pick out the subject
    (``nsubj``), the root (``ROOT``) and the direct object (``dobj``). When all
    three are present, the graph is queried for nodes connected to the
    ``Class`` node named ``entity.label_`` by a relationship whose type is the
    upper-cased root lemma. The first returned name is printed next to the
    direct object.

    Args:
        doc: Iterable of tokens exposing ``dep_``, ``lemma_`` and ``text``.
        entity: Span-like object exposing ``root``, ``text`` and ``label_``.

    Returns:
        None. Results are written to standard output; nothing beyond the
        triple is printed when the graph has no match.
    """
    # Index tokens by dependency label; for repeated labels the last token wins.
    deps = {entity.root.dep_: entity.root}
    for token in doc:
        deps[token.dep_] = token

    nsubj = deps.get("nsubj")
    dobj = deps.get("dobj")
    root = deps.get("ROOT")

    if nsubj is None or dobj is None or root is None:
        return

    print(nsubj, root, dobj)
    print("\n")

    # Cypher cannot take a relationship type as a parameter, so the type is
    # still interpolated; the class name is passed as a real parameter.
    rel_type = root.lemma_.upper().replace(" ", "_")
    cursor = graph.run(
        "match(s:Class {name:$name})<-[r:" + rel_type + "]->(e) return e.name",
        name=entity.label_,
    )

    # Materialise the result before indexing: the old `cursor.next != None`
    # test compared a bound method to None, so it was always true and an empty
    # result crashed on `df.iloc[0, 0]`.
    df = DataFrame(cursor)
    if df.empty:
        return

    print(f"{entity.text:<10}{entity.label_:<10}")
    print(f"{dobj.text:<10}{df.iloc[0,0]}")
    print("\n")

### Test

In [59]:
# Parse a sample sentence and highlight the entities recognised in it.
sample_sentence = "the malware is using SSN stolen from new york citizen"
doc = nlp(sample_sentence)
displacy.render(doc, style="ent")

In [58]:
# Run the lookup for the first recognised entity; `doc.ents` is empty when
# nothing in the sentence was recognised, and indexing it would raise IndexError.
if doc.ents:
    entFinder(doc, doc.ents[0])
else:
    print("No entities found in the sentence.")

malware using ssn


malware   Technology
ssn       Information


