# CUSTOM ATTRIBUTES

### imports

In [1]:
# import matcher
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Token, Span
from spacy.pipeline import EntityRuler

from py2neo import Graph,Subgraph,Node,Relationship,cypher,data
from pandas import DataFrame

# Connect to Neo4j over Bolt. The URI and credentials can be overridden with the
# NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD environment variables so real secrets do
# not have to live in the notebook; the fallbacks are the original local values.
import os

graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "graph"),
)

# Matcher Example

In [2]:
# Build the English pipeline and a token-level matcher sharing its vocabulary
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Parse a sample sentence
doc = nlp("that smart kid is using a kindle")

# One row per token: text, part-of-speech tag, dependency label (left-aligned columns)
for tok in doc:
    print(f"{tok.text:<12}{tok.pos_:<10}{tok.dep_:<10}")

print("\n")

# A noun, optionally preceded by a single adjective
pattern = [{"POS": "ADJ", "OP": "?"}, {"POS": "NOUN"}]

# Register the pattern under a rule name (no on_match callback) and run it on the doc
matcher.add("ADJ_NOUN_PATTERN", None, pattern)
matches = matcher(doc)
print("Total matches found:", len(matches))

# Every match is a (match_id, start, end) triple; show the text of the matched slice
for _match_id, first, last in matches:
    matched_span = doc[first:last]
    print("Match found:", matched_span.text)
    


that        SCONJ     det       
smart       ADJ       amod      
kid         NOUN      nsubj     
is          AUX       aux       
using       VERB      ROOT      
a           DET       det       
kindle      NOUN      dobj      


Total matches found: 3
Match found: smart kid
Match found: kid
Match found: kindle


# Custom Attribute Example

In [3]:
from spacy.tokens import Token


def fruit_getter(token):
    """Return True when the token's text is exactly "apple", "pear", or "banana"."""
    return token.text in ("apple", "pear", "banana")


# Expose the getter as token._.is_fruit; force=True overwrites an existing extension
Token.set_extension("is_fruit", getter=fruit_getter, force=True)
doc = nlp("I have an apple")

# The boolean goes through a width format spec, so it is printed as 1 / 0
for tok in doc:
    print(f"{tok.text:<12}{tok.pos_:<10}{tok.dep_:<10}{tok._.is_fruit:<10}")


I           PRON      nsubj     0         
have        AUX       ROOT      0         
an          DET       det       0         
apple       NOUN      dobj      1         


# My Custom Attribute: okg_class 

### Graph class Lookup: getter function

In [4]:
def okg_class_lookup(token):
    """Look up the knowledge-graph class of a token.

    The token text is capitalized and used as the ``name`` of a ``Token`` node; the
    query follows its ``INSTANCE_OF`` relationship to a ``Class`` node.

    Args:
        token: Object exposing a ``text`` string (e.g. a spaCy token).

    Returns:
        str: ``name`` of the first class returned by the query, or ``"not found"``
        when the query yields no rows.
    """
    name = token.text.capitalize()
    # Send the name as a query parameter instead of splicing it into the Cypher text:
    # tokens such as "n't" or "'s" contain a quote, which would otherwise break the
    # statement (or change its meaning).
    cursor = graph.run(
        "MATCH (token:Token {name: $name})-[r:INSTANCE_OF]->(c:Class) RETURN c.name",
        {"name": name},
    )
    df = DataFrame(cursor)
    if df.empty:
        return "not found"
    # Only the first row / first column is used, even if several classes match.
    return str(df.iloc[0, 0])
    

### okg_class test

In [5]:
from spacy.tokens import Token


def okg_class(token):
    """Getter for the ``okg_class`` token extension; defers to ``okg_class_lookup``."""
    return okg_class_lookup(token)


# Expose the lookup as token._.okg_class; force=True overwrites an existing extension
Token.set_extension("okg_class", getter=okg_class, force=True)
doc = nlp("I have a missile, we have a warfare")

# Reading tok._.okg_class invokes the getter for that token
for tok in doc:
    print(f"{tok.text:<12}{tok.pos_:<10}{tok.dep_:<10}{tok._.okg_class}")

I           PRON      nsubj     not found
have        AUX       ccomp     not found
a           DET       det       not found
missile     NOUN      dobj      Technology
,           PUNCT     punct     not found
we          PRON      nsubj     not found
have        AUX       ROOT      not found
a           DET       det       not found
warfare     NOUN      dobj      Application


# PhraseMatcher

## PhraseMatcher usage example

In [6]:
def newLine():
    """Write two newline characters to standard output (a visual gap)."""
    print(end="\n\n")

In [7]:
# from spacy docs
from spacy.matcher import PhraseMatcher

# Parse the text that will be searched
doc = nlp("Barack Obama lifts America one last time in emotional farewell")

# Phrase matcher sharing the pipeline's vocabulary (rebinds `matcher`)
matcher = PhraseMatcher(nlp.vocab)

# Register the phrase under the rule name "OBAMA", with no on_match callback
matcher.add("OBAMA", None, nlp("Barack Obama"))

# Run the matcher; every match is a (match_id, start, end) triple
matches = matcher(doc)

for rule_hash, first, last in matches:
    rule_id = nlp.vocab.strings[rule_hash]  # rule name as a string, here 'OBAMA'
    span = doc[first:last]  # slice of the doc covered by the match
    print(rule_id, span.text)

OBAMA Barack Obama


## Nicola's Matcher

## Original (kept for reference; fails because `load_model` is not defined in this notebook)

In [8]:
# NOTE(review): load_model is not defined or imported in the visible notebook, so this
# cell stops on the next line with a NameError (see the recorded output below).
nlp = load_model()
matcher = PhraseMatcher(nlp.vocab)

# Read the terminology list: lines that are only a newline are skipped and every
# newline character is removed from the remaining lines.
with open('terms.txt', 'r') as f:
    terms = f.readlines()
    terms = [t.replace('\n','') for t in terms if t != '\n']

def add_phraseMatcher_ent(matcher, doc, i, matches):
    """on_match callback: append the i-th match to doc.ents as a "SOFT SKILL" span.

    Args:
        matcher: The matcher that produced the match (not used here).
        doc: The document being matched; its ``ents`` tuple is extended.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` triples.
    """
    # Get the current match and create tuple of entity label, start and end.
    # Append entity to the doc's entity. (Don't overwrite doc.ents!)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label = "SOFT SKILL")
    try: 
        doc.ents += (entity,)
    except Exception:
        # Any failure while extending doc.ents is silently ignored.
        pass

# Tokenize each term (make_doc) and register them all under a single rule name,
# using the callback above as the on_match handler.
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", add_phraseMatcher_ent, *patterns)


NameError: name 'load_model' is not defined

## Modified

In [9]:
# import matcher
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Token, Span
from spacy.pipeline import EntityRuler

from py2neo import Graph,Subgraph,Node,Relationship,cypher,data
from pandas import DataFrame

# Connect to Neo4j over Bolt. The URI and credentials can be overridden with the
# NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD environment variables so real secrets do
# not have to live in the notebook; the fallbacks are the original local values.
import os

graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=os.environ.get("NEO4J_PASSWORD", "graph"),
)

# Load model, create NLP object
nlp = spacy.load("en_core_web_sm")

# Fresh, empty phrase matcher sharing the pipeline's vocabulary
matcher = PhraseMatcher(nlp.vocab)

In [10]:
# on_match callback for the phrase matcher
def add_phraseMatcher_ent(matcher, doc, i, matches):
    """Append the i-th match to ``doc.ents``, labelled with the rule that matched.

    Args:
        matcher: The matcher that produced the match (not used here).
        doc: The document being matched; its ``ents`` tuple is extended.
        i: Index of the current match inside ``matches``.
        matches: Sequence of ``(match_id, start, end)`` triples.
    """
    match_id, start, end = matches[i]
    # The rules are registered under the class name, and match_id identifies the rule,
    # so use it as the entity label instead of the placeholder "SOFT SKILL".
    entity = Span(doc, start, end, label=match_id)
    try:
        doc.ents += (entity,)
    except ValueError:
        # Best effort: spaCy raises ValueError when the new span conflicts with an
        # entity already set on the doc; keep the existing entities in that case.
        # Other exception types are no longer hidden.
        pass


## Phrase Matcher

In [11]:
# support data structures
cursor = graph.run("match (class:Class) return class.name ")
df = DataFrame(cursor)
classList = list(df[0])

# iterate through classes
for tokenClass in classList:
    cursor = graph.run("match (t:Token)-[:INSTANCE_OF]->(c:Class {name:'"+tokenClass+"'}) return t.name")
    df = DataFrame(cursor)
    tokenList = list(df[0])
    
    # add matcher rule for each token in the class
    patterns = [nlp.make_doc(text) for text in tokenList]
    matcher.add(tokenClass, add_phraseMatcher_ent, *patterns)
    
    # add lowerCase
    patterns = [nlp.make_doc(str.lower(text)) for text in tokenList]
    matcher.add(tokenClass, add_phraseMatcher_ent, *patterns)
    

In [12]:
# load nlp object & matcher
doc = nlp("I have a Missile, there is a warfare in New York, last night I heard about allocation of decision rights")

matches = matcher(doc)

for match_id, start, end in matches:
    rule_id = nlp.vocab.strings[match_id]
    span = doc[start : end]
    print(f"{span.text:<33}{rule_id:<10}")

Missile                          Technology
warfare                          Application
allocation of decision rights    Approach  
