In [2]:
import spacy
import re
from spacy.lang.en import English 
from spacy.tokens import Doc

In [28]:
# Find a metabolite name with a plain regular expression over the raw text,
# then map the character offsets of each hit back onto spaCy tokens.
nlp = spacy.load("en_core_web_sm")
# Longer sample sentence from an earlier run, kept for reference:
#doc = nlp("Uridine 5' - diphosphate ( UDP ) and uridINe 5' - dipHOSphaTe - glucose dehydrogenase ( UGD ) produces UDP - glucuronic acid from UDP - glucose as a precursor of plant cell wall polysaccharides.")
doc = nlp("Uridine 5' - diphosphate and other things.")

# Earlier regex experiment, kept for reference:
#expression = r"[Uu](nited|\\.?) ?[Ss](tates|\\.?)"
expression = r"[Uu]ridine 5'\s*\-\s*diphosphate"
for match in re.compile(expression).finditer(doc.text):
    start, end = match.start(), match.end()
    span = doc.char_span(start, end)
    # char_span gives back None when the offsets do not map onto a valid
    # token sequence; only report hits that did.
    if span is None:
        continue
    print("Found match:", span.text)

Found match: Uridine 5' - diphosphate


In [29]:
# Character offsets of the last regex hit. `match` is the loop variable left
# over from the re.finditer loop, so this fails if that loop never ran.
match.span()

(0, 24)

In [30]:
# `span` is whatever doc.char_span(start, end) returned on the last loop
# iteration of the regex cell (a Span, or None if the offsets did not align).
span

Uridine 5' - diphosphate

In [33]:
# Text of the span left over from the regex cell; raises AttributeError if
# `span` ended up as None there.
span.text

"Uridine 5' - diphosphate"

In [35]:
# Show how the en_core_web_sm pipeline tokenizes the spaced metabolite name,
# one token per output line.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Uridine 5' - diphosphate")
print("\n".join(token.text for token in doc))

Uridine
5
'
-
diphosphate


In [49]:
# Same tokenization probe on a long IUPAC-style name that mixes spaced and
# unspaced hyphens and parentheses; prints one token per output line.
nlp = spacy.load("en_core_web_sm")
doc = nlp("4 - Amino - 1 - (2R,4S,5R) - 4 - hydroxy-5- (hydroxymethyl)tetrahydrofuran-2-ylpyrimidin-2(1H)-one")
print("\n".join(token.text for token in doc))

4
-
Amino
-
1
-
(
2R,4S,5R
)
-
4
-
hydroxy-5-
(
hydroxymethyl)tetrahydrofuran-2
-
ylpyrimidin-2(1H)-one


In [5]:
# Rule-based NER experiment: a blank English pipeline whose only component is
# an EntityRuler fed with hand-written metabolite patterns.
nlp = English()
# NOTE(review): the original remark on this line read "can only work for
# non-token pattern"; its intent is unclear here because both token patterns
# and a string pattern are registered below.
ruler = nlp.add_pipe("entity_ruler")

# Earlier pattern attempts, kept for reference:
#{"label": "test", "pattern": "United Kingdom", "id": "loc_test1"},
#{"label": "test", "pattern": {"FUZZY": "nited"}, "id": "loc_test1-2"},
#{"label": "test", "pattern": {"REGEX": r"ited"}, "id": "loc_test2"},
#{"label": "Metabolites", "pattern": "Uridine 5'-diphosphate", "id": "Uridine 5'-diphosphate_nospace_detection"},
#{"label": "Metabolites", "pattern": "Uridine 5' - diphosphate", "id": "Uridine 5'-diphosphate"},
#{"label": "Metabolites", "pattern": {"REGEX": r"uridine"}, "id": "Uridine 5'-diphosphate phrase"},
#{"label": "Metabolites", "pattern": [{"TEXT": {"REGEX": "uridine 5'\s*-\s*diphosphate"}}], "id": "Uridine 5'-diphosphate1"},
#{"label": "Metabolites", "pattern": [{"TEXT": {"REGEX": r"[Uu]ridine"}}, {"LOWER": {"REGEX": r"5"}}, {"LOWER": {"REGEX": r"\'"}}, {"LOWER": {"REGEX": r"\-"}}, {"LOWER": {"REGEX": r"diphosphate"}}], "id": "Uridine 5'-diphosphate1"},
metabolites_patterns = [
    # Token pattern: "uridine", "5", "'", any run of punctuation tokens
    # (including none), then "diphosphate"; LOWER makes it case-insensitive.
    {
        "label": "Metabolites",
        "pattern": [
            {"LOWER": "uridine"},
            {"LOWER": "5"},
            {"LOWER": "'"},
            {"IS_PUNCT": True, "OP": "*"},
            {"LOWER": "diphosphate"},
        ],
        "id": "Uridine 5'-diphosphate2",
    },
    # Token pattern: a bare "diphosphate", optionally preceded by one
    # punctuation token.
    {
        "label": "Metabolites",
        "pattern": [
            {"IS_PUNCT": True, "OP": "?"},
            {"LOWER": "diphosphate"},
        ],
        "id": "-diphosphate small",
    },
    # String (phrase) pattern.
    {
        "label": "Metabolites",
        "pattern": "just add) Uridine 5' - diphosphate",
        "id": "Uridine 5'-diphosphate",
    },
]
ruler.add_patterns(metabolites_patterns)

# Sample abstracts; the second one deliberately contains odd casing and
# extra punctuation between "5'" and "diphosphate".
abstracts = [
    "London is the United Kingdom capital of the United Kingdom.",
    "- dipHOsphate Uridine 5' - - diphosphate Uridine 5' - , . , diphosphate  ( UDP ) and uridINe 5' - dipHOSphaTe - glucose dehydrogenase ( UGD ) produces UDP - glucuronic acid from UDP - glucose as a precursor of plant cell wall polysaccharides.",
    "In the presence of inorganic phosphate, uridine 5'-diphosphate glucose (UDPG) is specifically hydrolyzed to glucose 1-phosphate and UDP by a unique enzyme, UDPG phosphorylase.",
]

# Process each abstract, merge the results into one Doc, then run the
# pipeline over the merged Doc.
# NOTE(review): the per-abstract docs were already processed by nlp.pipe, so
# this second pass over the merged Doc is presumably redundant — confirm
# before removing.
docs = list(nlp.pipe(abstracts))
c_doc = Doc.from_docs(docs)
doc = nlp(c_doc)

# Report every entity with its label, pattern id and character offsets in the
# merged Doc.
# Alternative for span groups:
# print([(span.text, span.label_) for span in doc.spans["ruler"]]) # for spans
print([
    (ent.text, ent.label_, ent.ent_id_, ent.start_char, ent.end_char)
    for ent in doc.ents
])

[('- dipHOsphate', 'Metabolites', '-diphosphate small', 60, 73), ("Uridine 5' - - diphosphate", 'Metabolites', "Uridine 5'-diphosphate2", 74, 100), ("Uridine 5' - , . , diphosphate", 'Metabolites', "Uridine 5'-diphosphate2", 101, 131), ("uridINe 5' - dipHOSphaTe", 'Metabolites', "Uridine 5'-diphosphate2", 145, 169)]


In [14]:
# Rule-based NER experiment, second variant: blank English pipeline with an
# EntityRuler configured with phrase_matcher_attr="LOWER".
nlp = English()
# NOTE(review): the original remark here read "can only work for non-token
# pattern". The only active pattern below is a token pattern, so the
# phrase_matcher_attr setting presumably has no effect on it — confirm.
ruler = nlp.add_pipe("entity_ruler", config={"phrase_matcher_attr": "LOWER"})

# Earlier pattern attempts, kept for reference:
#{"label": "test", "pattern": "United Kingdom", "id": "loc_test1"},
#{"label": "test", "pattern": {"FUZZY": "nited"}, "id": "loc_test1-2"},
#{"label": "test", "pattern": {"REGEX": r"ited"}, "id": "loc_test2"},
#{"label": "Metabolites", "pattern": "Uridine 5'-diphosphate", "id": "Uridine 5'-diphosphate_nospace_detection"},
#{"label": "Metabolites", "pattern": "Uridine 5' - diphosphate", "id": "Uridine 5'-diphosphate"},
#{"label": "Metabolites", "pattern": {"REGEX": r"uridine"}, "id": "Uridine 5'-diphosphate phrase"},
#{"label": "Metabolites", "pattern": [{"TEXT": {"REGEX": "uridine 5'\s*-\s*diphosphate"}}], "id": "Uridine 5'-diphosphate1"},
#{"label": "Metabolites", "pattern": [{"TEXT": {"REGEX": r"[Uu]ridine"}}, {"LOWER": {"REGEX": r"5"}}, {"LOWER": {"REGEX": r"\'"}}, {"LOWER": {"REGEX": r"\-"}}, {"LOWER": {"REGEX": r"diphosphate"}}], "id": "Uridine 5'-diphosphate1"},
#{"label": "Metabolites", "pattern": "just add) Uridine 5' - diphosphate", "id": "Uridine 5'-diphosphate"}
metabolites_patterns = [
    # Token pattern: "uridine", "5", "'", "-", "diphosphate", each compared on
    # the lowercased token text.
    {
        "label": "Metabolites",
        "pattern": [
            {"LOWER": "uridine"},
            {"LOWER": "5"},
            {"LOWER": "'"},
            {"LOWER": "-"},
            {"LOWER": "diphosphate"},
        ],
        "id": "Uridine 5'-diphosphate2",
    },
]
ruler.add_patterns(metabolites_patterns)

# Sample abstracts used as matching input.
abstracts = [
    "London is the United Kingdom capital of the United Kingdom.",
    "Uridine 5' - diphosphate ( UDP ) and uridINe 5' - dipHOSphaTe - glucose dehydrogenase ( UGD ) produces UDP - glucuronic acid from UDP - glucose as a precursor of plant cell wall polysaccharides.",
    "In the presence of inorganic phosphate, uridine 5'-diphosphate glucose (UDPG) is specifically hydrolyzed to glucose 1-phosphate and UDP by a unique enzyme, UDPG phosphorylase.",
]

# Process each abstract, merge the results into one Doc, then run the
# pipeline over the merged Doc.
# NOTE(review): the per-abstract docs were already processed by nlp.pipe, so
# this second pass over the merged Doc is presumably redundant — confirm
# before removing.
docs = list(nlp.pipe(abstracts))
c_doc = Doc.from_docs(docs)
doc = nlp(c_doc)

# Report every entity with its label, pattern id and character offsets in the
# merged Doc.
# Alternative for span groups:
# print([(span.text, span.label_) for span in doc.spans["ruler"]]) # for spans
print([
    (ent.text, ent.label_, ent.ent_id_, ent.start_char, ent.end_char)
    for ent in doc.ents
])

[("Uridine 5' - diphosphate", 'Metabolites', "Uridine 5'-diphosphate2", 60, 84), ("uridINe 5' - dipHOSphaTe", 'Metabolites', "Uridine 5'-diphosphate2", 97, 121)]


In [51]:
# Fuzzy-matching demo with spaczz: find misspelled names/places in a text
# using a blank English pipeline (tokenizer only).
import spacy
from spaczz.matcher import FuzzyMatcher

nlp = spacy.blank("en")
text = """Grint Anderson created spaczz in his home at 555 Fake St,
Apt 5 in Nashv1le, TN 55555-1234 in the US."""  # Spelling errors intentional.
doc = nlp(text)

# Register the correctly spelled reference phrases under their labels.
matcher = FuzzyMatcher(nlp.vocab)
matcher.add("NAME", [nlp("Grant Andersen")])
matcher.add("GPE", [nlp("Nashville")])
matches = matcher(doc)

# Every match begins with (match_id, start, end, ratio). Newer spaczz
# releases append further fields to each tuple (e.g. the matched pattern),
# which would break a fixed four-name unpacking, so collect any extras
# instead of assuming exactly four items.
for match_id, start, end, ratio, *_extra in matches:
    print(match_id, doc[start:end], ratio)

NAME Grint Anderson 86
GPE Nashv1le 82
