In [1]:
import spacy

In [2]:
# Load the 'en_core_web_sm' pipeline. The resulting `nlp` object is used below
# both to supply the Matcher's vocab and to turn raw text into docs.
nlp = spacy.load('en_core_web_sm')

In [3]:
from spacy.matcher import Matcher

In [4]:
# Build a Matcher on the pipeline's vocab; the same vocab is used later to
# turn match ids back into their rule names.
matcher = Matcher(nlp.vocab)

In [5]:
# Three token patterns, one per spelling of "solar power". Every dict describes
# a single token; 'LOWER' is compared against the lower-cased token text.

# One token: "SolarPower" / "solarpower"
pattern1 = [
    {'LOWER': 'solarpower'},
]
# Three tokens with exactly one punctuation token in the middle: "Solar-power"
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]
# Two adjacent tokens: "Solar power"
pattern3 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

In [6]:
# Register all three patterns under one rule name, so a hit on any of them is
# reported as 'SolarPower'. The patterns are passed as a single list: this is
# the signature spaCy v3 requires (and spaCy 2.2.2+ also accepts), and it
# leaves the optional on_match callback at its default of None.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])

In [7]:
# Sample text containing all three spellings: "Solar Power", "solarpower"
# and "Solar-power".
doc = nlp(u'The Solar Power industry continues to grow as solarpower increases. Solar-power is amazing!')

In [8]:
# Run the matcher over the doc; the result is iterated below as
# (match_id, start, end) tuples.
matches = matcher(doc)

In [9]:
# Show the raw match tuples; the numeric ids are resolved to rule names in the
# next cell.
print(matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [10]:
# Report each match: numeric id, rule name, token offsets and the matched text.
for match in matches:
    match_id, start, end = match
    span = doc[start:end]  # slice of the doc covered by this match
    string_id = nlp.vocab.strings[match_id]  # rule name behind the numeric id
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [11]:
# Remove the patterns registered under 'SolarPower' so the same rule name can
# be reused for the revised patterns defined next.
matcher.remove('SolarPower')

In [12]:
# One token: "Solarpower" / "solarpower"
pattern4 = [
    {'LOWER': 'solarpower'},
]
# "solar", then any run of punctuation tokens, then "power". 'OP': '*' means
# zero or more, so this covers "solar-power" and "solar--power" as well as the
# plain "solar power" form. Which separators actually match depends on how the
# tokenizer splits the text into tokens.
pattern5 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

In [13]:
# Register the revised patterns under the 'SolarPower' rule name. The patterns
# are passed as a single list: this is the signature spaCy v3 requires (and
# spaCy 2.2.2+ also accepts), and it leaves the optional on_match callback at
# its default of None.
matcher.add('SolarPower', [pattern4, pattern5])

In [14]:
# Sample text with a double-hyphen spelling and the single-word spelling.
doc2 = nlp(u'Solar--power is solarpower yay!')

In [15]:
# Run the matcher over doc2. This rebinds `matches`, replacing the results
# obtained earlier for `doc`.
matches = matcher(doc2)

In [16]:
# Show the raw match tuples for doc2.
print(matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 4, 5)]


In [17]:
# Report each match in doc2: numeric id, rule name, token offsets and text.
for match in matches:
    match_id, start, end = match
    span = doc2[start:end]  # slice of doc2 covered by this match
    string_id = nlp.vocab.strings[match_id]  # rule name behind the numeric id
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 0 3 Solar--power
8656102463236116519 SolarPower 4 5 solarpower
