<p style="font-family:Roboto; font-size: 28px; color: magenta"> Python for NLP: Vocabulary and Phrase Matching with SpaCy</p>

In [18]:
# NOTE(review): this cell contains only a whitespace-only string literal, which the
# notebook echoes back as the cell output; it looks like a leftover placeholder.
'''
  
'''

'\n  \n'

<p style="font-family:Roboto; font-size: 22px; color: orange; text-decoration-line: overline; "> Part: _Rule-Based Matching</p>

In [19]:
import spacy
# Load the 'en_core_web_sm' pipeline once; `nlp` is reused by the cells below to
# build the matchers (via nlp.vocab) and to parse the example texts.
nlp = spacy.load('en_core_web_sm')

<p style="font-family:Roboto; font-size: 22px; color: orange; text-decoration-line: overline; "> Part: _Creating Matcher Object</p>

In [20]:
from spacy.matcher import Matcher
# Rule-based token matcher that shares the vocabulary of the loaded pipeline.
m_tool = Matcher(nlp.vocab)

<p style="font-family:Roboto; font-size: 22px; color: orange; text-decoration-line: overline; "> Part: _Defining Patterns</p>

In [21]:
# Four token patterns, one per spelling variant of "quick brown fox".
# Each dict describes ONE token: LOWER compares against the token's lower-cased
# text, IS_PUNCT matches any punctuation token.

# "quickbrownfox" written as a single token
p1 = [{'LOWER': 'quickbrownfox'}]

# "quick-brown-fox": the three words with one punctuation token between each pair
p2 = [
    {'LOWER': 'quick'},
    {'IS_PUNCT': True},
    {'LOWER': 'brown'},
    {'IS_PUNCT': True},
    {'LOWER': 'fox'},
]

# "quick brown fox": three consecutive word tokens
p3 = [{'LOWER': word} for word in ('quick', 'brown', 'fox')]

# "quick brownfox": two consecutive word tokens
p4 = [{'LOWER': 'quick'}, {'LOWER': 'brownfox'}]

In [22]:
# Register all four patterns under the single match key 'NAME'.
m_tool.add('NAME', [p1, p2, p3, p4])

<p style="font-family:Roboto; font-size: 22px; color: orange; text-decoration-line: overline; "> Part: _Applying Matcher to the Document</p>

In [23]:
# The trailing backslash continues the string literal onto the next line, so that
# line's leading spaces are part of the parsed text.
# NOTE(review): the token offsets recorded in the outputs below appear to count
# that whitespace run as a token — keep the literal unchanged if those outputs
# should stay reproducible.
sentence = nlp(u'The quick-brown-fox jumps over the lazy dog. The quick brown fox eats well. \
               the quickbrownfox is dead. the dog misses the quick brownfox')

In [24]:
# Calling the matcher on the parsed document returns one (match_id, start, end)
# tuple per hit: the id of the matched key plus the start and end token
# positions of the match inside the document.
phrase_matches = m_tool(sentence)
print(phrase_matches)

[(424143666773229379, 1, 6), (424143666773229379, 13, 16), (424143666773229379, 21, 22), (424143666773229379, 29, 31)]


In [25]:
# Show every hit in readable form: hash id, key name, token offsets, matched text.
for match_id, start, end in phrase_matches:
    # The string store maps the numeric match id back to the key it was added under.
    string_id = nlp.vocab.strings[match_id]
    span = sentence[start:end]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

424143666773229379 NAME 1 6 quick-brown-fox
424143666773229379 NAME 13 16 quick brown fox
424143666773229379 NAME 21 22 quickbrownfox
424143666773229379 NAME 29 31 quick brownfox


<p style="font-family:Roboto; font-size: 22px; color: orange; text-decoration-line: overline; "> Part: _More Options for Rule-Based Matching</p>

In [26]:
# Drop the patterns registered under 'NAME' so the same matcher can be reused
# with the new rule added in the next cell.
m_tool.remove('NAME')

In [27]:
# One pattern for "quick brown fox" with optional separators: 'OP': '*' lets each
# punctuation slot match zero or more punctuation tokens.
p1 = [
    {'LOWER': 'quick'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'brown'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'fox'},
]
m_tool.add('NAME_OF_PATTERN', [p1])

In [28]:
# Example text with a varying number of hyphens between the words, used to
# exercise the wildcard pattern. This rebinds `sentence` from the earlier example.
sentence = nlp(u'The quick--brown--fox jumps over the  quick-brown---fox')

In [29]:
# Re-run the matcher on the new text and list each hit with its key name and text.
phrase_matches = m_tool(sentence)

for match_id, start, end in phrase_matches:
    string_id = nlp.vocab.strings[match_id]  # numeric id -> 'NAME_OF_PATTERN'
    span = sentence[start:end]
    print(*(match_id, string_id, start, end, span.text))

11675916215132512264 NAME_OF_PATTERN 1 6 quick--brown--fox
11675916215132512264 NAME_OF_PATTERN 10 15 quick-brown---fox


<p style="font-family:Roboto; font-size: 26px; color: magenta; text-decoration-line: overline; "> _Phrase-Based Matching</p>

In [30]:
# Download the Wikipedia article used for phrase matching and reduce it to
# plain lower-case text containing only letters and single spaces.
import bs4 as bs
import urllib.request
import re
import nltk

# Use the response as a context manager so the HTTP connection is always closed,
# even if reading the body fails.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Artificial_intelligence') as scrapped_data:
    article = scrapped_data.read()

parsed_article = bs.BeautifulSoup(article, 'lxml')

# Only the <p> elements are kept; their text is concatenated with no separator.
paragraphs = parsed_article.find_all('p')

# join() builds the text in one pass instead of repeated string concatenation.
article_text = "".join(paragraph.text for paragraph in paragraphs)

processed_article = article_text.lower()
# Replace every character that is not an ASCII letter with a space ...
processed_article = re.sub(r'[^a-zA-Z]', ' ', processed_article)
# ... then collapse each run of whitespace into a single space.
processed_article = re.sub(r'\s+', ' ', processed_article)


<p style="font-family:Roboto; font-size: 22px; color: orange; text-decoration-line: overline; "> Part: _Create Phrase Matcher Object</p>

In [31]:
from spacy.matcher import PhraseMatcher
# Phrase matcher sharing the vocabulary of the loaded pipeline; patterns are
# added to it in a later cell.
phrase_matcher = PhraseMatcher(nlp.vocab)

<p style="font-family:Roboto; font-size: 22px; color: orange; text-decoration-line: overline; "> Part: _Create Phrase List</p>

In [32]:
# Phrases to search for in the article.
phrases = ['machine learning', 'robots', 'intelligent agents']

# PhraseMatcher patterns are Doc objects. nlp.make_doc() only runs the tokenizer,
# which is all the default (exact token text) matching needs, so it avoids running
# the full pipeline on every phrase.
patterns = [nlp.make_doc(text) for text in phrases]

In [34]:
# Register the phrase patterns under the match key 'AI'.
# The patterns are passed as a single list, the same call form used for
# Matcher.add() earlier in this notebook, instead of the deprecated
# add(key, None, *docs) signature.
phrase_matcher.add('AI', patterns)

<p style="font-family:Roboto; font-size: 22px; color: orange; text-decoration-line: overline; "> Part: _Applying Matcher to the Document</p>

In [36]:
# Parse the cleaned article text, then collect every (match_id, start, end) hit
# for the phrases registered under 'AI'. This rebinds `sentence` to the article.
sentence = nlp(processed_article)
matched_phrases = phrase_matcher(sentence)

# Bare last expression: the notebook displays the full list of hits.
matched_phrases

[(5530044837203964789, 1150, 1152),
 (5530044837203964789, 1183, 1185),
 (5530044837203964789, 1295, 1297),
 (5530044837203964789, 2555, 2557),
 (5530044837203964789, 3264, 3266),
 (5530044837203964789, 3348, 3350),
 (5530044837203964789, 3600, 3602),
 (5530044837203964789, 4642, 4644),
 (5530044837203964789, 4892, 4893),
 (5530044837203964789, 5111, 5113),
 (5530044837203964789, 5440, 5441),
 (5530044837203964789, 6604, 6606),
 (5530044837203964789, 6975, 6977),
 (5530044837203964789, 7013, 7015),
 (5530044837203964789, 7045, 7047),
 (5530044837203964789, 7378, 7380),
 (5530044837203964789, 7436, 7438),
 (5530044837203964789, 7854, 7855),
 (5530044837203964789, 7878, 7880),
 (5530044837203964789, 7987, 7989),
 (5530044837203964789, 8065, 8066),
 (5530044837203964789, 10579, 10581),
 (5530044837203964789, 10606, 10608),
 (5530044837203964789, 11273, 11275),
 (5530044837203964789, 11300, 11302),
 (5530044837203964789, 12353, 12354),
 (5530044837203964789, 12694, 12695)]

In [37]:
# Print each hit together with the key name and the matched phrase text.
for match_id, start, end in matched_phrases:
    string_id = nlp.vocab.strings[match_id]  # numeric id -> 'AI'
    span = sentence[start:end]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

5530044837203964789 AI 1150 1152 machine learning
5530044837203964789 AI 1183 1185 machine learning
5530044837203964789 AI 1295 1297 machine learning
5530044837203964789 AI 2555 2557 machine learning
5530044837203964789 AI 3264 3266 machine learning
5530044837203964789 AI 3348 3350 machine learning
5530044837203964789 AI 3600 3602 machine learning
5530044837203964789 AI 4642 4644 machine learning
5530044837203964789 AI 4892 4893 robots
5530044837203964789 AI 5111 5113 machine learning
5530044837203964789 AI 5440 5441 robots
5530044837203964789 AI 6604 6606 machine learning
5530044837203964789 AI 6975 6977 machine learning
5530044837203964789 AI 7013 7015 machine learning
5530044837203964789 AI 7045 7047 machine learning
5530044837203964789 AI 7378 7380 machine learning
5530044837203964789 AI 7436 7438 machine learning
5530044837203964789 AI 7854 7855 robots
5530044837203964789 AI 7878 7880 machine learning
5530044837203964789 AI 7987 7989 machine learning
5530044837203964789 AI 8065 80

<p style="font-family:Roboto; font-size: 26px; color: magenta; text-decoration-line: overline; "> _Stop Words</p>

In [39]:
# Stop words are very common English words such as "the", "a", "an" that carry
# little meaning on their own. spaCy ships a default stop-word set for the language
# (326 entries according to the original note — the count may differ between
# spaCy versions; TODO confirm with len(sp.Defaults.stop_words)).
sp = spacy.load('en_core_web_sm')
print(sp.Defaults.stop_words)

{'for', 'neither', 'rather', 'you', 'whereby', 'her', 'out', 'regarding', 'after', 'under', 'fifty', 'always', 'whereas', 'them', 'both', 'well', 'or', 'ca', 'please', 'except', 'empty', 'below', 'my', 'seemed', 'these', 'together', 'have', 'now', 'alone', 'name', 'whence', 'am', 'before', 'here', 'hence', 'whoever', 'within', 'forty', 'off', 'he', 'everywhere', 'becoming', 'take', 'although', 'each', 'see', 'noone', 'n‘t', "'m", 'six', 'but', 'we', 'nothing', "'re", 'to', 'throughout', 'which', 'done', 'put', 'back', 'amongst', '’re', 'did', 'than', 'hereafter', 'there', '‘re', 'a', 'call', 'by', 'other', 'yet', 'afterwards', 'about', "'d", 'became', 'all', 'when', 'anyway', 'was', 'above', 'then', 'does', 'twelve', 'of', 'also', 'nine', 'through', 'quite', 'last', 'it', 'others', 'onto', 'whither', 'twenty', 'something', 'few', 'full', 'unless', 'former', 'much', 'nevertheless', 'else', 'third', 'what', 'where', '’ve', 'among', 'somehow', 'seems', 'may', 'how', 'why', 'do', 'are', 's

In [41]:
# "wonder" is not one of spaCy's default stop words, so this evaluates to False.
sp.vocab['wonder'].is_stop

False

In [42]:
# Stop words are added with sp.Defaults.stop_words.add() and removed with
# sp.Defaults.stop_words.remove().
sp.Defaults.stop_words.add('wonder')

# The lexeme for 'wonder' was already created when it was looked up in the
# previous cell, and its is_stop flag is stored on the lexeme itself. Updating the
# default set alone is therefore not guaranteed to change that flag, so set it
# explicitly to make the check below give True on a fresh Restart & Run All.
sp.vocab['wonder'].is_stop = True

In [48]:
# Re-check the stop-word flag of 'wonder' after it was added to the stop-word set.
sp.vocab['wonder'].is_stop

True