In [1]:
import re
import math
import string
import nltk
from tqdm import tqdm

import numpy as np
import pandas as pd

import spacy
from spacy import displacy
from spacy.tokens import Span
from spacy.matcher import Matcher

In [2]:
# Let pandas show up to 200 characters per column before truncating cell text.
pd.options.display.max_colwidth = 200

In [3]:
# Load the "en_core_web_sm" spaCy pipeline once; `nlp` parses every example
# sentence in the cells that follow.
nlp = spacy.load(name="en_core_web_sm")

## 1) Entity Extraction

## Initialize Text

#### Pattern: X such as Y

In [4]:
# Example sentence for the "X such as Y" pattern, run through the spaCy pipeline.
text = "GDP in developing countries such as Vietnam will continue growing at a high rate."
doc = nlp(text)

## Analyze Text Syntax 

#### 1) Dependency Parsing

In [5]:
# One line per token: surface text and its dependency label.
for token in doc:
    print(f'{token.text} -> {token.dep_}')

GDP -> nsubj
in -> prep
developing -> amod
countries -> pobj
such -> amod
as -> prep
Vietnam -> pobj
will -> aux
continue -> aux
growing -> ROOT
at -> prep
a -> det
high -> amod
rate -> pobj
. -> punct


#### 2) Part of Speech (POS) Tagging

In [6]:
# One line per token: surface text and its coarse part-of-speech tag.
for token in doc:
    print(token.text, token.pos_, sep=' -> ')

GDP -> PROPN
in -> ADP
developing -> VERB
countries -> NOUN
such -> ADJ
as -> SCONJ
Vietnam -> PROPN
will -> AUX
continue -> VERB
growing -> VERB
at -> ADP
a -> DET
high -> ADJ
rate -> NOUN
. -> PUNCT


## Capture Text By Pattern

In [7]:
# Token pattern for "X such as Y": a NOUN, the literal words "such" and "as"
# (case-insensitive), then a PROPN.
pattern = [
    dict(POS='NOUN'),
    dict(LOWER='such'),
    dict(LOWER='as'),
    dict(POS='PROPN'),
]


In [8]:
# Create a matcher bound to the pipeline's vocabulary and register the pattern.
matcher = Matcher(nlp.vocab)
# Patterns are passed as a list of patterns. The legacy
# add(key, on_match, *patterns) call form is not accepted by spaCy v3;
# the list form works from spaCy v2.2.2 onwards.
matcher.add('matching_1', [pattern])

In [9]:
# Run the matcher; each result is a (match_id, start, end) tuple of token offsets.
matches = matcher(doc)

if matches:
    # Pick the longest match explicitly rather than relying on result order.
    match_id, start, end = max(matches, key=lambda m: m[2] - m[1])
    span = doc[start:end]
    print(span.text)
else:
    # Previously matches[0] raised IndexError when the pattern did not match.
    span = None
    print('No match found.')

countries such as Vietnam


In [10]:
# Refined "X such as Y" pattern: an adjectival modifier (amod) may optionally
# precede the noun ('OP': '?' means zero or one token), so it is captured too.
pattern = [
    dict(DEP='amod', OP='?'),
    dict(POS='NOUN'),
    dict(LOWER='such'),
    dict(LOWER='as'),
    dict(POS='PROPN'),
]

In [11]:
# Start from a fresh matcher so only the refined pattern is registered.
matcher = Matcher(nlp.vocab)
# Patterns go in a list as the second argument. The legacy
# add(key, on_match, *patterns) call form is not accepted by spaCy v3;
# the list form works from spaCy v2.2.2 onwards.
matcher.add('matching_1', [pattern])

In [12]:
# Run the matcher; each result is a (match_id, start, end) tuple of token offsets.
matches = matcher(doc)

if matches:
    # The optional amod token yields both a shorter and a longer candidate;
    # select the longest one explicitly instead of relying on result order.
    match_id, start, end = max(matches, key=lambda m: m[2] - m[1])
    span = doc[start:end]
    print(span.text)
else:
    # Previously matches[0] raised IndexError when the pattern did not match.
    span = None
    print('No match found.')

developing countries such as Vietnam


## Initialize Text

#### Pattern: X and/or Y

In [13]:
# Example sentence for the "X and/or other Y" pattern, run through the spaCy pipeline.
text = "Here is how you can keep your car and other vehicles clean."
doc = nlp(text)

## Analyze Text Syntax

#### 1) Dependency Parsing

In [14]:
# One line per token: surface text and its dependency label.
for token in doc:
    print(f'{token.text} -> {token.dep_}')

Here -> advmod
is -> ROOT
how -> advmod
you -> nsubj
can -> aux
keep -> ccomp
your -> poss
car -> dobj
and -> cc
other -> amod
vehicles -> nsubj
clean -> oprd
. -> punct


#### 2) Part of Speech (POS) Tagging

In [15]:
# One line per token: surface text and its coarse part-of-speech tag.
for token in doc:
    print(token.text, token.pos_, sep=' -> ')

Here -> ADV
is -> AUX
how -> ADV
you -> PRON
can -> AUX
keep -> VERB
your -> PRON
car -> NOUN
and -> CCONJ
other -> ADJ
vehicles -> NOUN
clean -> ADJ
. -> PUNCT


## Capture Text By Pattern

In [16]:
# Token pattern for "X and other Y" / "X or other Y".
# The conjunction is a single required token that must be "and" or "or".
# Listing "and" and "or" as two separately optional tokens also accepted
# no conjunction at all ("car other vehicles") and both at once ("and or").
pattern = [
    {'DEP': 'amod', 'OP': '?'},          # optional adjectival modifier of X
    {'POS': 'NOUN'},                     # X
    {'LOWER': {'IN': ['and', 'or']}},    # exactly one of "and" / "or"
    {'LOWER': 'other'},
    {'POS': 'NOUN'},                     # Y
]

In [17]:
# Fresh matcher holding only the "X and/or other Y" pattern.
matcher = Matcher(nlp.vocab)
# Patterns go in a list as the second argument. The legacy
# add(key, on_match, *patterns) call form is not accepted by spaCy v3;
# the list form works from spaCy v2.2.2 onwards.
matcher.add("matching_1", [pattern])

In [18]:
# Run the matcher; each result is a (match_id, start, end) tuple of token offsets.
matches = matcher(doc)

if matches:
    # Optional tokens can yield several candidates; take the longest explicitly.
    match_id, start, end = max(matches, key=lambda m: m[2] - m[1])
    span = doc[start:end]
    print(span.text)
else:
    # Previously matches[0] raised IndexError when the pattern did not match.
    span = None
    print('No match found.')

car and other vehicles


In [19]:
# Same sentence with 'or' in place of 'and'; only `doc` is re-parsed here.
doc = nlp(
    "Here is how you can keep your car or other vehicles clean."
)

In [20]:
# Re-run the existing matcher on the re-parsed sentence; each result is a
# (match_id, start, end) tuple of token offsets.
matches = matcher(doc)

if matches:
    # Optional tokens can yield several candidates; take the longest explicitly.
    match_id, start, end = max(matches, key=lambda m: m[2] - m[1])
    span = doc[start:end]
    print(span.text)
else:
    # Previously matches[0] raised IndexError when the pattern did not match.
    span = None
    print('No match found.')

car or other vehicles


## Initialize Text

#### Pattern: X, including Y

In [21]:
# Example sentence for the "X, including Y" pattern, run through the spaCy pipeline.
text = "Eight people, including two children, were injured in the explosion."
doc = nlp(text)

## Analyze Text Syntax

#### 1) Dependency Parsing

In [22]:
# One line per token: surface text and its dependency label.
for token in doc:
    print(f'{token.text} -> {token.dep_}')

Eight -> nummod
people -> nsubjpass
, -> punct
including -> prep
two -> nummod
children -> pobj
, -> punct
were -> auxpass
injured -> ROOT
in -> prep
the -> det
explosion -> pobj
. -> punct


#### 2) Part of Speech (POS) Tagging

In [23]:
# One line per token: surface text and its coarse part-of-speech tag.
for token in doc:
    print(token.text, token.pos_, sep=' -> ')

Eight -> NUM
people -> NOUN
, -> PUNCT
including -> VERB
two -> NUM
children -> NOUN
, -> PUNCT
were -> AUX
injured -> VERB
in -> ADP
the -> DET
explosion -> NOUN
. -> PUNCT


## Capture Text By Pattern

In [24]:
# Token pattern for "X, including Y". Both X and Y are a NOUN that may be
# preceded by an optional numeric modifier (nummod) and an optional
# adjectival modifier (amod); a punctuation token and the word "including"
# sit between them.
pattern = [
    dict(DEP='nummod', OP='?'),
    dict(DEP='amod', OP='?'),
    dict(POS='NOUN'),
    dict(IS_PUNCT=True),
    dict(LOWER='including'),
    dict(DEP='nummod', OP='?'),
    dict(DEP='amod', OP='?'),
    dict(POS='NOUN'),
]

In [25]:
# Fresh matcher holding only the "X, including Y" pattern.
matcher = Matcher(nlp.vocab)
# Patterns go in a list as the second argument. The legacy
# add(key, on_match, *patterns) call form is not accepted by spaCy v3;
# the list form works from spaCy v2.2.2 onwards.
matcher.add("matching_1", [pattern])

In [26]:
# Run the matcher; each result is a (match_id, start, end) tuple of token offsets.
matches = matcher(doc)

if matches:
    # The optional modifiers yield candidates with and without the leading
    # modifier; select the longest one explicitly.
    match_id, start, end = max(matches, key=lambda m: m[2] - m[1])
    span = doc[start:end]
    print(span.text)
else:
    # Previously matches[0] raised IndexError when the pattern did not match.
    span = None
    print('No match found.')

Eight people, including two children


## Initialize Text

#### Pattern: X, especially Y

In [27]:
# Example sentence for the "X, especially Y" pattern, run through the spaCy pipeline.
text = "A healthy eating pattern includes fruits, especially whole fruits."
doc = nlp(text)

## Analyze Text Syntax

#### 1) Dependency Parsing

In [28]:
# One line per token: surface text and its dependency label.
for token in doc:
    print(f'{token.text} -> {token.dep_}')

A -> det
healthy -> amod
eating -> amod
pattern -> nsubj
includes -> ROOT
fruits -> dobj
, -> punct
especially -> advmod
whole -> amod
fruits -> appos
. -> punct


#### 2) Part of Speech (POS) Tagging

In [29]:
# One line per token: surface text and its coarse part-of-speech tag.
for token in doc:
    print(token.text, token.pos_, sep=' -> ')

A -> DET
healthy -> ADJ
eating -> VERB
pattern -> NOUN
includes -> VERB
fruits -> NOUN
, -> PUNCT
especially -> ADV
whole -> ADJ
fruits -> NOUN
. -> PUNCT


## Capture Text By Pattern

In [30]:
# Token pattern for "X, especially Y". Both X and Y are a NOUN that may be
# preceded by an optional numeric modifier (nummod) and an optional
# adjectival modifier (amod); a punctuation token and the word "especially"
# sit between them.
pattern = [
    dict(DEP='nummod', OP='?'),
    dict(DEP='amod', OP='?'),
    dict(POS='NOUN'),
    dict(IS_PUNCT=True),
    dict(LOWER='especially'),
    dict(DEP='nummod', OP='?'),
    dict(DEP='amod', OP='?'),
    dict(POS='NOUN'),
]

In [31]:
# Fresh matcher holding only the "X, especially Y" pattern.
matcher = Matcher(nlp.vocab)
# Patterns go in a list as the second argument. The legacy
# add(key, on_match, *patterns) call form is not accepted by spaCy v3;
# the list form works from spaCy v2.2.2 onwards.
matcher.add("matching_1", [pattern])

In [32]:
# Run the matcher; each result is a (match_id, start, end) tuple of token offsets.
matches = matcher(doc)

if matches:
    # Optional tokens can yield several candidates; take the longest explicitly.
    match_id, start, end = max(matches, key=lambda m: m[2] - m[1])
    span = doc[start:end]
    print(span.text)
else:
    # Previously matches[0] raised IndexError when the pattern did not match.
    span = None
    print('No match found.')

fruits, especially whole fruits


## 2) Relation Extraction

## Initialize Text

In [33]:
# First relation-extraction example (passive "acquired by"), run through the spaCy pipeline.
text = "Tableau was recently acquired by Salesforce."
doc = nlp(text)

## Analyze Text Syntax

#### 1) Dependency Parsing

In [34]:
# One line per token: surface text and its dependency label.
for token in doc:
    print(f'{token.text} -> {token.dep_}')

Tableau -> nsubjpass
was -> auxpass
recently -> advmod
acquired -> ROOT
by -> agent
Salesforce -> pobj
. -> punct


#### 2) Part of Speech (POS) Tagging

In [35]:
# One line per token: surface text and its coarse part-of-speech tag.
for token in doc:
    print(token.text, token.pos_, sep=' -> ')

Tableau -> PROPN
was -> AUX
recently -> ADV
acquired -> VERB
by -> ADP
Salesforce -> PROPN
. -> PUNCT


## Display Dependency Graph

In [36]:
# Draw the dependency parse of the current sentence inline in the notebook.
displacy.render(doc, jupyter=True, style='dep')

## Initialize Text

In [37]:
# Second relation-extraction example: the subject is followed by a
# comma-delimited descriptive phrase before the passive verb.
text = "Careem, a ride-hailing major in the middle east, was acquired by Uber."
doc = nlp(text)

## Analyze Text Syntax

#### 1) Dependency Parsing

In [38]:
# One line per token: surface text and its dependency label.
for token in doc:
    print(f'{token.text} -> {token.dep_}')

Careem -> nsubjpass
, -> punct
a -> det
ride -> npadvmod
- -> punct
hailing -> amod
major -> appos
in -> prep
the -> det
middle -> compound
east -> pobj
, -> punct
was -> auxpass
acquired -> ROOT
by -> agent
Uber -> pobj
. -> punct


#### 2) Part of Speech (POS) Tagging

```python
for token in doc:
    print(token.text, '->', token.pos_)
```

## Display Dependency Graph

In [39]:
# Draw the dependency parse of the current sentence inline in the notebook.
displacy.render(doc, jupyter=True, style='dep')

---