# Advanced NLP with spaCy


## 1. Finding words, phrases, names and concepts

In [1]:
# Import the English language class
from spacy.lang.en import English

# Create the nlp object
nlp = English()

# Create by processing a string of text with the nlp object
doc = nlp("Hello world!")
type(doc)

# Itearate over tokens in a Doc
for token in doc:
    print(token.text)
    
# Index into the Doc to get a single Token
token = doc[1]
type(token)

# A slice from the Doc is a Span object
span = doc[1:4]
type(span)

# Get span text
print(span.text)


Hello
world
!
world!


### Token lexical attributes

In [2]:
doc = nlp("It costs, $5")
print('Index: ', [token.i for token in doc])
print('Text: ', [token.text for token in doc])
print('is_alpha: ', [token.is_alpha for token in doc])
print('is_punct: ', [token.is_punct for token in doc])
print('like_num: ', [token.like_num for token in doc])

Index:  [0, 1, 2, 3, 4]
Text:  ['It', 'costs', ',', '$', '5']
is_alpha:  [True, True, False, False, False]
is_punct:  [False, False, True, False, False]
like_num:  [False, False, False, False, True]


#### German example

In [3]:
# Import the German language class
from spacy.lang.de import German

# Create the nlp object
nlp = German()

# Process a text (this is German for: "Kind regards!")
doc = nlp("Liebe Grüße!")

# Print the document text
print(doc.text)

Liebe Grüße!


### Lexical attributes

In [4]:
from spacy.lang.en import English
nlp = English()

# Process the text
doc = nlp("In 1990, more than 60% of people in East Asia were in extreme poverty. Now less than 4% are.")

# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals '%'
        if next_token.text == '%':
            print('Percentage found:', token.text)

Percentage found: 60
Percentage found: 4


### Statistical models

- Enable spaCy to predict linguistic attributes in context
  - Part-of-speach tags
  - Syntatic dependencies
  - Named entities
- Trained on labeled example texts
- Can be updated with more examples to fine-tune predictions

### Pretrained model packages
- en_core_web_sm (trained on web text)
  - Binary weights
  - Vocabulary
  - Meta information (language, pipeline)
- de_core_news_sm

### Predict part of speach

In [11]:
import spacy

# Load small Eanglish model
nlp = spacy.load('en_core_web_sm')

# Process text
doc = nlp("She ate the pizza")

# Iterate over the tokens
for token in doc:
    
    # Pring the text and the predicted part-of-speach tag
    print(token.text, token.pos_)
    
    # In this case return id of element
    print(token.text, token.pos)

She PRON
She 95
ate VERB
ate 100
the DET
the 90
pizza NOUN
pizza 92


### Predict Syntatic Dependencies

In [9]:
# Process text
doc = nlp("She ate the pizza")

for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


*Label defenitions*
*nsubj* -> nominal subject - She
*dobj* -> direct object - pizza
*det* -> determiner (article) - the

### Predict Named Entities

In [18]:
# Process a text
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")

# Iterate over the predicted entities
for ent in doc.ents:
    
    # Print entity text and its label
    print(ent.text, ent.label_, ent.label)

Apple ORG 383
U.K. GPE 384
$1 billion MONEY 394


GPE - geo political entity
To provide quick definitions to tags and labels used following method

In [17]:
# SYM, relcl, quantmod etc.
spacy.explain('quantmod')

'modifier of quantifier'

### Predicting linguistic annotations

In [15]:
text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print('{:<12}{:<10}{:<10}'.format(token_text, token_pos, token_dep))

It          PRON      nsubj     
’s          VERB      compound  
official    NOUN      ROOT      
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


### Rule based matcher

- Matcher on Doc objects, not just strings
- Matcher on tokens and token attributes
- Use the model's predictions
- Example: find "duck" (verb) not "duck" (noun)

#### Match patterns
- List of dictionaries, one per token
- Match exact token texts
In this example looking for two tokens in the text:
```
[{'ORTH': 'iPhone'}, {'ORTH': 'X'}]
```
- Match lexical attributes
```
[{'LOWER': 'iphone'}, {'LOWER': 'x'}]
```
- Match any token attributes
Lemma is a base form it match:
  - buying a milk
  - drinking a coffe
```
[{'LEMMA': 'buy'}, {'POS': 'NOUN'}]
```

In [20]:
import spacy

# Import matcher
from spacy.matcher import Matcher

# Load a model and create the nlp object
nlp = spacy.load('en_core_web_sm')

# initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{'ORTH': 'iPhone'}, {'ORTH': 'X'}]
matcher.add('IPHONE_PATTERN', None, pattern)

# Process some text
doc = nlp("New iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

for match_id, start, end in matches:
    # match_id: hash value of the pattern name
    # start: start index of matched span
    # end: end index of matched span
    
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

iPhone X


In [25]:
matcher = Matcher(nlp.vocab)
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa'},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    {'IS_PUNCT': True}
]
matcher.add('WORLD_CUP_PATTERN', None, pattern)

doc = nlp("2018 FIFA World Cup: France won!")
matches = matcher(doc)
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

2018 FIFA World Cup:


In [31]:
matcher = Matcher(nlp.vocab)
pattern = [
    {'LEMMA': 'love', 'POS': 'VERB'},
    {'POS': 'NOUN'}
]
matcher.add('LOVE', None, pattern)
doc = nlp("I loved dogs but now I love cats more.")

matches = matcher(doc)
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

loved dogs
love cats


In [47]:
# Matching operators and quantifiers
matcher = Matcher(nlp.vocab)
pattern = [
    {'LEMMA': 'buy'},
    {'POS': 'DET', 'OP': '?'}, # optional: match 0 or 1 times
    {'POS': 'NOUN'}
]
matcher.add('BUY_THINGS', None, pattern)

doc = nlp("I bought a smartphone. Now I'm buying apps")
matches = matcher(doc)
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

bought a smartphone
buying apps


```{'OP': '!'}``` - negation: match 0 times
```{'OP': '?'}``` - optional: match 0 or 1 times
```{'OP': '+'}``` - match 1 or more times
```{'OP': '*'}``` - match 0 or more times

#### Examples

In [48]:
# Using the Matcher
# Import the Matcher and initialize it with the shared vocabulary
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]

# Add the pattern to the matcher
matcher.add('IPHONE_X_PATTERN', None, pattern)

doc = nlp("New iPhone X release date leaked as Apple reveals pre-orders by mistake")

# Use the matcher on the doc
matches = matcher(doc)
print('Matches:', [doc[start:end].text for match_id, start, end in matches])

Matches: ['iPhone X']


In [43]:
# Writing match patterns
matcher = Matcher(nlp.vocab)
doc = nlp("i downloaded Fortnite on my laptop and can't open the game at all. Help? so when I was downloading Minecraft, I got the Windows version where it is the '.zip' folder and I used the default program to unpack it... do I also need to download Winzip?")

# Write a pattern that matches a form of "download" plus proper noun
pattern = [{'LEMMA': 'download'}, {'POS': 'PROPN'}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('DOWNLOAD_THINGS_PATTERN', None, pattern)
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Total matches found: 3
Match found: downloaded Fortnite
Match found: downloading Minecraft
Match found: download Winzip


In [45]:
# Writing match patterns
matcher = Matcher(nlp.vocab)
doc = nlp("Features of the app include a beautiful design, smart search, automatic labels and optional voice responses.")

# Write a pattern for adjective plus one or two nouns
pattern = [{'POS': 'ADJ'}, {'POS': 'NOUN'}, {'POS': 'NOUN', 'OP': '?'}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add('ADJ_NOUN_PATTERN', None, pattern)
matches = matcher(doc)
print('Total matches found:', len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print('Match found:', doc[start:end].text)

Total matches found: 5
Match found: beautiful design
Match found: smart search
Match found: automatic labels
Match found: optional voice
Match found: optional voice responses



## 2. Large-scale data analysis with spaCy

### Data structures: Vocab, Lexemes and StringStore

#### Shared vocab and string store
 - Vocab: stores data shared accross multiple documents
 - To save memory, spyCy encodes all strings to hash values
 - Strings are only stored once in the StringStore via npl.vocab.strings
 - String store: lookup table in both directions

In [51]:
coffee_hash = nlp.vocab.strings['coffee']
coffee_string = nlp.vocab.strings['coffee_hash']
print(coffee_hash)
print(coffee_string)

3197928453018144401
12035782083212280080


 - hashes can't be reversed - that's why we need to provice the shared vocab
 - lookup the string and hash in nlp.vocab.strings

In [56]:
doc = nlp("I love coffee")

hash_value = nlp.vocab.strings['coffee']
print('hash value:', hash_value)

# Rise an error if we haven't seen the string before
string = nlp.vocab.strings[hash_value]
print('string value: ', string)

hash value: 3197928453018144401
string value:  coffee


 - the doc also exposes the vocab and strings

In [58]:
doc = nlp("I love coffee")
print('hash value: ', doc.vocab.strings['coffee'])

hash value:  3197928453018144401


#### Lexemes: entries in the vocabulary
 - a Lexeme object is an entry in the vocabulary

In [61]:
doc = nlp("I love coffee")
lexeme = nlp.vocab['coffee']
# print the lexical attributes
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


 - contains the context-independent information about word
    - word text: lexeme.text and lexeme.orth (the hash)
    - lexical attributes like lexeme.is_alpha
    - not context-dependent part-of-speach tags, dependencies or entity labels

![id](images/vocab_hashes_lexemes.png "Vocab, hashes and lexemes") 

### Data structures: Doc, Span and Token

In [71]:
# Create an nlp ogject
from spacy.lang.en import English
nlp = English()

# Import the Doc class
from spacy.tokens import Doc, Span

# The words and spaces to create the doc from
words = ['Hello', 'world', '!']
spaces = [True, False, False]

# Create a doc manually
doc = Doc(nlp.vocab, words = words, spaces = spaces)
print("Doc text: ", doc.text)

# Create a span manually
span = Span(doc, 0, 2)
print("Span text: ", span.text)

# Create a span with a label
span_with_label = Span(doc, 0, 2, label = "GREETING")
print("Span with label text: ", span_with_label.text)

# Add span to the doc.ents
doc.ents = [span_with_label]
print("New doc text: ", doc.text)

Doc text:  Hello world!
Span text:  Hello world
Span with label text:  Hello world
New doc text:  Hello world!


![id](images/span_object.png "The Span object")

#### Best practices
 - Doc and Span are very poverfull and hod reference and relationships of words and sentences
   - Convert result to strings as late as possible
   - Use token attributes if available - for example, token.i for the token index
 

#### Excercise

In [72]:
# Excersise
# Import the Doc and Span classes
from spacy.tokens import Doc, Span

# Create a doc from the words and spaces
doc = Doc(nlp.vocab, words=['I', 'like', 'David', 'Bowie'], spaces=[True, True, True, False])

# Create a span for "David Bowie" from the doc and assign it the label "PERSON"
span = Span(doc, 2, 4, label='PERSON')

# Add the span to the doc's entities
doc.ents = [span]

# Print entities' text and labels
print([(ent.text, ent.label_) for ent in doc.ents])

[('David Bowie', 'PERSON')]


In [98]:
# Excersise
nlp = spacy.load('en_core_web_sm')
doc = nlp("Berlin is a nice city")
# Get all tokens and part-of-speech tags
pos_tags = [token.pos_ for token in doc]

for token in doc:
    # Check if the current token is a proper noun
    if token.pos_ == 'PROPN':
        # Check if the next token is a verb
        if doc[token.i + 1].pos_ == 'AUX':
            print('Found a verb after a proper noun!')

Found a verb after a proper noun!


### Word vector and semantic similarity

#### Comparing semantic similarity
- spaCy can compare two objects and predict similarity (Span, Doc, Token)
- Doc.similarity(), Span.similarity() and Token.similarity()
- Take another object and return a similarity score (0 to 1)
- Important: needs a model that has word vector included, for example:
  - YES: en_core_web_md (medium model)
  - YES: en_core_web_lg (large model)
  - YES: en_core_web_sm (small model)

#### Similarity examples

In [3]:
import spacy

# Load a large model with vectors
nlp = spacy.load('en_core_web_md')

# Compare two documents
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

I like fast food
0.8627203210548107


In [4]:
# Compare two tokens
doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))

0.7369546


In [5]:
# Compare a document with a token
doc = nlp("I like pizza")
token=nlp("soap")[0]

print(doc.similarity(token))

0.3253198600655889


In [6]:
# Compare a span with a document
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")

print(span.similarity(doc))

0.6199092090831612


#### How does spaCy predict similarity?
- Similarity is determined using word vectors
- Multy-dimensional meaning representations of words
- Generated using an algorithm like Word2Vec and lots of text
- Can be added to spaCy's statistical model
- Default: consine similarity, but can be adjusted
- Doc and Span vectors default to average of token vectors
- Short phrases are better than long documents with many irrelevant words

#### Word vector in spaCy

In [10]:
doc = nlp("I have a banana")
# Access the vector via the token.vector attribute
print(doc[3].vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

#### Similarity depends on the application context
- Useful for many applications: recommendation system, flagging duplicates etc.
- There's no objective definition of "similarity"
- Depends on the context and what application needs to do

In [11]:
# Texts is similar bcs both of them about cats
doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")
print(doc1.similarity(doc2))

0.9501447503553421


In [12]:
doc1 = nlp("like")
doc2 = nlp("hate")
print(doc1.similarity(doc2))

0.6574650996652592


### Combining models and rules

|                     | Statistical models                                          | Rule-based systems                                     |   |   |
|---------------------|-------------------------------------------------------------|--------------------------------------------------------|---|---|
| Use cases           | applications needs to generalize based on examples          | dictionary with finite number of examples              |   |   |
| Real-world examples | product names, person names, subject/object relationships   | countries of the world, cities, drug names, dog breeds |   |   |
| spaCy features      | entity recognizer, dependency parser, part-of-speech tagger | tokenizer, Matcher, PhraseMatcher                      |   |   |

#### Recap: Rule-based Matching

In [27]:
# Initialize with the shared vocab
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# Patterns are lists of dictionaries describing the tokens
# pattern = [{'LEMMA': 'love', 'POS': 'VERB'}, {'LOWER', 'cats'}]
# matcher.add('LOVE_CATS', None, pattern)

# Operators can specify how often a token should be matched
pattern = [{'TEXT': 'very', 'OP': '+'}, {'TEXT': 'happy'}]
matcher.add('HAPPY', None, pattern)

# Calling matcher on doc returns list of (match_id, start, end) tuples
doc = nlp("I love cats and I'm very very happy")
matches = matcher(doc)

print('Matches:', [doc[start:end].text for match_id, start, end in matches])

Matches: ['very happy', 'very very happy']


#### Adding statistical predictions

In [32]:
matcher = Matcher(nlp.vocab)
matcher.add('DOG', None, [{'LOWER': 'golden'}, {'LOWER': 'retriver'}])
doc = nlp("I have a Golden Retriver")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print('Matcher span:', span.text)
    
    # Get the span's root token and root head token
    print('Root token:', span.root.text)
    print('Root head token:', span.root.head.text)
    
    # Get the previous token and its POS tag
    print('Previous token: ', doc[start - 1].text, doc[start - 1].pos_)

Matcher span: Golden Retriver
Root token: Retriver
Root head token: have
Previous token:  a DET


#### Efficient phrase matching
- PhraseMatcher like regular expressions or keyword search - but access to the tokens!
- Takes Doc object as patterns
- More efficient and faster than the Matcher
- Great for matching large word lists

In [34]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
pattern = nlp("Golden Retriver")
matcher.add('DOG', None, pattern)
doc = nlp("I have a Golden Retriver")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print('Matcher span:', span.text)

Matcher span: Golden Retriver


#### Debugging patterns

In [59]:
doc = nlp("Twitch Prime, the perks program for Amazon Prime members offering free loot, games and other benefits, is ditching one of its best features: ad-free viewing. According to an email sent out to Amazon Prime members today, ad-free viewing will no longer be included as a part of Twitch Prime for new members, beginning on September 14. However, members with existing annual subscriptions will be able to continue to enjoy ad-free viewing until their subscription comes up for renewal. Those with monthly subscriptions will have access to ad-free viewing until October 15.")

# Create the match patterns
pattern1 = [{'LOWER': 'amazon'}, {'IS_TITLE': True, 'POS': 'PROPN'}]
pattern2 = [{'LOWER': 'ad'}, {'LOWER': '-'}, {'LOWER': 'free'}, {'POS': 'NOUN'}]

# Initialize the Matcher and add the patterns
matcher = Matcher(nlp.vocab)
matcher.add('PATTERN1', None, pattern1)
matcher.add('PATTERN2', None, pattern2)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Print pattern string name and text of matched span
    print(doc.vocab.strings[match_id], doc[start:end].text)

PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN1 Amazon Prime
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing
PATTERN2 ad-free viewing


In [58]:
[token.text for token in nlp("ad-free viewing")]

['ad', '-', 'free', 'viewing']

In [64]:
COUNTRIES = ['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'United States Minor Outlying Islands',
 'Virgin Islands (British)',
 'Virgin Islands (U.S.)',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cabo Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo (Democratic Republic of the)',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Falkland Islands (Malvinas)',
 'Faroe Islands',
 'Fiji',
 'Finland',
 'France',
 'French Guiana',
 'French Polynesia',
 'French Southern Territories',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Gibraltar',
 'Greece',
 'Greenland',
 'Grenada',
 'Guadeloupe',
 'Guam',
 'Guatemala',
 'Guernsey',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Heard Island and McDonald Islands',
 'Holy See',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 "Côte d'Ivoire",
 'Iran (Islamic Republic of)',
 'Iraq',
 'Ireland',
 'Isle of Man',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jersey',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Kuwait',
 'Kyrgyzstan',
 "Lao People's Democratic Republic",
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Macao',
 'Macedonia (the former Yugoslav Republic of)',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Maldives',
 'Mali',
 'Malta',
 'Marshall Islands',
 'Martinique',
 'Mauritania',
 'Mauritius',
 'Mayotte',
 'Mexico',
 'Micronesia (Federated States of)',
 'Moldova (Republic of)',
 'Monaco',
 'Mongolia',
 'Montenegro',
 'Montserrat',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Namibia',
 'Nauru',
 'Nepal',
 'Netherlands',
 'New Caledonia',
 'New Zealand',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'Niue',
 'Norfolk Island',
 "Korea (Democratic People's Republic of)",
 'Northern Mariana Islands',
 'Norway',
 'Oman',
 'Pakistan',
 'Palau',
 'Palestine, State of',
 'Panama',
 'Papua New Guinea',
 'Paraguay',
 'Peru',
 'Philippines',
 'Pitcairn',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Qatar',
 'Republic of Kosovo',
 'Réunion',
 'Romania',
 'Russian Federation',
 'Rwanda',
 'Saint Barthélemy',
 'Saint Helena, Ascension and Tristan da Cunha',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Martin (French part)',
 'Saint Pierre and Miquelon',
 'Saint Vincent and the Grenadines',
 'Samoa',
 'San Marino',
 'Sao Tome and Principe',
 'Saudi Arabia',
 'Senegal',
 'Serbia',
 'Seychelles',
 'Sierra Leone',
 'Singapore',
 'Sint Maarten (Dutch part)',
 'Slovakia',
 'Slovenia',
 'Solomon Islands',
 'Somalia',
 'South Africa',
 'South Georgia and the South Sandwich Islands',
 'Korea (Republic of)',
 'South Sudan',
 'Spain',
 'Sri Lanka',
 'Sudan',
 'Suriname',
 'Svalbard and Jan Mayen',
 'Swaziland',
 'Sweden',
 'Switzerland',
 'Syrian Arab Republic',
 'Taiwan',
 'Tajikistan',
 'Tanzania, United Republic of',
 'Thailand',
 'Timor-Leste',
 'Togo',
 'Tokelau',
 'Tonga',
 'Trinidad and Tobago',
 'Tunisia',
 'Turkey',
 'Turkmenistan',
 'Turks and Caicos Islands',
 'Tuvalu',
 'Uganda',
 'Ukraine',
 'United Arab Emirates',
 'United Kingdom of Great Britain and Northern Ireland',
 'United States of America',
 'Uruguay',
 'Uzbekistan',
 'Vanuatu',
 'Venezuela (Bolivarian Republic of)',
 'Viet Nam',
 'Wallis and Futuna',
 'Western Sahara',
 'Yemen',
 'Zambia',
 'Zimbabwe']

# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add('COUNTRY', None, *patterns)
doc = nlp("Czech Republic may help Slovakia protect its airspace")
# Call the matcher on the test document and print the result
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches])

# Result:
# [Czech Republic, Slovakia]

[Czech Republic, Slovakia]


In [82]:
# Extracting countries and relationship
from spacy.tokens import Doc, Span

COUNTRIES = ['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'United States Minor Outlying Islands',
 'Virgin Islands (British)',
 'Virgin Islands (U.S.)',
 'Brunei Darussalam',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cabo Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Congo',
 'Congo (Democratic Republic of the)',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czech Republic',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Falkland Islands (Malvinas)',
 'Faroe Islands',
 'Fiji',
 'Finland',
 'France',
 'French Guiana',
 'French Polynesia',
 'French Southern Territories',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Gibraltar',
 'Greece',
 'Greenland',
 'Grenada',
 'Guadeloupe',
 'Guam',
 'Guatemala',
 'Guernsey',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Heard Island and McDonald Islands',
 'Holy See',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 "Côte d'Ivoire",
 'Iran (Islamic Republic of)',
 'Iraq',
 'Ireland',
 'Isle of Man',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jersey',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Kuwait',
 'Kyrgyzstan',
 "Lao People's Democratic Republic",
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Macao',
 'Macedonia (the former Yugoslav Republic of)',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Maldives',
 'Mali',
 'Malta',
 'Marshall Islands',
 'Martinique',
 'Mauritania',
 'Mauritius',
 'Mayotte',
 'Mexico',
 'Micronesia (Federated States of)',
 'Moldova (Republic of)',
 'Monaco',
 'Mongolia',
 'Montenegro',
 'Montserrat',
 'Morocco',
 'Mozambique',
 'Myanmar',
 'Namibia',
 'Nauru',
 'Nepal',
 'Netherlands',
 'New Caledonia',
 'New Zealand',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'Niue',
 'Norfolk Island',
 "Korea (Democratic People's Republic of)",
 'Northern Mariana Islands',
 'Norway',
 'Oman',
 'Pakistan',
 'Palau',
 'Palestine, State of',
 'Panama',
 'Papua New Guinea',
 'Paraguay',
 'Peru',
 'Philippines',
 'Pitcairn',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Qatar',
 'Republic of Kosovo',
 'Réunion',
 'Romania',
 'Russian Federation',
 'Rwanda',
 'Saint Barthélemy',
 'Saint Helena, Ascension and Tristan da Cunha',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Martin (French part)',
 'Saint Pierre and Miquelon',
 'Saint Vincent and the Grenadines',
 'Samoa',
 'San Marino',
 'Sao Tome and Principe',
 'Saudi Arabia',
 'Senegal',
 'Serbia',
 'Seychelles',
 'Sierra Leone',
 'Singapore',
 'Sint Maarten (Dutch part)',
 'Slovakia',
 'Slovenia',
 'Solomon Islands',
 'Somalia',
 'South Africa',
 'South Georgia and the South Sandwich Islands',
 'Korea (Republic of)',
 'South Sudan',
 'Spain',
 'Sri Lanka',
 'Sudan',
 'Suriname',
 'Svalbard and Jan Mayen',
 'Swaziland',
 'Sweden',
 'Switzerland',
 'Syrian Arab Republic',
 'Taiwan',
 'Tajikistan',
 'Tanzania, United Republic of',
 'Thailand',
 'Timor-Leste',
 'Togo',
 'Tokelau',
 'Tonga',
 'Trinidad and Tobago',
 'Tunisia',
 'Turkey',
 'Turkmenistan',
 'Turks and Caicos Islands',
 'Tuvalu',
 'Uganda',
 'Ukraine',
 'United Arab Emirates',
 'United Kingdom of Great Britain and Northern Ireland',
 'United States of America',
 'Uruguay',
 'Uzbekistan',
 'Vanuatu',
 'Venezuela (Bolivarian Republic of)',
 'Viet Nam',
 'Wallis and Futuna',
 'Western Sahara',
 'Yemen',
 'Zambia',
 'Zimbabwe']

# Import the PhraseMatcher and initialize it
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

# Create pattern Doc objects and add them to the matcher
# This is the faster version of: [nlp(country) for country in COUNTRIES]
patterns = list(nlp.pipe(COUNTRIES))
matcher.add('COUNTRY', None, *patterns)

# Create a doc and find matches in it
text = "After the Cold War, the UN saw a radical expansion in its peacekeeping duties, taking on more missions in ten years than it had in the previous four decades.Between 1988 and 2000, the number of adopted Security Council resolutions more than doubled, and the peacekeeping budget increased more than tenfold. The UN negotiated an end to the Salvadoran Civil War, launched a successful peacekeeping mission in Namibia, and oversaw democratic elections in post-apartheid South Africa and post-Khmer Rouge Cambodia. In 1991, the UN authorized a US-led coalition that repulsed the Iraqi invasion of Kuwait. Brian Urquhart, Under-Secretary-General from 1971 to 1985, later described the hopes raised by these successes as a \"false renaissance\" for the organization, given the more troubled missions that followed. Though the UN Charter had been written primarily to prevent aggression by one nation against another, in the early 1990s the UN faced a number of simultaneous, serious crises within nations such as Somalia, Haiti, Mozambique, and the former Yugoslavia. The UN mission in Somalia was widely viewed as a failure after the US withdrawal following casualties in the Battle of Mogadishu, and the UN mission to Bosnia faced \"worldwide ridicule\" for its indecisive and confused mission in the face of ethnic cleansing. In 1994, the UN Assistance Mission for Rwanda failed to intervene in the Rwandan genocide amid indecision in the Security Council. Beginning in the last decades of the Cold War, American and European critics of the UN condemned the organization for perceived mismanagement and corruption. In 1984, the US President, Ronald Reagan, withdrew his nation's funding from UNESCO (the United Nations Educational, Scientific and Cultural Organization, founded 1946) over allegations of mismanagement, followed by Britain and Singapore. Boutros Boutros-Ghali, Secretary-General from 1992 to 1996, initiated a reform of the Secretariat, reducing the size of the organization somewhat. His successor, Kofi Annan (1997–2006), initiated further management reforms in the face of threats from the United States to withhold its UN dues. In the late 1990s and 2000s, international interventions authorized by the UN took a wider variety of forms. The UN mission in the Sierra Leone Civil War of 1991–2002 was supplemented by British Royal Marines, and the invasion of Afghanistan in 2001 was overseen by NATO. In 2003, the United States invaded Iraq despite failing to pass a UN Security Council resolution for authorization, prompting a new round of questioning of the organization's effectiveness. Under the eighth Secretary-General, Ban Ki-moon, the UN has intervened with peacekeepers in crises including the War in Darfur in Sudan and the Kivu conflict in the Democratic Republic of Congo and sent observers and chemical weapons inspectors to the Syrian Civil War. In 2013, an internal review of UN actions in the final battles of the Sri Lankan Civil War in 2009 concluded that the organization had suffered \"systemic failure\". One hundred and one UN personnel died in the 2010 Haiti earthquake, the worst loss of life in the organization's history. The Millennium Summit was held in 2000 to discuss the UN's role in the 21st century. The three day meeting was the largest gathering of world leaders in history, and culminated in the adoption by all member states of the Millennium Development Goals (MDGs), a commitment to achieve international development in areas such as poverty reduction, gender equality, and public health. Progress towards these goals, which were to be met by 2015, was ultimately uneven. The 2005 World Summit reaffirmed the UN's focus on promoting development, peacekeeping, human rights, and global security. The Sustainable Development Goals were launched in 2015 to succeed the Millennium Development Goals. In addition to addressing global challenges, the UN has sought to improve its accountability and democratic legitimacy by engaging more with civil society and fostering a global constituency. In an effort to enhance transparency, in 2016 the organization held its first public debate between candidates for Secretary-General. On 1 January 2017, Portuguese diplomat António Guterres, who previously served as UN High Commissioner for Refugees, became the ninth Secretary-General. Guterres has highlighted several key goals for his administration, including an emphasis on diplomacy for preventing conflicts, more effective peacekeeping efforts, and streamlining the organization to be more responsive and versatile to global needs."
doc = nlp(text)

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = Span(doc, start, end, label="GPE")
    print(span)

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + #[span] # overlapping issue here that need to be fixed

    # Get the span's root head token
    span_root_head = span.root.head
    # Print the text of the span root's head token and the span text
    print(span_root_head.text, '-->', span.text)
# Print the entities in the document
print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == 'GPE'])

Namibia
in --> Namibia
South Africa
in --> South Africa
Cambodia
Africa --> Cambodia
Kuwait
of --> Kuwait
Somalia
as --> Somalia
Haiti
Somalia --> Haiti
Mozambique
Haiti --> Mozambique
Somalia
in --> Somalia
Rwanda
for --> Rwanda
Singapore
Britain --> Singapore
Sierra Leone
War --> Sierra Leone
Afghanistan
of --> Afghanistan
Iraq
invaded --> Iraq
Sudan
in --> Sudan
Congo
of --> Congo
Haiti
earthquake --> Haiti
[('Namibia', 'GPE'), ('South Africa', 'GPE'), ('US', 'GPE'), ('Kuwait', 'GPE'), ('Somalia', 'GPE'), ('Haiti', 'GPE'), ('Mozambique', 'GPE'), ('Yugoslavia', 'GPE'), ('Somalia', 'GPE'), ('US', 'GPE'), ('Mogadishu', 'GPE'), ('Bosnia', 'GPE'), ('Rwanda', 'GPE'), ('US', 'GPE'), ('Britain', 'GPE'), ('Singapore', 'GPE'), ('the United States', 'GPE'), ('Afghanistan', 'GPE'), ('the United States', 'GPE'), ('Iraq', 'GPE'), ('Darfur', 'GPE'), ('Sudan', 'GPE'), ('Kivu', 'GPE'), ('the Democratic Republic of Congo', 'GPE'), ('Haiti', 'GPE')]


## 3. Processing Pipelines