# Pipeline

In [1]:
import spacy
# NOTE(review): require_gpu() presumably aborts when no GPU can be allocated, which
# makes the notebook non-runnable on CPU-only machines; spacy.prefer_gpu() is the
# usual fallback-friendly alternative — confirm which behavior is intended.
spacy.require_gpu()
# Load the English pipeline that the rest of the notebook works with
nlp = spacy.load('en_core_web_lg')
# Display the pipeline metadata (name, version, components, labels, ...)
nlp.meta

{'lang': 'en',
 'name': 'core_web_lg',
 'version': '3.4.1',
 'description': 'English pipeline optimized for CPU. Components: tok2vec, tagger, parser, senter, ner, attribute_ruler, lemmatizer.',
 'author': 'Explosion',
 'email': 'contact@explosion.ai',
 'url': 'https://explosion.ai',
 'license': 'MIT',
 'spacy_version': '>=3.4.0,<3.5.0',
 'spacy_git_version': '2b5f955c2',
 'vectors': {'width': 300,
  'vectors': 514157,
  'keys': 514157,
  'name': 'en_vectors',
  'mode': 'default'},
 'labels': {'tok2vec': [],
  'tagger': ['$',
   "''",
   ',',
   '-LRB-',
   '-RRB-',
   '.',
   ':',
   'ADD',
   'AFX',
   'CC',
   'CD',
   'DT',
   'EX',
   'FW',
   'HYPH',
   'IN',
   'JJ',
   'JJR',
   'JJS',
   'LS',
   'MD',
   'NFP',
   'NN',
   'NNP',
   'NNPS',
   'NNS',
   'PDT',
   'POS',
   'PRP',
   'PRP$',
   'RB',
   'RBR',
   'RBS',
   'RP',
   'SYM',
   'TO',
   'UH',
   'VB',
   'VBD',
   'VBG',
   'VBN',
   'VBP',
   'VBZ',
   'WDT',
   'WP',
   'WP$',
   'WRB',
   'XX',
   '_SP',
   '``

In [2]:
# List the names of the components in the loaded pipeline
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [3]:
# Show the (name, component object) pairs that make up the pipeline
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7ff4926ded60>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7ff4926deb80>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7ff4926dd4a0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7ff4925340c0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7ff4924e0700>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7ff4926dd2e0>)]

# Custom pipeline components

In [4]:
from spacy import Language
nlp = spacy.load('en_core_web_lg')


@Language.component('custom_component')
def custom_component(doc):
    """Report the number of tokens in `doc` and hand the doc back unchanged."""
    n_tokens = len(doc)
    print('Doc length:', n_tokens)
    return doc


# Register the component so that it runs before every other pipeline step
nlp.add_pipe('custom_component', first=True)
# Show the resulting component order
print('Pipeline:', nlp.pipe_names)
# Processing a text now triggers the custom component as well
doc = nlp('Hello world!')

Pipeline: ['custom_component', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Doc length: 3


# Complex components

In [5]:
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
nlp = spacy.load('en_core_web_lg')

# Phrase matcher holding every animal pattern under the single key "ANIMAL"
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", [nlp('Golden Retriever'), nlp('cat')])


@Language.component('animal_component')
def animal_component(doc):
    """Replace doc.ents with one 'ANIMAL' span per phrase match and return the doc."""
    animal_spans = []
    for _match_id, start, end in matcher(doc):
        animal_spans.append(Span(doc, start, end, label='ANIMAL'))
    # Overwrites whatever entities were set earlier in the pipeline
    doc.ents = animal_spans
    return doc


# Run the component right after the statistical 'ner' component
nlp.add_pipe('animal_component', after='ner')

# Process a text and show (text, label) for every entity on the doc
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


In [6]:
# Look up the string name stored in nlp.vocab.strings for each match_id the matcher returns
[nlp.vocab.strings[match_id] for match_id, start, end in matcher(doc)]

['ANIMAL', 'ANIMAL']

# Extension attribute types
1. Attribute extensions
2. Property extensions
3. Method extensions

## Attribute extensions
* Set a default value that can be overwritten

In [7]:
from spacy.tokens import Token
# Register 'is_color' on Token; every token starts out with the default False
Token.set_extension('is_color', default=False, force=True)
doc = nlp("The sky is blue.")
# Overwrite the default on the token at index 3
blue_token = doc[3]
blue_token._.is_color = True
blue_token.text, blue_token._.is_color

('blue', True)

## Property extensions

* Define a getter and an optional setter function
* Getter only called when you retrieve the attribute value

In [8]:
# Getter used for the Token property extension 'is_color'
def get_is_color(token):
    """Return True when the token's text is exactly 'red', 'yellow' or 'blue'."""
    return token.text in ('red', 'yellow', 'blue')

# Re-register 'is_color' as a getter-backed property (force=True replaces the earlier registration)
Token.set_extension('is_color', getter=get_is_color, force=True)

doc = nlp("The sky is blue.")
blue_token = doc[3]
print(blue_token._.is_color, '-', blue_token.text)

True - blue


* Span extensions should always use a getter. Otherwise, you'd have to update *every possible span ever* by hand to set all the values.

In [9]:
from spacy.tokens import Span

# Getter used for the Span property extension 'has_color'
def get_has_color(span):
    """Return True if at least one token in the span has a color word as its text."""
    color_words = ('red', 'yellow', 'blue')
    for token in span:
        if token.text in color_words:
            return True
    return False

# Register the getter-backed 'has_color' property on Span
Span.set_extension('has_color', getter=get_has_color, force=True)

doc = nlp("The sky is blue.")
# One slice that contains a color word and one that does not
for span in (doc[1:4], doc[0:2]):
    print(span._.has_color, '-', span.text)

True - sky is blue
False - The sky


## Method extensions
* Assign a function that becomes available as an object method
* Lets you pass arguments to the extension function

In [10]:
from spacy.tokens import Doc

# Method extension: receives the Doc first, then the caller's arguments
def has_token(doc, token_text):
    """Return True if any token in `doc` has exactly `token_text` as its text."""
    return any(token.text == token_text for token in doc)

# Expose has_token as doc._.has_token(...)
Doc.set_extension('has_token', method=has_token, force=True)

doc = nlp("The sky is blue.")
for word in ('blue', 'cloud'):
    print(doc._.has_token(word), '- ' + word)

True - blue
False - cloud


## Set extension attributes 1

In [11]:
# Token attribute extension 'is_country', False unless explicitly overwritten
Token.set_extension('is_country', default=False, force=True)

# Process the text and flag the token "Spain" (index 3)
doc = nlp("I live in Spain.")
spain_token = doc[3]
spain_token._.is_country = True

# Show (text, is_country) for every token
country_flags = [(token.text, token._.is_country) for token in doc]
print(country_flags)

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [12]:
# Getter used for the Token property extension 'reversed'
def get_reversed(token):
    """Return the token's text with its characters in reverse order."""
    return ''.join(reversed(token.text))
  
# Token property extension 'reversed', computed by get_reversed on access
Token.set_extension('reversed', getter=get_reversed, force=True)

# Process the text and show the reversed form of each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    reversed_text = token._.reversed
    print('reversed:', reversed_text)

reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


## Set extension attributes 2

In [13]:
# Getter used for the Doc property extension 'has_number'
def get_has_number(doc):
    """Return True if at least one token in the doc reports a truthy `like_num`."""
    for token in doc:
        if token.like_num:
            return True
    return False

# Doc property extension 'has_number', computed by get_has_number on access
Doc.set_extension('has_number', getter=get_has_number, force=True)

# Process the text and read the custom attribute
doc = nlp("The museum closed for five years in 2012.")
contains_number = doc._.has_number
print('has_number:', contains_number)

has_number: True


In [14]:
# Method extension: wraps a span's text in an HTML element
def to_html(span, tag):
    """Return the span's text enclosed in an opening and closing `tag` element."""
    return '<{0}>{1}</{0}>'.format(tag, span.text)

# Register the Span method extension 'to_html' with the method to_html.
# force=True (as used by every other set_extension call in this notebook) lets the
# cell be re-run without failing because the extension is already registered.
Span.set_extension('to_html', method=to_html, force=True)

# Process the text and call the to_html method on the span with the tag name 'strong'
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html('strong'))

<strong>Hello world</strong>


## Entities and extensions

In [15]:
# Load a fresh pipeline so components added to the previous nlp object are not carried over
nlp = spacy.load('en_core_web_lg')

def get_wikipedia_url(span):
    """Build a Wikipedia search URL for an entity span.

    Args:
        span: Object exposing `label_` (entity label string) and `text`.

    Returns:
        The search URL as a string when the label is PERSON, ORG, GPE or LOC
        ('LOCATION' is still accepted for backward compatibility), otherwise None.
    """
    # Local import keeps the cell self-contained
    from urllib.parse import quote

    # 'LOC' is the label the pipeline actually emits for non-GPE locations
    if span.label_ not in ('PERSON', 'ORG', 'GPE', 'LOC', 'LOCATION'):
        return None
    entity_text = span.text.replace(' ', '_')
    # Percent-encode so characters such as '&' or '#' cannot break the query string
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text, safe='')

# Set the Span extension 'wikipedia_url' using the getter get_wikipedia_url
Span.set_extension('wikipedia_url', getter=get_wikipedia_url, force=True)

doc = nlp("In over fifty years from his very first recordings right through to his last album, David Bowie was at the vanguard of contemporary culture.")
for ent in doc.ents:
    # Print the text, label and Wikipedia URL of the entity (None when the getter returns nothing)
    print(ent.text, ent.label_, ent._.wikipedia_url)

over fifty years DATE None
first ORDINAL None
David Bowie PERSON https://en.wikipedia.org/w/index.php?search=David_Bowie


## Components with extensions

In [16]:
# Country names used as phrase patterns for the 'COUNTRY' matcher.
# Spellings must agree with the keys of the `capitals` lookup, which is why the
# entry for Brazil uses the English spelling "Brazil" (previously "Brasil").
COUNTRIES = [
    "Afghanistan",
    "Åland Islands",
    "Albania",
    "Algeria",
    "American Samoa",
    "Andorra",
    "Angola",
    "Anguilla",
    "Antarctica",
    "Antigua and Barbuda",
    "Argentina",
    "Armenia",
    "Aruba",
    "Australia",
    "Austria",
    "Azerbaijan",
    "Bahamas",
    "Bahrain",
    "Bangladesh",
    "Barbados",
    "Belarus",
    "Belgium",
    "Belize",
    "Benin",
    "Bermuda",
    "Bhutan",
    "Bolivia (Plurinational State of)",
    "Bonaire, Sint Eustatius and Saba",
    "Bosnia and Herzegovina",
    "Botswana",
    "Bouvet Island",
    "Brazil",
    "British Indian Ocean Territory",
    "United States Minor Outlying Islands",
    "Virgin Islands (British)",
    "Virgin Islands (U.S.)",
    "Brunei Darussalam",
    "Bulgaria",
    "Burkina Faso",
    "Burundi",
    "Cambodia",
    "Cameroon",
    "Canada",
    "Cabo Verde",
    "Cayman Islands",
    "Central African Republic",
    "Chad",
    "Chile",
    "China",
    "Christmas Island",
    "Cocos (Keeling) Islands",
    "Colombia",
    "Comoros",
    "Congo",
    "Congo (Democratic Republic of the)",
    "Cook Islands",
    "Costa Rica",
    "Croatia",
    "Cuba",
    "Curaçao",
    "Cyprus",
    "Czech Republic",
    "Denmark",
    "Djibouti",
    "Dominica",
    "Dominican Republic",
    "Ecuador",
    "Egypt",
    "El Salvador",
    "Equatorial Guinea",
    "Eritrea",
    "Estonia",
    "Ethiopia",
    "Falkland Islands (Malvinas)",
    "Faroe Islands",
    "Fiji",
    "Finland",
    "France",
    "French Guiana",
    "French Polynesia",
    "French Southern Territories",
    "Gabon",
    "Gambia",
    "Georgia",
    "Germany",
    "Ghana",
    "Gibraltar",
    "Greece",
    "Greenland",
    "Grenada",
    "Guadeloupe",
    "Guam",
    "Guatemala",
    "Guernsey",
    "Guinea",
    "Guinea-Bissau",
    "Guyana",
    "Haiti",
    "Heard Island and McDonald Islands",
    "Holy See",
    "Honduras",
    "Hong Kong",
    "Hungary",
    "Iceland",
    "India",
    "Indonesia",
    "Côte d'Ivoire",
    "Iran (Islamic Republic of)",
    "Iraq",
    "Ireland",
    "Isle of Man",
    "Israel",
    "Italy",
    "Jamaica",
    "Japan",
    "Jersey",
    "Jordan",
    "Kazakhstan",
    "Kenya",
    "Kiribati",
    "Kuwait",
    "Kyrgyzstan",
    "Lao People's Democratic Republic",
    "Latvia",
    "Lebanon",
    "Lesotho",
    "Liberia",
    "Libya",
    "Liechtenstein",
    "Lithuania",
    "Luxembourg",
    "Macao",
    "Macedonia (the former Yugoslav Republic of)",
    "Madagascar",
    "Malawi",
    "Malaysia",
    "Maldives",
    "Mali",
    "Malta",
    "Marshall Islands",
    "Martinique",
    "Mauritania",
    "Mauritius",
    "Mayotte",
    "Mexico",
    "Micronesia (Federated States of)",
    "Moldova (Republic of)",
    "Monaco",
    "Mongolia",
    "Montenegro",
    "Montserrat",
    "Morocco",
    "Mozambique",
    "Myanmar",
    "Namibia",
    "Nauru",
    "Nepal",
    "Netherlands",
    "New Caledonia",
    "New Zealand",
    "Nicaragua",
    "Niger",
    "Nigeria",
    "Niue",
    "Norfolk Island",
    "Korea (Democratic People's Republic of)",
    "Northern Mariana Islands",
    "Norway",
    "Oman",
    "Pakistan",
    "Palau",
    "Palestine, State of",
    "Panama",
    "Papua New Guinea",
    "Paraguay",
    "Peru",
    "Philippines",
    "Pitcairn",
    "Poland",
    "Portugal",
    "Puerto Rico",
    "Qatar",
    "Republic of Kosovo",
    "Réunion",
    "Romania",
    "Russian Federation",
    "Rwanda",
    "Saint Barthélemy",
    "Saint Helena, Ascension and Tristan da Cunha",
    "Saint Kitts and Nevis",
    "Saint Lucia",
    "Saint Martin (French part)",
    "Saint Pierre and Miquelon",
    "Saint Vincent and the Grenadines",
    "Samoa",
    "San Marino",
    "Sao Tome and Principe",
    "Saudi Arabia",
    "Senegal",
    "Serbia",
    "Seychelles",
    "Sierra Leone",
    "Singapore",
    "Sint Maarten (Dutch part)",
    "Slovakia",
    "Slovenia",
    "Solomon Islands",
    "Somalia",
    "South Africa",
    "South Georgia and the South Sandwich Islands",
    "Korea (Republic of)",
    "South Sudan",
    "Spain",
    "Sri Lanka",
    "Sudan",
    "Suriname",
    "Svalbard and Jan Mayen",
    "Swaziland",
    "Sweden",
    "Switzerland",
    "Syrian Arab Republic",
    "Taiwan",
    "Tajikistan",
    "Tanzania, United Republic of",
    "Thailand",
    "Timor-Leste",
    "Togo",
    "Tokelau",
    "Tonga",
    "Trinidad and Tobago",
    "Tunisia",
    "Turkey",
    "Turkmenistan",
    "Turks and Caicos Islands",
    "Tuvalu",
    "Uganda",
    "Ukraine",
    "United Arab Emirates",
    "United Kingdom of Great Britain and Northern Ireland",
    "United States of America",
    "Uruguay",
    "Uzbekistan",
    "Vanuatu",
    "Venezuela (Bolivarian Republic of)",
    "Viet Nam",
    "Wallis and Futuna",
    "Western Sahara",
    "Yemen",
    "Zambia",
    "Zimbabwe",
]

In [17]:
matcher = PhraseMatcher(nlp.vocab)
# Create pattern Doc objects and add them to the matcher.
# nlp.pipe is the faster version of: [nlp(country) for country in COUNTRIES].
# It returns a one-shot generator, so materialize it: PhraseMatcher.add is
# documented to take a list of Docs, and `patterns` stays usable afterwards.
patterns = list(nlp.pipe(COUNTRIES))
matcher.add('COUNTRY', patterns)

In [18]:
@Language.component('countries_component')
def countries_component(doc):
    """Replace doc.ents with 'GPE' spans for every country name found by the matcher.

    Several country names contain another one (e.g. "South Sudan" / "Sudan",
    "Papua New Guinea" / "Guinea"), so the matcher can return overlapping spans.
    doc.ents does not accept overlapping spans, so only the longest span of each
    overlapping group is kept.
    """
    country_spans = [Span(doc, start, end, label='GPE')
                     for match_id, start, end in matcher(doc)]
    # Keep the longest span of each overlapping group
    doc.ents = spacy.util.filter_spans(country_spans)
    return doc

# Add the component to the end of the pipeline
nlp.add_pipe('countries_component')
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'countries_component']


In [19]:
# Lookup table: country name -> capital city.
# An empty string marks entries for which no capital is recorded here.
# The getter defined after this table looks keys up by the exact span text,
# so the spelling of each key has to match the matched country name exactly.
capitals = {
    "Afghanistan": "Kabul",
    "Albania": "Tirana",
    "Algeria": "Algiers",
    "American Samoa": "Pago Pago",
    "Andorra": "Andorra la Vella",
    "Angola": "Luanda",
    "Anguilla": "The Valley",
    "Antarctica": "",
    "Antigua and Barbuda": "Saint John's",
    "Argentina": "Buenos Aires",
    "Armenia": "Yerevan",
    "Aruba": "Oranjestad",
    "Australia": "Canberra",
    "Austria": "Vienna",
    "Azerbaijan": "Baku",
    "Bahamas": "Nassau",
    "Bahrain": "Manama",
    "Bangladesh": "Dhaka",
    "Barbados": "Bridgetown",
    "Belarus": "Minsk",
    "Belgium": "Brussels",
    "Belize": "Belmopan",
    "Benin": "Porto-Novo",
    "Bermuda": "Hamilton",
    "Bhutan": "Thimphu",
    "Bolivia (Plurinational State of)": "Sucre",
    "Bonaire, Sint Eustatius and Saba": "Kralendijk",
    "Bosnia and Herzegovina": "Sarajevo",
    "Botswana": "Gaborone",
    "Bouvet Island": "",
    "Brazil": "Brasília",
    "British Indian Ocean Territory": "Diego Garcia",
    "Brunei Darussalam": "Bandar Seri Begawan",
    "Bulgaria": "Sofia",
    "Burkina Faso": "Ouagadougou",
    "Burundi": "Bujumbura",
    "Cabo Verde": "Praia",
    "Cambodia": "Phnom Penh",
    "Cameroon": "Yaoundé",
    "Canada": "Ottawa",
    "Cayman Islands": "George Town",
    "Central African Republic": "Bangui",
    "Chad": "N'Djamena",
    "Chile": "Santiago",
    "China": "Beijing",
    "Christmas Island": "Flying Fish Cove",
    "Cocos (Keeling) Islands": "West Island",
    "Colombia": "Bogotá",
    "Comoros": "Moroni",
    "Congo": "Brazzaville",
    "Congo (Democratic Republic of the)": "Kinshasa",
    "Cook Islands": "Avarua",
    "Costa Rica": "San José",
    "Croatia": "Zagreb",
    "Cuba": "Havana",
    "Curaçao": "Willemstad",
    "Cyprus": "Nicosia",
    "Czech Republic": "Prague",
    "Côte d'Ivoire": "Yamoussoukro",
    "Denmark": "Copenhagen",
    "Djibouti": "Djibouti",
    "Dominica": "Roseau",
    "Dominican Republic": "Santo Domingo",
    "Ecuador": "Quito",
    "Egypt": "Cairo",
    "El Salvador": "San Salvador",
    "Equatorial Guinea": "Malabo",
    "Eritrea": "Asmara",
    "Estonia": "Tallinn",
    "Ethiopia": "Addis Ababa",
    "Falkland Islands (Malvinas)": "Stanley",
    "Faroe Islands": "Tórshavn",
    "Fiji": "Suva",
    "Finland": "Helsinki",
    "France": "Paris",
    "French Guiana": "Cayenne",
    "French Polynesia": "Papeetē",
    "French Southern Territories": "Port-aux-Français",
    "Gabon": "Libreville",
    "Gambia": "Banjul",
    "Georgia": "Tbilisi",
    "Germany": "Berlin",
    "Ghana": "Accra",
    "Gibraltar": "Gibraltar",
    "Greece": "Athens",
    "Greenland": "Nuuk",
    "Grenada": "St. George's",
    "Guadeloupe": "Basse-Terre",
    "Guam": "Hagåtña",
    "Guatemala": "Guatemala City",
    "Guernsey": "St. Peter Port",
    "Guinea": "Conakry",
    "Guinea-Bissau": "Bissau",
    "Guyana": "Georgetown",
    "Haiti": "Port-au-Prince",
    "Heard Island and McDonald Islands": "",
    "Holy See": "Rome",
    "Honduras": "Tegucigalpa",
    "Hong Kong": "City of Victoria",
    "Hungary": "Budapest",
    "Iceland": "Reykjavík",
    "India": "New Delhi",
    "Indonesia": "Jakarta",
    "Iran (Islamic Republic of)": "Tehran",
    "Iraq": "Baghdad",
    "Ireland": "Dublin",
    "Isle of Man": "Douglas",
    "Israel": "Jerusalem",
    "Italy": "Rome",
    "Jamaica": "Kingston",
    "Japan": "Tokyo",
    "Jersey": "Saint Helier",
    "Jordan": "Amman",
    "Kazakhstan": "Astana",
    "Kenya": "Nairobi",
    "Kiribati": "South Tarawa",
    "Korea (Democratic People's Republic of)": "Pyongyang",
    "Korea (Republic of)": "Seoul",
    "Kuwait": "Kuwait City",
    "Kyrgyzstan": "Bishkek",
    "Lao People's Democratic Republic": "Vientiane",
    "Latvia": "Riga",
    "Lebanon": "Beirut",
    "Lesotho": "Maseru",
    "Liberia": "Monrovia",
    "Libya": "Tripoli",
    "Liechtenstein": "Vaduz",
    "Lithuania": "Vilnius",
    "Luxembourg": "Luxembourg",
    "Macao": "",
    "Macedonia (the former Yugoslav Republic of)": "Skopje",
    "Madagascar": "Antananarivo",
    "Malawi": "Lilongwe",
    "Malaysia": "Kuala Lumpur",
    "Maldives": "Malé",
    "Mali": "Bamako",
    "Malta": "Valletta",
    "Marshall Islands": "Majuro",
    "Martinique": "Fort-de-France",
    "Mauritania": "Nouakchott",
    "Mauritius": "Port Louis",
    "Mayotte": "Mamoudzou",
    "Mexico": "Mexico City",
    "Micronesia (Federated States of)": "Palikir",
    "Moldova (Republic of)": "Chișinău",
    "Monaco": "Monaco",
    "Mongolia": "Ulan Bator",
    "Montenegro": "Podgorica",
    "Montserrat": "Plymouth",
    "Morocco": "Rabat",
    "Mozambique": "Maputo",
    "Myanmar": "Naypyidaw",
    "Namibia": "Windhoek",
    "Nauru": "Yaren",
    "Nepal": "Kathmandu",
    "Netherlands": "Amsterdam",
    "New Caledonia": "Nouméa",
    "New Zealand": "Wellington",
    "Nicaragua": "Managua",
    "Niger": "Niamey",
    "Nigeria": "Abuja",
    "Niue": "Alofi",
    "Norfolk Island": "Kingston",
    "Northern Mariana Islands": "Saipan",
    "Norway": "Oslo",
    "Oman": "Muscat",
    "Pakistan": "Islamabad",
    "Palau": "Ngerulmud",
    "Palestine, State of": "Ramallah",
    "Panama": "Panama City",
    "Papua New Guinea": "Port Moresby",
    "Paraguay": "Asunción",
    "Peru": "Lima",
    "Philippines": "Manila",
    "Pitcairn": "Adamstown",
    "Poland": "Warsaw",
    "Portugal": "Lisbon",
    "Puerto Rico": "San Juan",
    "Qatar": "Doha",
    "Republic of Kosovo": "Pristina",
    "Romania": "Bucharest",
    "Russian Federation": "Moscow",
    "Rwanda": "Kigali",
    "Réunion": "Saint-Denis",
    "Saint Barthélemy": "Gustavia",
    "Saint Helena, Ascension and Tristan da Cunha": "Jamestown",
    "Saint Kitts and Nevis": "Basseterre",
    "Saint Lucia": "Castries",
    "Saint Martin (French part)": "Marigot",
    "Saint Pierre and Miquelon": "Saint-Pierre",
    "Saint Vincent and the Grenadines": "Kingstown",
    "Samoa": "Apia",
    "San Marino": "City of San Marino",
    "Sao Tome and Principe": "São Tomé",
    "Saudi Arabia": "Riyadh",
    "Senegal": "Dakar",
    "Serbia": "Belgrade",
    "Seychelles": "Victoria",
    "Sierra Leone": "Freetown",
    "Singapore": "Singapore",
    "Sint Maarten (Dutch part)": "Philipsburg",
    "Slovakia": "Bratislava",
    "Slovenia": "Ljubljana",
    "Solomon Islands": "Honiara",
    "Somalia": "Mogadishu",
    "South Africa": "Pretoria",
    "South Georgia and the South Sandwich Islands": "King Edward Point",
    "South Sudan": "Juba",
    "Spain": "Madrid",
    "Sri Lanka": "Colombo",
    "Sudan": "Khartoum",
    "Suriname": "Paramaribo",
    "Svalbard and Jan Mayen": "Longyearbyen",
    "Swaziland": "Lobamba",
    "Sweden": "Stockholm",
    "Switzerland": "Bern",
    "Syrian Arab Republic": "Damascus",
    "Taiwan": "Taipei",
    "Tajikistan": "Dushanbe",
    "Tanzania, United Republic of": "Dodoma",
    "Thailand": "Bangkok",
    "Timor-Leste": "Dili",
    "Togo": "Lomé",
    "Tokelau": "Fakaofo",
    "Tonga": "Nuku'alofa",
    "Trinidad and Tobago": "Port of Spain",
    "Tunisia": "Tunis",
    "Turkey": "Ankara",
    "Turkmenistan": "Ashgabat",
    "Turks and Caicos Islands": "Cockburn Town",
    "Tuvalu": "Funafuti",
    "Uganda": "Kampala",
    "Ukraine": "Kiev",
    "United Arab Emirates": "Abu Dhabi",
    "United Kingdom of Great Britain and Northern Ireland": "London",
    "United States Minor Outlying Islands": "",
    "United States of America": "Washington, D.C.",
    "Uruguay": "Montevideo",
    "Uzbekistan": "Tashkent",
    "Vanuatu": "Port Vila",
    "Venezuela (Bolivarian Republic of)": "Caracas",
    "Viet Nam": "Hanoi",
    "Virgin Islands (British)": "Road Town",
    "Virgin Islands (U.S.)": "Charlotte Amalie",
    "Wallis and Futuna": "Mata-Utu",
    "Western Sahara": "El Aaiún",
    "Yemen": "Sana'a",
    "Zambia": "Lusaka",
    "Zimbabwe": "Harare",
    "Åland Islands": "Mariehamn",
}

# Getter that looks up the span text in the dictionary of country capitals
def get_capital(span):
    """Return the capital recorded for the span's text, or None if there is no entry.

    The lookup is an exact match of `span.text` against the keys of `capitals`.
    (A lambda with the same name used to precede this def; it was immediately
    shadowed, so only this definition is kept.)
    """
    return capitals.get(span.text)

In [20]:
# Register the Span extension attribute 'capital' with the getter get_capital.
# force=True replaces any earlier registration, so the cell can be re-run.
Span.set_extension('capital', getter=get_capital, force=True)

In [21]:
# Run the pipeline and show (text, label, capital) for each entity on the doc
doc = nlp("Czech Republic may help Slovakia protect its airspace")
entity_rows = [(ent.text, ent.label_, ent._.capital) for ent in doc.ents]
print(entity_rows)

[('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]


# Scaling and performance

## Processing large volumes of text
* Use nlp.pipe method
* Processes texts as a stream, yields Doc objects
* Much faster than calling nlp on each text

BAD:  
```docs = [nlp(text) for text in LOTS_OF_TEXTS]```   
GOOD:  
```docs = list(nlp.pipe(LOTS_OF_TEXTS))```

## Passing in context
* Setting as_tuples=True on nlp.pipe lets you pass in (text, context) tuples
* Yields (doc, context) tuples
* Useful for associating metadata with the doc

In [22]:
# (text, context) pairs: the context dict travels alongside its text
data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]
# With as_tuples=True, nlp.pipe yields (doc, context) pairs
for doc, context in nlp.pipe(data, as_tuples=True):
    doc_id = context['id']
    page = context["page_number"]
    print(doc.text, 'id', doc_id, 'at page', page)

This is a text id 1 at page 15
And another text id 2 at page 16


## Add the context metadata to custom attributes

In [23]:
# Register two Doc attribute extensions that will carry the per-text metadata
Doc.set_extension("id", default=None, force=True)
Doc.set_extension("page_number", default=None, force=True)

data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    # Copy the context values onto the doc's custom attributes
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]
    # Print from the attributes (not from `context`) to show they were actually set
    print(doc.text, 'id', doc._.id, 'at page', doc._.page_number)

This is a text id 1 at page 15
And another text id 2 at page 16


## Using only the tokenizer
Use `nlp.make_doc` to turn a text into a Doc object  
BAD:  
```doc = nlp("Hello world!")```  
GOOD:  
```doc = nlp.make_doc("Hello world!")```

## Disabling pipeline components
Use `nlp.select_pipes(disable=[...])` (the spaCy v3 replacement for the deprecated `nlp.disable_pipes`) to temporarily disable one or more pipeline components

In [29]:
# Load a fresh pipeline (without the custom components added to the previous nlp object)
nlp = spacy.load('en_core_web_lg')
# Show the (name, component object) pairs of the fresh pipeline
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7ff3d7e42b80>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7ff3d7e42ac0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7ff3d7e77d60>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7ff3d8789a40>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7ff3d8827500>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7ff3d7e77c80>)]

In [32]:
# Temporarily disable the tagger and parser; they are restored when the block exits.
# select_pipes(disable=...) replaces disable_pipes, which is deprecated in spaCy v3.
with nlp.select_pipes(disable=['tagger', 'parser']):
    # Process the text and print the entities
    doc = nlp('made in Brazil')
    print(doc.ents)

(Brazil,)


* restores them after the `with` block
* only runs the remaining components

## Processing streams 1

In [33]:
# Fresh pipeline for the stream-processing examples
nlp = spacy.load("en_core_web_lg")
# Sample short texts that the following cells process, one by one and as a stream
TEXTS = [
    "McDonalds is my favorite restaurant.",
    "Here I thought @McDonalds only had precooked burgers but it seems they only have not cooked ones?? I have no time to get sick..",
    "People really still eat McDonalds :(",
    "The McDonalds in Spain has chicken wings. My heart is so happy ",
    "@McDonalds Please bring back the most delicious fast food sandwich of all times!!....The Arch Deluxe :P",
    "please hurry and open. I WANT A #McRib SANDWICH SO BAD! :D",
    "This morning i made a terrible decision by gettin mcdonalds and now my stomach is payin for it",
]

In [35]:
# BAD: calls nlp separately on every text, then prints that text's adjectives
for text in TEXTS:
    adjectives = [token.text for token in nlp(text) if token.pos_ == 'ADJ']
    print(adjectives)

['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
['open', 'BAD']
['terrible', 'payin']


In [36]:
# GOOD: streams all texts through nlp.pipe, then prints each doc's adjectives
for doc in nlp.pipe(TEXTS):
    adjectives = [token.text for token in doc if token.pos_ == 'ADJ']
    print(adjectives)

['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
['open', 'BAD']
['terrible', 'payin']


## Processing streams 2

In [38]:
# Stream the texts through the pipeline and collect each doc's entity tuple
docs = [doc for doc in nlp.pipe(TEXTS)]
entities = []
for doc in docs:
    entities.append(doc.ents)
# Unpack so the per-doc tuples are printed side by side
print(*entities)

(McDonalds,) (@McDonalds,) (McDonalds,) (McDonalds, Spain) (@McDonalds,) () (This morning, mcdonalds)


In [40]:
people = ['David Bowie', 'Angela Merkel', 'Lady Gaga']
# Turn the names into Doc objects that can serve as PhraseMatcher patterns
patterns = [doc for doc in nlp.pipe(people)]
patterns

[David Bowie, Angela Merkel, Lady Gaga]

## Processing data with context

In [42]:
# (text, context) pairs: each quote is paired with a dict holding its "author" and "book".
# Consumed below via nlp.pipe(DATA, as_tuples=True).
DATA = [
    (
        "One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.",
        {"author": "Franz Kafka", "book": "Metamorphosis"},
    ),
    (
        "I know not all that may be coming, but be it what it will, I'll go to it laughing.",
        {"author": "Herman Melville", "book": "Moby-Dick or, The Whale"},
    ),
    (
        "It was the best of times, it was the worst of times.",
        {"author": "Charles Dickens", "book": "A Tale of Two Cities"},
    ),
    (
        "The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars.",
        {"author": "Jack Kerouac", "book": "On the Road"},
    ),
    (
        "It was a bright cold day in April, and the clocks were striking thirteen.",
        {"author": "George Orwell", "book": "1984"},
    ),
    (
        "Nowadays people know the price of everything and the value of nothing.",
        {"author": "Oscar Wilde", "book": "The Picture Of Dorian Gray"},
    ),
]

In [43]:
# Register the Doc extensions 'author' and 'book', both defaulting to None
for extension_name in ('author', 'book'):
    Doc.set_extension(extension_name, default=None, force=True)

In [45]:
# Import the Doc class and register the extensions 'author' and 'book'
from spacy.tokens import Doc
Doc.set_extension('book', default=None, force=True)
Doc.set_extension('author', default=None, force=True)

for doc, context in nlp.pipe(DATA, as_tuples=True):
    # Copy the context values onto the doc's custom attributes
    doc._.book, doc._.author = context['book'], context['author']

    # Show the text followed by its attribution, read back from the attributes
    attribution = "— '{}' by {}".format(doc._.book, doc._.author)
    print(doc.text, '\n', attribution, '\n')

One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. 
 — 'Metamorphosis' by Franz Kafka 

I know not all that may be coming, but be it what it will, I'll go to it laughing. 
 — 'Moby-Dick or, The Whale' by Herman Melville 

It was the best of times, it was the worst of times. 
 — 'A Tale of Two Cities' by Charles Dickens 

The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars. 
 — 'On the Road' by Jack Kerouac 

It was a bright cold day in April, and the clocks were striking thirteen. 
 — '1984' by George Orwell 

Nowadays people know the price of everything and the value of nothing. 
 — 'The Picture Of Dorian Gray' by Oscar Wilde 



## Selective processing

In [46]:
text = "Chick-fil-A is an American fast food restaurant chain headquartered in the city of College Park, Georgia, specializing in chicken sandwiches."

# make_doc only tokenizes the text
doc = nlp.make_doc(text)

token_texts = [token.text for token in doc]
print(token_texts)

['Chick', '-', 'fil', '-', 'A', 'is', 'an', 'American', 'fast', 'food', 'restaurant', 'chain', 'headquartered', 'in', 'the', 'city', 'of', 'College', 'Park', ',', 'Georgia', ',', 'specializing', 'in', 'chicken', 'sandwiches', '.']


In [47]:
text = "Chick-fil-A is an American fast food restaurant chain headquartered in the city of College Park, Georgia, specializing in chicken sandwiches."

# Temporarily disable the tagger and parser; they are restored when the block exits.
# select_pipes(disable=...) replaces disable_pipes, which is deprecated in spaCy v3.
with nlp.select_pipes(disable=['tagger', 'parser']):
    # Process the text
    doc = nlp(text)
    # Print the entities in the doc
    print(doc.ents)

(Chick-fil-A, American, College Park, Georgia)


In [None]:
# Selective processing