In [None]:
# Processing Pipelines

# Internal Workings of doc = nlp('hello strings')
# Built in pipeline components

# token.tag, token.pos
# token.dep, token.head, Doc.sents, Doc.noun_chunks
# doc.ents, doc.ent_iob, doc.ent_type
# doc.cats -> text classifier

In [3]:
# from spacy.lang.en import English
import spacy
nlp = spacy.load('en_core_web_lg')

nlp.pipe_names

['tagger', 'parser', 'ner']

In [4]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fb6095782d0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fb6083ecad0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fb6083eca60>)]

In [5]:
#  Adding custom components

# def custom_component:
    #do something here     
#     return doc

# nlp.add_pipe(custom_component)

# add_pipe optional parameters first=True, last=True(default) , before='ner',after = 'ner'

In [6]:
import spacy

# Define the custom component
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print(f"This document is {doc_length} tokens long.")
    # Return the doc
    return doc


# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names
nlp.add_pipe(length_component, first=True)
print(nlp.pipe_names)

# Process a text
doc = nlp("This is a sentence.")

['length_component', 'tagger', 'parser', 'ner']
This document is 5 tokens long.


In [8]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", None, *animal_patterns)

# Define the custom component
def animal_component(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL") for match_id, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc

# Add the component to the pipeline after the "ner" component
nlp.add_pipe(animal_component, after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label for the doc.ents
doc = nlp("I have a cat and a Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])

animal_patterns: [Golden Retriever, cat, turtle, Rattus norvegicus]
['tagger', 'parser', 'ner', 'animal_component']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


In [9]:
# Custom attributes and Property extensions

from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()

# Register the Token extension attribute "is_country" with the default value False
Token.set_extension("is_country", default=False)

# Process the text and set the is_country attribute to True for the token "Spain"
doc = nlp("I live in Spain.")
doc[3]._.is_country = True

# Print the token text and the is_country attribute for all tokens
print([(token.text, token._.is_country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [11]:
from spacy.lang.en import English
from spacy.tokens import Token

nlp = English()
# Define the getter function that takes a token and returns its reversed text
def get_reversed(token):
    return token.text[::-1]
# Register the Token property extension "reversed" with the getter get_reversed
Token.set_extension("reversed", getter=get_reversed)

# Process the text and print the reversed attribute for each token
doc = nlp("All generalizations are false, including this one.")
for token in doc:
    print("reversed:", token._.reversed)

reversed: llA
reversed: snoitazilareneg
reversed: era
reversed: eslaf
reversed: ,
reversed: gnidulcni
reversed: siht
reversed: eno
reversed: .


In [12]:
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

# Define the getter function
def get_has_number(doc):
    # Return if any of the tokens in the doc return True for token.like_num
    return any(token.like_num for token in doc)


# Register the Doc property extension "has_number" with the getter get_has_number
Doc.set_extension("has_number", getter=get_has_number)

# Process the text and check the custom has_number attribute
doc = nlp("The museum closed for five years in 2012.")
print("has_number:", doc._.has_number)

has_number: True


In [None]:
# Scaling and Performance

# docs = list(nlp.pipe(LOTS_OF_TE))
# only tokenizer
# nlp.make_doc



In [None]:
# reading json data
# import json
# with open("exercises/en/tweets.json") as f:
#     TEXTS = json.loads(f.read())

In [13]:
import spacy

nlp = spacy.load("en_core_web_sm")
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)
# Disable the tagger and parser
with nlp.disable_pipes("tagger", "parser"):
    # Process the text
    doc = nlp(text)
    # Print the entities in the doc
    print(doc.ents)

SyntaxError: invalid character in identifier (<ipython-input-13-6767b5109d0c>, line 2)