In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')

In [3]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [4]:
print(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7f79c7be9ee0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7f79c7be9dc0>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7f79c7bf52e0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7f79c7a05d00>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7f79c7995400>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7f79c7bf5270>)]


In [5]:
# Creating our own custom component in the nlp pipeline
from spacy.language import Language

@Language.component('length_of_doc')
def length_comp(doc):
    """Print how many tokens ``doc`` holds and hand the doc back unchanged."""
    print(f'This component contains {len(doc)} tokens.')
    return doc

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('length_of_doc', first=True)
print(nlp.pipe_names)

doc = nlp('This is a sentence.')

['length_of_doc', 'tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
This component contains 5 tokens.


In [6]:
from spacy.language import Language
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load('en_core_web_sm')
doc = nlp("I have a cat and a Golden Retriever")
print(doc.ents)  # entities from the stock pipeline, before 'extract_animal' is added
animals = ["Golden Retriever", "cat", "Turtle", "Rattus norvegicus"]
# The PhraseMatcher below matches on token text (its default attribute), so the
# patterns only need tokenizing. make_doc skips the tagger/parser/ner that
# nlp.pipe would run on every pattern for no benefit.
animal_patterns = [nlp.make_doc(animal) for animal in animals]
matcher = PhraseMatcher(nlp.vocab)
matcher.add('ANIMALS', animal_patterns)

@Language.component('extract_animal')
def animal_component(doc):
    """Replace ``doc.ents`` with the phrases found by ``matcher``, labelled 'ANIMAL'.

    Entities already on the doc are overwritten, as before. The doc is
    returned so the next pipeline component can continue with it.
    """
    candidates = [
        Span(doc, start, end, label='ANIMAL')
        for _match_id, start, end in matcher(doc)
    ]
    # doc.ents rejects overlapping spans with a ValueError (e.g. if one pattern
    # is contained in a longer one). filter_spans keeps the longest span of any
    # overlapping group and leaves non-overlapping matches untouched.
    doc.ents = spacy.util.filter_spans(candidates)
    return doc

()


In [7]:
nlp.add_pipe('extract_animal', after='ner')
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'extract_animal']


In [8]:
doc = nlp("I have a cat and a Golden Retriever")
doc.ents

(cat, Golden Retriever)

In [9]:
print([(ent.text, ent.label_) for ent in doc.ents])

[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


In [10]:
print([(ent.text, ent.label) for ent in doc.ents])

[('cat', 6303828839600189595), ('Golden Retriever', 6303828839600189595)]


In [11]:
# Setting extension attributes
from spacy.tokens import Token, Span, Doc

nlp = spacy.blank('en')
# force=True keeps this cell re-runnable: without it, registering an extension
# name that already exists raises a ValueError on the second execution.
Token.set_extension("is_country", default=False, force=True)

In [12]:
doc = nlp('I live in Spain.')
doc[3]._.is_country = True

print([(token.text, token._.is_country) for token in doc])

[('I', False), ('live', False), ('in', False), ('Spain', True), ('.', False)]


In [13]:
# Setting property extension
def get_reversed_token(token):
    """Return the token's text spelled backwards."""
    return ''.join(reversed(token.text))

In [14]:
Token.set_extension('reversed', getter=get_reversed_token, force=True)
doc = nlp("All generalizations are false, including this one.")
print([(token.text, token._.reversed) for token in doc])

[('All', 'llA'), ('generalizations', 'snoitazilareneg'), ('are', 'era'), ('false', 'eslaf'), (',', ','), ('including', 'gnidulcni'), ('this', 'siht'), ('one', 'eno'), ('.', '.')]


In [15]:
def get_has_number(doc):
    """Tell whether at least one token in ``doc`` has a truthy ``like_num``."""
    for tok in doc:
        if tok.like_num:
            return True
    return False

In [16]:
# force=True lets this cell be executed more than once; registering an
# extension name that already exists raises a ValueError otherwise.
Doc.set_extension('has_num', getter=get_has_number, force=True)
doc = nlp("The museum closed for five years in 2012.")
print(doc._.has_num)

True


In [17]:
def to_html(span, tag):
    """Wrap the span's text in an opening and closing ``tag`` element."""
    return "<{0}>{1}</{0}>".format(tag, span.text)

Span.set_extension('to_html', method=to_html,force=True)
doc = nlp("Hello world, this is a sentence.")
span = doc[0:2]
print(span._.to_html('strong'))

<strong>Hello world</strong>


In [18]:
nlp = spacy.load('en_core_web_sm')

def get_wikipedia_url(span):
    """Build a Wikipedia search URL for an entity span.

    Returns the URL as a string when ``span.label_`` is a person,
    organisation or place label, and ``None`` for every other label.
    """
    from urllib.parse import quote

    # 'LOC' is the label the English models use for non-GPE locations.
    # 'LOCATION' stays in the tuple for pipelines that use that spelling.
    if span.label_ not in ('PERSON', 'ORG', 'GPE', 'LOC', 'LOCATION'):
        return None
    entity_text = span.text.replace(' ', '_')
    # Percent-encode so characters such as '&' or '#' in the entity text
    # cannot break the query string.
    return "https://en.wikipedia.org/w/index.php?search=" + quote(entity_text)

In [19]:
Span.set_extension('get_url', getter=get_wikipedia_url,force=True)
doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)

for ent in doc.ents:
    print(ent.text, ent._.get_url)

over fifty years None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


In [20]:
import json

# Read the two JSON resources that the country-matching cell relies on.
with open('./countries.json', encoding='utf8') as file:
    COUNTRIES = json.load(file)
with open('./capitals.json', encoding='utf8') as file:
    CAPITALS = json.load(file)

In [21]:
nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)
# Patterns only need to be tokenized for text-based phrase matching, so use
# make_doc instead of running the whole statistical pipeline on every name.
matcher.add('COUNTRY', [nlp.make_doc(country) for country in COUNTRIES])

@Language.component('countries_comp')
def countries_func(doc):
    """Replace ``doc.ents`` with the country names found by ``matcher``, labelled 'GPE'.

    Entities set by earlier components are overwritten, as before. The doc
    is returned so the pipeline can continue with it.
    """
    candidates = [
        Span(doc, start, end, label='GPE')
        for _match_id, start, end in matcher(doc)
    ]
    # Overlapping matches (e.g. if the country list holds both a name and a
    # longer name containing it) would make the doc.ents assignment raise a
    # ValueError. filter_spans keeps the longest span of each overlapping group.
    doc.ents = spacy.util.filter_spans(candidates)
    return doc

nlp.add_pipe('countries_comp', after='ner')
print(nlp.pipe_names)

# Getter: resolve the span's text through the CAPITALS lookup
def get_capital(span):
    """Return the CAPITALS entry for ``span.text``, or ``None`` when there is none."""
    country_name = span.text
    return CAPITALS.get(country_name)

# Register the Span extension attribute "capital" with the getter get_capital
Span.set_extension("capital", getter=get_capital, force=True)

# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'countries_comp']
[('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]


In [22]:
import json
nlp = spacy.load('en_core_web_sm')
with open('./tweets.json',encoding='utf8') as file:
    TEXTS = json.loads(file.read())
    
docs = list(nlp.pipe(TEXTS))
docs

[McDonalds is my favorite restaurant.,
 Here I thought @McDonalds only had precooked burgers but it seems they only have not cooked ones?? I have no time to get sick..,
 People really still eat McDonalds :(,
 The McDonalds in Spain has chicken wings. My heart is so happy ,
 @McDonalds Please bring back the most delicious fast food sandwich of all times!!....The Arch Deluxe :P,
 please hurry and open. I WANT A #McRib SANDWICH SO BAD! :D,
 This morning i made a terrible decision by gettin mcdonalds and now my stomach is payin for it]

In [23]:
for doc in docs:
    print([token.text for token in doc if token.pos_ == 'ADJ'])

['favorite']
['sick']
[]
['happy']
['delicious', 'fast']
['open']
['terrible']


In [24]:
for doc in docs:
    print([ent.text for ent in doc.ents])

[]
[]
['McDonalds']
['McDonalds', 'Spain']
['times!!']
['McRib SANDWICH SO']
['This morning']


In [25]:
people = ['David Bowie', 'Angela Merkel', 'Lady Gaga']
patterns = list(nlp.pipe(people))
patterns

[David Bowie, Angela Merkel, Lady Gaga]

In [26]:
# Read the file as UTF-8 explicitly, like the other JSON loads in this
# notebook. Without it, open() falls back to the platform's default encoding.
with open('./bookquotes.json', encoding='utf8') as file:
    QUOTES = json.load(file)

nlp = spacy.blank('en')
# Custom Doc attributes that will carry each quote's metadata.
Doc.set_extension('Author',default=None,force=True)
Doc.set_extension('Book',default=None,force=True)

# as_tuples=True makes nlp.pipe yield (doc, context) pairs, so each context
# dict stays attached to the Doc created from its text.
for doc, context in nlp.pipe(QUOTES,as_tuples=True):
    doc._.Author = context['author']
    doc._.Book = context['book']
    print(f'{doc.text}\n -{doc._.Book} by {doc._.Author}\n')

One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.
 -Metamorphosis by Franz Kafka

I know not all that may be coming, but be it what it will, I'll go to it laughing.
 -Moby-Dick or, The Whale by Herman Melville

It was the best of times, it was the worst of times.
 -A Tale of Two Cities by Charles Dickens

The only people for me are the mad ones, the ones who are mad to live, mad to talk, mad to be saved, desirous of everything at the same time, the ones who never yawn or say a commonplace thing, but burn, burn, burn like fabulous yellow roman candles exploding like spiders across the stars.
 -On the Road by Jack Kerouac

It was a bright cold day in April, and the clocks were striking thirteen.
 -1984 by George Orwell

Nowadays people know the price of everything and the value of nothing.
 -The Picture Of Dorian Gray by Oscar Wilde



In [27]:
nlp = spacy.load('en_core_web_sm')
text = (
    "Chick-fil-A is an American fast food restaurant chain headquartered in "
    "the city of College Park, Georgia, specializing in chicken sandwiches."
)


# make_doc only runs the tokenizer: it turns the text into a Doc without
# calling any of the other pipeline components.
doc = nlp.make_doc(text)

print([token for token in doc])

[Chick, -, fil, -, A, is, an, American, fast, food, restaurant, chain, headquartered, in, the, city, of, College, Park, ,, Georgia, ,, specializing, in, chicken, sandwiches, .]


In [28]:
# Run the pipeline with 'tagger' and 'lemmatizer' disabled for the duration of
# the with-block; the entities are still printed because 'ner' stays enabled.
with nlp.select_pipes(disable=['tagger', 'lemmatizer']):    
    doc = nlp(text)
    print(doc.ents)

(Chick, American, College Park, Georgia)


In [29]:
from spacy.matcher import Matcher

TEXTS = ['How to preorder the iPhone X', 'iPhone X is coming', 'Should I pay $1,000 for the iPhone X?', 'The iPhone 8 reviews are here', "iPhone 11 vs iPhone 8: What's the difference?", 'I need a new phone! Any tips?']

nlp = spacy.blank('en')

# Two token patterns for the GADGET label: "iphone x" and "iphone <digits>",
# both matched case-insensitively on the first token.
pattern1 = [{'LOWER':'iphone'}, {'LOWER':'x'}]
pattern2 = [{'LOWER':'iphone'}, {'IS_DIGIT':True}]
matcher = Matcher(nlp.vocab)
matcher.add('GADGET', patterns=[pattern1, pattern2])
docs = list(nlp.pipe(TEXTS))

# Annotate the Doc objects stored in `docs` themselves. Iterating over a fresh
# nlp.pipe(TEXTS) would label throwaway copies and leave `docs` (the list that
# is serialized to train.spacy afterwards) without any entities.
for doc in docs:
    matches = matcher(doc)
    doc.ents = [Span(doc, start, end, label='GADGET') for match_id, start, end in matches]
    print(doc.ents)

(iPhone X,)
(iPhone X,)
(iPhone X,)
(iPhone 8,)
(iPhone 11, iPhone 8)
()


In [30]:
from spacy.tokens import DocBin

doc_bin = DocBin(docs=docs)
doc_bin.to_disk('./train.spacy')

In [31]:
!python3 -m spacy init config ./config.cfg --lang en --pipeline ner


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [32]:
!cat ./config.cfg

[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = null
seed = 0

[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
upstream = "*"

[components.tok2vec]
factory = "tok2vec"

[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"

[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${compon

In [33]:
!python -m spacy train ./config_gadget.cfg --output ./output --paths.train train_gadget.spacy --paths.dev dev_gadget.spacy

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-07-11 11:14:04,182] [INFO] Set up nlp object from config
[2022-07-11 11:14:04,192] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-07-11 11:14:04,196] [INFO] Created vocabulary
[2022-07-11 11:14:04,197] [INFO] Finished initializing nlp object
[2022-07-11 11:14:04,979] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     20.33    1.69    1.04    4.44    0.02
  1     200         29.35    993.16   76.92   76.09   77.78    0.77
  2     400         72.34    248.20   83.70   81.91   85.56    0.84
  4     600         61.40    122.91   82.87   82.42   83.33    0.83
  6     80

In [34]:
nlp = spacy.load('./output/model-best/')

In [35]:
testing_texts = ["Apple is slowing down the iPhone 8 and iPhone X - how to stop it", "I finally understand what the iPhone X ‘notch’ is for","Everything you need to know about the Samsung Galaxy S9","Looking to compare iPad models? Here’s how the 2018 lineup stacks up",
"The iPhone 8 and iPhone 8 Plus are smartphones designed, developed, and marketed by Apple",
"what is the cheapest ipad, especially ipad pro???",
"Samsung Galaxy is a series of mobile computing devices designed, manufactured and marketed by Samsung Electronics"]

In [36]:
# nlp.pipe returns a one-shot generator; materialize it so the docs can be
# iterated more than once (re-running the display cell would otherwise print
# nothing because the generator is already exhausted).
docs = list(nlp.pipe(testing_texts))

In [37]:
for doc in docs:
    print(doc.ents)

(iPhone 8, iPhone X)
(iPhone X, ‘notch)
(Samsung Galaxy S9,)
(iPad,)
(iPhone 8, iPhone 8 Plus)
(ipad, ipad pro)
(Samsung Galaxy, Samsung Electronics)


In [43]:
# Hand-labelled GPE examples on a blank English pipeline (tokenizer only).
# Each Span(doc, start, end, ...) is given token indices; the end index is
# exclusive, so (3, 4) covers the single token at position 3.
nlp = spacy.blank('en')

doc1 = nlp("i went to amsterdem last year and the canals were beautiful")
doc1.ents = [Span(doc1, 3, 4, label="GPE")]  # the misspelled "amsterdem"

doc2 = nlp("You should visit Paris once, but the Eiffel Tower is kinda boring")
doc2.ents = [Span(doc2, 3, 4, label="GPE")]  # "Paris"

doc3 = nlp("There's also a Paris in Arkansas, lol")
doc3.ents = [Span(doc3, 4, 5, label='GPE'), Span(doc3, 6, 7, label = 'GPE')]  # "Paris", "Arkansas"

doc4 = nlp("Berlin is perfect for summer holiday: great nightlife and cheap beer!")
doc4.ents = [Span(doc4, 0, 1, label="GPE")]  # "Berlin"

In [44]:
docs = [doc1, doc2, doc3, doc4]
for doc in docs:
    print(doc.ents)

(amsterdem,)
(Paris,)
(Paris, Arkansas)
(Berlin,)


In [48]:
nlp = spacy.blank('en')

doc1 = nlp("Reddit partners with Patreon to help creators build communities")
doc1.ents = (Span(doc1, 0, 1, label='WEBSITE'), Span(doc1, 3, 4, label='WEBSITE'))
doc1.ents

(Reddit, Patreon)

In [49]:
doc2 = nlp("PewDiePie smashes YouTube record")
doc2.ents = [Span(doc2, 0, 1, label='PERSON'), Span(doc2, 2, 3, label="WEBSITE")]
for ent in doc2.ents:
    print(ent.text, ent.label_)

PewDiePie PERSON
YouTube WEBSITE


In [50]:
doc3 = nlp("Reddit founder Alexis Ohanian gave away two Metallica tickets to fans")
# The spans are listed out of document order here; iterating doc.ents below
# yields them in document order ("Reddit" before "Alexis Ohanian").
doc3.ents = [Span(doc3, 2, 4, label='PERSON'), Span(doc3, 0, 1, label="WEBSITE")]
for ent in doc3.ents:
    print(ent.text, ent.label_)

Reddit WEBSITE
Alexis Ohanian PERSON
