In [2]:
import spacy

In [3]:
# Start from an empty English pipeline and run the sample text through it,
# then show the resulting tokens one per line.
nlp = spacy.blank("en")
doc = nlp("Captain America ate 100$ of samosas. Then he said I can do it all day.")

print(*doc, sep="\n")

Captain
America
ate
100
$
of
samosas
.
Then
he
said
I
can
do
it
all
day
.


In [4]:
nlp.pipe_names  # a blank pipeline has no components registered, hence the empty list

[]

In [5]:
import sys
!{sys.executable} -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ----- ---------------------------------- 1.8/12.8 MB 8.5 MB/s eta 0:00:02
     ------------ --------------------------- 3.9/12.8 MB 9.1 MB/s eta 0:00:01
     ------------------------ --------------- 7.9/12.8 MB 12.7 MB/s eta 0:00:01
     ------------------------------------- - 12.3/12.8 MB 14.5 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 14.5 MB/s  0:00:01
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Replace the blank pipeline with the downloaded en_core_web_sm package.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Names of the components in the loaded pipeline, in pipeline order.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [8]:
# Same components as above, shown as (name, component object) pairs.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x1bcbc8d07d0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x1bcbc8d0a70>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x1bcbc763df0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x1bc9bdd8fd0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x1bcbc900dd0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1bcbc5c91c0>)]

In [9]:
# Re-run the sample text through the loaded pipeline and list, for every
# token, its text, coarse part-of-speech tag and lemma.
doc = nlp("Captain America ate 100$ of samosas. Then he said I can do it all day.")

for tok in doc:
    print(tok, tok.pos_, tok.lemma_, sep="  |  ")

Captain  |  PROPN  |  Captain
America  |  PROPN  |  America
ate  |  VERB  |  eat
100  |  NUM  |  100
$  |  NUM  |  $
of  |  ADP  |  of
samosas  |  NOUN  |  samosa
.  |  PUNCT  |  .
Then  |  ADV  |  then
he  |  PRON  |  he
said  |  VERB  |  say
I  |  PRON  |  I
can  |  AUX  |  can
do  |  VERB  |  do
it  |  PRON  |  it
all  |  DET  |  all
day  |  NOUN  |  day
.  |  PUNCT  |  .


In [10]:
# Same listing as before, but on a blank pipeline: with no components
# registered, the POS and lemma columns come out empty.
nlp = spacy.blank("en")
doc = nlp("Captain America ate 100$ of samosas. Then he said I can do it all day.")

for tok in doc:
    print(tok, tok.pos_, tok.lemma_, sep="  |  ")

Captain  |    |  
America  |    |  
ate  |    |  
100  |    |  
$  |    |  
of  |    |  
samosas  |    |  
.  |    |  
Then  |    |  
he  |    |  
said  |    |  
I  |    |  
can  |    |  
do  |    |  
it  |    |  
all  |    |  
day  |    |  
.  |    |  


In [11]:
# Reload the trained pipeline (the previous cell replaced `nlp` with a blank
# one) and list every named entity with its label and the label's meaning.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Tesla Inc is going to acquire twitter for $45 Billion.")

for entity in doc.ents:
    print(entity, entity.label_, spacy.explain(entity.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 Billion  |  MONEY  |  Monetary values, including unit


In [12]:
from spacy import displacy

# Visualise the entities of the current `doc` with displaCy's "ent" style.
displacy.render(doc, style="ent")

In [13]:
# A sentence mixing several entity types: list each entity with its label
# and the label's explanation, then render the highlighted sentence.
doc = nlp("Apple hired Tim Cook in 2011 for $1 million in California.")

for entity in doc.ents:
    print(entity, entity.label_, spacy.explain(entity.label_), sep="  |  ")

displacy.render(doc, style="ent")

Apple  |  ORG  |  Companies, agencies, institutions, etc.
Tim Cook  |  PERSON  |  People, including fictional
2011  |  DATE  |  Absolute or relative dates or periods
$1 million  |  MONEY  |  Monetary values, including unit
California  |  GPE  |  Countries, cities, states


In [14]:
# The same word appears as a person's surname and as a company name, which
# makes this sentence a useful probe of how the NER component labels each
# mention. The surname was previously misspelled "Bloomber"; it is corrected
# here so the example exercises the intended input.
doc = nlp("Michael Bloomberg founded a data company called Bloomberg")
displacy.render(doc, style="ent")

In [15]:
# Build a pipeline containing only selected components: load the full
# en_core_web_sm pipeline as a donor, then copy components into a blank one.
source_nlp = spacy.load("en_core_web_sm")

nlp = spacy.blank("en")
nlp.add_pipe("ner", source=source_nlp)  # copy the "ner" component from the donor pipeline
nlp.pipe_names

['ner']

In [16]:
# Check that the ner-only pipeline still recognises entities: list each one
# with its label and the label's explanation.
doc = nlp("Tesla Inc is going to acquire twitter for $45 Billion.")

for entity in doc.ents:
    print(entity, entity.label_, spacy.explain(entity.label_), sep="  |  ")

Tesla Inc  |  ORG  |  Companies, agencies, institutions, etc.
$45 Billion  |  MONEY  |  Monetary values, including unit
