<h1 style="text-align:center; color:mediumvioletred">Spacy Pipeline</h1>

In [1]:
import spacy

In [2]:
# spacy.blank("en") builds an English pipeline with no components yet
# (nlp.pipe_names is empty), so calling it on text only tokenizes.
nlp = spacy.blank("en")

doc = nlp("Captain America ate 100$ of samosa. Then he said I can do this all day.")

# Print one token per line; note that "100$" is split into "100" and "$".
print(*doc, sep="\n")

Captain
America
ate
100
$
of
samosa
.
Then
he
said
I
can
do
this
all
day
.


In [3]:
# The blank pipeline has no components, so this displays an empty list.
nlp.pipe_names

[]

In [4]:
# Rebind `nlp` to the packaged "en_core_web_md" pipeline, replacing the blank one.
nlp = spacy.load("en_core_web_md")

In [5]:
# Display the loaded pipeline as (name, component object) pairs, in pipeline order.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x2462d0aada0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x2462e79c940>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2462fc62490>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x2462fdefec0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x2462fe4df80>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2462fc62650>)]

In [6]:
# Same pipeline as above, but only the component names.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [7]:
doc = nlp("Captain America ate 100$ of samosa. Then he said I can do this all day.")

# With the loaded pipeline, each token now carries a coarse part-of-speech
# tag (pos_) and a lemma (lemma_); show them side by side.
for tok in doc:
    print(tok, tok.pos_, tok.lemma_, sep=" | ")

Captain | PROPN | Captain
America | PROPN | America
ate | VERB | eat
100 | NUM | 100
$ | NOUN | $
of | ADP | of
samosa | NOUN | samosa
. | PUNCT | .
Then | ADV | then
he | PRON | he
said | VERB | say
I | PRON | I
can | AUX | can
do | VERB | do
this | PRON | this
all | DET | all
day | NOUN | day
. | PUNCT | .


In [8]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# doc.ents holds the named entities found in the text; spacy.explain turns
# a label string such as "MONEY" into a short human-readable description.
for entity in doc.ents:
    print(entity.text, entity.label_, spacy.explain(entity.label_), sep=" | ")

Tesla Inc | GPE | Countries, cities, states
$45 billion | MONEY | Monetary values, including unit


In [9]:
from spacy import displacy

# Visualize the entities of `doc` (built in the previous cell) using
# displaCy's entity style.
displacy.render(doc, style="ent")

### Adding a pipeline component manually

In [10]:
# A blank pipeline has no `ner` component, so doc.ents is empty and this
# loop prints nothing.
nlp = spacy.blank("en")

doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

for ent in doc.ents:
    # Use the string label (label_), not the integer ID (label):
    # spacy.explain looks descriptions up by their string name.
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

In [11]:
# Load the full packaged pipeline; here it only serves as the donor of a component.
source_nlp = spacy.load("en_core_web_md")

# Start again from an empty pipeline and give its vocab the donor's word vectors.
# NOTE(review): presumably the sourced `ner` relies on these vectors — confirm.
nlp = spacy.blank("en")
nlp.vocab.vectors = source_nlp.vocab.vectors

# Take the existing `ner` component from source_nlp (source=...) and add it
# to the blank pipeline, then list the resulting component names.
nlp.add_pipe("ner", source=source_nlp)
nlp.pipe_names

['ner']

In [12]:
doc = nlp("Tesla Inc is going to acquire twitter for $45 billion")

# Now that `ner` has been added to the pipeline, doc.ents is populated;
# print each entity's text, label, and the label's description.
for entity in doc.ents:
    print(entity.text, entity.label_, spacy.explain(entity.label_), sep=" | ")

Tesla Inc | GPE | Countries, cities, states
$45 billion | MONEY | Monetary values, including unit


# Exercise

### Question 1:
- Collect all the proper nouns from the given text into a list, and also count how many there are.
- **Proper Noun** means a noun that names a particular person, place, or thing.

In [34]:
# Input passage for Question 1; the expected output lists the person and
# city names it contains.
text = '''Ravi and Raju are the best friends from school days.They wanted to go for a world tour and 
visit famous cities like Paris, London, Dubai, Rome etc and also they called their another friend Mohan to take part of this world tour.
They started their journey from Hyderabad and spent next 3 months travelling all the wonderful cities in the world and cherish a happy moments!
'''

#### Expected Output:
    Proper Nouns: [Ravi, Raju, Paris, London, Dubai, Rome, Mohan, Hyderabad]
    Count: 8

In [35]:
nlp = spacy.load("en_core_web_md")
doc = nlp(text)

# Keep the text of every token whose coarse POS tag is PROPN (proper noun).
proper_nouns = [tok.text for tok in doc if tok.pos_ == 'PROPN']

print(f"Proper Nouns: {proper_nouns}\nCount: {len(proper_nouns)}")

Proper Nouns: ['Ravi', 'Raju', 'Paris', 'London', 'Dubai', 'Rome', 'Mohan', 'Hyderabad']
Count: 8


### Question 2:
- Get all the company names from the given text, along with their count.
- **Hint**: Use spaCy's **ner** (named entity recognition) functionality

In [36]:
# Input passage for Question 2: a sentence listing company names.
text = '''The Top 5 companies in USA are Tesla, Walmart, Amazon, Microsoft, Google and the top 5 companies in 
India are Infosys, Reliance, HDFC Bank, Hindustan Unilever and Bharti Airtel'''

#### Expected Output:
    Company Names: [Walmart, Amazon, Microsoft, Google, Infosys, Reliance, HDFC Bank, Hindustan Unilever]
    Count: 8

In [40]:
nlp = spacy.load("en_core_web_md")
doc = nlp(text)

# Companies are the entities that the pipeline labels as ORG.
company_names = [entity.text for entity in doc.ents if entity.label_ == 'ORG']

print(f"Company Names: {company_names}\nCount: {len(company_names)}")

Company Names: ['Walmart', 'Amazon', 'Microsoft', 'Google', 'Infosys', 'Reliance', 'HDFC Bank', 'Hindustan Unilever']
Count: 8
