In [1]:
import spacy

In [3]:
# Load the `en_core_web_sm` pipeline once; every later cell calls `nlp(...)` on raw text.
nlp = spacy.load("en_core_web_sm")

# Parts of Speech

In [7]:
# Show each token next to its coarse part-of-speech label and a readable gloss.
doc = nlp("Elon flew to Mars yesterday. He carried biryani masala with him")

for tok in doc:
    coarse = tok.pos_
    print(tok, "|", coarse, "|", spacy.explain(coarse))

Elon | PROPN | proper noun
flew | VERB | verb
to | ADP | adposition
Mars | PROPN | proper noun
yesterday | NOUN | noun
. | PUNCT | punctuation
He | PRON | pronoun
carried | VERB | verb
biryani | ADJ | adjective
masala | NOUN | noun
with | ADP | adposition
him | PRON | pronoun


In [13]:
# Print the coarse label (pos_) and the fine-grained label (tag_) side by side,
# each followed by spaCy's plain-English explanation.
doc = nlp("Wow! Dr. Strange made, 265 million $ on the very first day")

for tok in doc:
    coarse, fine = tok.pos_, tok.tag_
    print(tok, "|", coarse, "|", spacy.explain(coarse), "|", fine, "|", spacy.explain(fine))

Wow | INTJ | interjection | UH | interjection
! | PUNCT | punctuation | . | punctuation mark, sentence closer
Dr. | PROPN | proper noun | NNP | noun, proper singular
Strange | PROPN | proper noun | NNP | noun, proper singular
made | VERB | verb | VBD | verb, past tense
, | PUNCT | punctuation | , | punctuation mark, comma
265 | NUM | numeral | CD | cardinal number
million | NUM | numeral | CD | cardinal number
$ | NUM | numeral | CD | cardinal number
on | ADP | adposition | IN | conjunction, subordinating or preposition
the | DET | determiner | DT | determiner
very | ADV | adverb | RB | adverb
first | ADJ | adjective | JJ | adjective (English), other noun-modifier (Chinese)
day | NOUN | noun | NN | noun, singular or mass


In [14]:
# Present-tense example: in the recorded output "quits" receives the VBZ tag.
doc = nlp("He quits the job")

for tok in doc:
    coarse, fine = tok.pos_, tok.tag_
    print(tok, "|", coarse, "|", spacy.explain(coarse), "|", fine, "|", spacy.explain(fine))

He | PRON | pronoun | PRP | pronoun, personal
quits | VERB | verb | VBZ | verb, 3rd person singular present
the | DET | determiner | DT | determiner
job | NOUN | noun | NN | noun, singular or mass


In [15]:
# Past-tense counterpart: in the recorded output "quit" receives the VBD tag.
doc = nlp("He quit the job")

for tok in doc:
    coarse, fine = tok.pos_, tok.tag_
    print(tok, "|", coarse, "|", spacy.explain(coarse), "|", fine, "|", spacy.explain(fine))

He | PRON | pronoun | PRP | pronoun, personal
quit | VERB | verb | VBD | verb, past tense
the | DET | determiner | DT | determiner
job | NOUN | noun | NN | noun, singular or mass


# Filter Tokens

In [18]:
# Sample earnings-release excerpt (bullets, currency amounts, percentages, a quotation)
# that the following cells pass to `nlp(...)`.
earning_text = '''Microsoft Corp. today announced the following results for the quarter ended March 31, 2024, as compared to the corresponding period of last fiscal year:

·        Revenue was $61.9 billion and increased 17%
·        Operating income was $27.6 billion and increased 23%
·        Net income was $21.9 billion and increased 20%
·        Diluted earnings per share was $2.94 and increased 20%
“Microsoft Copilot and Copilot stack are orchestrating a new era of AI transformation, etc. driving better business outcomes across every role and industry," said Satya Nadella, chairman and chief executive officer of Microsoft.'''

In [19]:
# Tag the earnings excerpt; whitespace runs and bullet characters show up as
# their own tokens in the output.
doc = nlp(earning_text)

for tok in doc:
    coarse = tok.pos_
    print(tok, "|", coarse, "|", spacy.explain(coarse))

Microsoft | PROPN | proper noun
Corp. | PROPN | proper noun
today | NOUN | noun
announced | VERB | verb
the | DET | determiner
following | VERB | verb
results | NOUN | noun
for | ADP | adposition
the | DET | determiner
quarter | NOUN | noun
ended | VERB | verb
March | PROPN | proper noun
31 | NUM | numeral
, | PUNCT | punctuation
2024 | NUM | numeral
, | PUNCT | punctuation
as | SCONJ | subordinating conjunction
compared | VERB | verb
to | ADP | adposition
the | DET | determiner
corresponding | ADJ | adjective
period | NOUN | noun
of | ADP | adposition
last | ADJ | adjective
fiscal | ADJ | adjective
year | NOUN | noun
: | PUNCT | punctuation


 | SPACE | space
· | PUNCT | punctuation
        | SPACE | space
Revenue | NOUN | noun
was | AUX | auxiliary
$ | SYM | symbol
61.9 | NUM | numeral
billion | NUM | numeral
and | CCONJ | coordinating conjunction
increased | VERB | verb
17 | NUM | numeral
% | NOUN | noun

 | SPACE | space
· | PUNCT | punctuation
        | SPACE | space
Operating | V

In [23]:
doc = nlp(earning_text)

# Keep tokens in document order, skipping those tagged SPACE, X, or PUNCT.
excluded_pos = ("SPACE", "X", "PUNCT")
filtered_tokens = [tok for tok in doc if tok.pos_ not in excluded_pos]

filtered_tokens

[Microsoft,
 Corp.,
 today,
 announced,
 the,
 following,
 results,
 for,
 the,
 quarter,
 ended,
 March,
 31,
 2024,
 as,
 compared,
 to,
 the,
 corresponding,
 period,
 of,
 last,
 fiscal,
 year,
 Revenue,
 was,
 $,
 61.9,
 billion,
 and,
 increased,
 17,
 %,
 Operating,
 income,
 was,
 $,
 27.6,
 billion,
 and,
 increased,
 23,
 %,
 Net,
 income,
 was,
 $,
 21.9,
 billion,
 and,
 increased,
 20,
 %,
 Diluted,
 earnings,
 per,
 share,
 was,
 $,
 2.94,
 and,
 increased,
 20,
 %,
 Microsoft,
 Copilot,
 and,
 Copilot,
 stack,
 are,
 orchestrating,
 a,
 new,
 era,
 of,
 AI,
 transformation,
 driving,
 better,
 business,
 outcomes,
 across,
 every,
 role,
 and,
 industry,
 said,
 Satya,
 Nadella,
 chairman,
 and,
 chief,
 executive,
 officer,
 of,
 Microsoft]

# Count Parts of Speech

In [24]:
# Tally tokens per POS value. The result is keyed by integer attribute IDs
# (as the output shows), not by label strings such as "NOUN".
count = doc.count_by(spacy.attrs.POS)
count

{96: 9,
 92: 24,
 100: 13,
 90: 5,
 85: 7,
 93: 13,
 97: 13,
 98: 1,
 84: 8,
 103: 9,
 87: 5,
 99: 4,
 89: 7,
 101: 2}

In [28]:
# Translate each integer POS ID back to its string label through the vocab.
for pos_id, freq in count.items():
    label = doc.vocab[pos_id].text
    print(label, "|", freq)

PROPN | 9
NOUN | 24
VERB | 13
DET | 5
ADP | 7
NUM | 13
PUNCT | 13
SCONJ | 1
ADJ | 8
SPACE | 9
AUX | 5
SYM | 4
CCONJ | 7
X | 2
