In [None]:
# spaCy provides the NLP pipeline used throughout this notebook.
import spacy
# Load the "en_core_web_sm" pipeline once; later cells reuse `nlp`.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sentence tokenization
# The sample text contains an abbreviation ("J.") whose period is not a sentence end.
text = "Martin J. Thompson is known for his writing skills. He is also good at programming."
# Run the pipeline over the text to obtain a processed Doc.
doc = nlp(text)
# Print the processed Doc.
print(doc)

In [None]:
# Walk the sentence spans of the processed Doc and print one per line.
for sentence in doc.sents:
    print(sentence)

In [None]:
# List the names of the components in the loaded pipeline.
nlp.pipe_names

In [None]:
# Named Entity Recognition: process a sentence and list each detected entity
# together with its label and spaCy's explanation of that label.
doc = nlp("Tesla Inc is going to acquire Twitter Inc for $45 billion")
for entity in doc.ents:
    label = entity.label_
    print(entity.text, "|", label, "|", spacy.explain(label))

In [None]:
from spacy import displacy

# Render the current Doc with displaCy's entity visualizer (style="ent").
displacy.render(doc, style="ent")

In [None]:
# Show the labels registered for the pipeline's "ner" component.
nlp.pipe_labels['ner']

In [None]:
# Same entity listing on a sentence that mixes an organization, a date and people.
doc = nlp("Apple Computer, Inc. was founded on April 1, 1976, by college dropouts Steve Jobs and Steve Wozniak")
for entity in doc.ents:
    description = spacy.explain(entity.label_)
    print(entity.text, "|", entity.label_, "|", description)

In [None]:
# Baseline for entity customization: list the entities the loaded model
# assigns to this sentence before any manual changes are made.
doc = nlp("Tesla is going to acquire Twitter for $45 billion")
for entity in doc.ents:
    label = entity.label_
    explanation = spacy.explain(label)
    print(entity.text, "|", label, "|", explanation)

In [None]:
# Inspect the type of a one-token slice of the Doc (token offsets [5, 6)).
type(doc[5:6])

In [None]:
# Customize entities
from spacy.tokens import Span

# Build one-token spans labelled "ORG" at token offsets [0, 1) and [5, 6).
# NOTE(review): presumably these are "Tesla" and "Twitter" in the sentence
# parsed in the preceding cell — confirm against the actual tokenization.
s1 = Span(doc, 0, 1, label="ORG")
s2 = Span(doc, 5, 6, label="ORG")

# Register the spans as entities; default="unmodified" keeps the existing
# entity annotation of every token not covered by s1 or s2.
doc.set_ents([s1, s2], default="unmodified")

In [None]:
# List the entities currently set on the Doc, with label and label explanation.
for entity in doc.ents:
    label = entity.label_
    print(entity.text, "|", label, "|", spacy.explain(label))

In [None]:
# Part-of-speech tagging: sample text that the next cell runs through the pipeline.
text = "In the field of computational linguistics, understanding parts-of-speech is essential. SpaCy offers an easy way to parse a text and identify its parts of speech."

In [None]:
# Process the sample text and print every token next to its part-of-speech tag.
doc = nlp(text)
for tok in doc:
    print(tok.text, tok.pos_)

In [None]:
# Extracting noun chunks: print the text of each noun chunk in the current Doc.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text)

In [None]:
# Extracting verbs: keep only tokens whose POS tag is a main verb or an auxiliary.
text = "Apple Computer, Inc. was founded on April 1, 1976, by college dropouts Steve Jobs and Steve Wozniak"
doc = nlp(text)

verbs = ["VERB", "AUX"]
for tok in doc:
    # Skip anything that is not tagged as one of the verb-like POS values.
    if tok.pos_ not in verbs:
        continue
    print(tok.text, tok.pos_)

In [None]:
# Extracting verb phrases
# We import the rule-based, token-level Matcher (not the PhraseMatcher).
from spacy.matcher import Matcher

# The Matcher must share its vocab with the docs it is run on, so the same
# pipeline object is used both to build the matcher and to create doc2 below.
nlp_matcher = spacy.load("en_core_web_sm")
matcher = Matcher(nlp_matcher.vocab)

# Patterns are a list of token-pattern lists; each inner list describes one
# token sequence. Here: an auxiliary immediately followed by a verb.
pattern = [
    [{"POS": "AUX"}, {"POS": "VERB"}]
]

matcher.add("verb-phrases", pattern)

doc2 = nlp_matcher(text)
matches = matcher(doc2)
for match in matches:
    # Each match is a (match_id, start, end) tuple of token offsets into doc2.
    print(match)
    match_id, start, end = match
    # Slice the Doc the matcher actually ran on (doc2). Slicing a different
    # Doc would only be correct if it happened to tokenize identically.
    span = doc2[start:end]
    print(span)

In [None]:
# Lemmatization with spaCy: print each token of the current Doc beside its lemma.
for tok in doc:
    print(tok.text, tok.lemma_)