# Import libraries

In [1]:
import spacy

%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
nlp = spacy.load("en_core_web_sm")

<IPython.core.display.Javascript object>

# Tokenization

In [3]:
doc = nlp("Apple is looking at buying U.K startup for $1 billion")

<IPython.core.display.Javascript object>

In [4]:
# Iterating over the doc yields its tokens in order; print each token's text on its own line.
for token in doc:
    print(token.text)

Apple
is
looking
at
buying
U.K
startup
for
$
1
billion


<IPython.core.display.Javascript object>

# Part-of-speech (POS) tagging, lemmatization, stop words

In [5]:
doc

Apple is looking at buying U.K startup for $1 billion

<IPython.core.display.Javascript object>

In [6]:
# One row per token: text, lemma, part-of-speech tag and stop-word flag.
# The nested {15}/{15}/{10} format specs pad the first three columns to fixed widths.
for token in doc:
    print(f"{token.text:{15}} {token.lemma_:{15}} {token.pos_:{10}} {token.is_stop}")

Apple           Apple           PROPN      False
is              be              AUX        True
looking         look            VERB       False
at              at              ADP        True
buying          buy             VERB       False
U.K             U.K             PROPN      False
startup         startup         NOUN       False
for             for             ADP        True
$               $               SYM        False
1               1               NUM        False
billion         billion         NUM        False


<IPython.core.display.Javascript object>

# Dependency parsing

In [7]:
# One row per noun chunk: the chunk text, the text of its root token, and the
# root token's dependency label (first two columns padded to 30 and 15 characters).
for chunk in doc.noun_chunks:
    print(f"{chunk.text:{30}} {chunk.root.text:{15}} {chunk.root.dep_}")

Apple                          Apple           nsubj
U.K                            U.K             dobj
startup                        startup         dobj


<IPython.core.display.Javascript object>

# Named entity recognition

In [8]:
doc

Apple is looking at buying U.K startup for $1 billion

<IPython.core.display.Javascript object>

In [9]:
# Print every entity found in the doc as "<entity text> <entity label>".
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K ORG
$1 billion MONEY


<IPython.core.display.Javascript object>

# Sentence segmentation

In [10]:
doc.sents

<generator at 0x1e4d8d2f240>

<IPython.core.display.Javascript object>

In [11]:
# doc.sents is a generator, so iterate over it to print one sentence per line.
for sent in doc.sents:
    print(sent)

Apple is looking at buying U.K startup for $1 billion


<IPython.core.display.Javascript object>

In [12]:
doc2 = nlp(
    "Welcome my GitHub repo. Thanks for reading my code. If you like please give star for repo."
)

<IPython.core.display.Javascript object>

In [13]:
# Print each sentence detected in doc2, one per line.
for sent in doc2.sents:
    print(sent)

Welcome my GitHub repo.
Thanks for reading my code.
If you like please give star for repo.


<IPython.core.display.Javascript object>

The next text needs a custom sentence segmenter, because its sentences are joined with "-" instead of "." and the default segmentation does not split on "-".

In [14]:
# Same kind of text, but the sentences are joined with "-" (only the final one ends with ".").
doc3 = nlp(
    "Welcome my GitHub repo-Thanks for reading my code-If you like please give star for repo."
)
# Print the sentences the pipeline detects in this text, one per line.
for sent in doc3.sents:
    print(sent)

Welcome my GitHub repo-Thanks for reading my code-If you like please give star for repo.


<IPython.core.display.Javascript object>

### Custom segmentation rule

In [15]:
from spacy.language import Language


@Language.component("component")
def set_rule(doc):
    """Mark the token that follows every "-" token as a sentence start.

    Registered as the pipeline component named "component". The last token
    is never inspected, because it has no following token to mark.

    Args:
        doc: The document passed through the pipeline.

    Returns:
        The same doc, with ``is_sent_start`` set to True on each token that
        directly follows a "-" token.
    """
    # Stop one short of the end so that position + 1 is always a valid index.
    for position in range(len(doc) - 1):
        if doc[position].text == "-":
            doc[position + 1].is_sent_start = True
    return doc

<IPython.core.display.Javascript object>

In [16]:
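# Uncomment to remove the custom component from the pipeline
# (for example, before adding it again after changing set_rule).
# nlp.remove_pipe("component")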

<IPython.core.display.Javascript object>

In [17]:
# Insert the custom boundary rule in front of the parser.
# Guarded so that re-running this cell does not fail because a pipe named
# "component" is already present in the pipeline.
if "component" not in nlp.pipe_names:
    nlp.add_pipe("component", before="parser")
# Last expression: display the registered component, as the bare add_pipe call did.
nlp.get_pipe("component")

<function __main__.set_rule(doc)>

<IPython.core.display.Javascript object>

In [18]:
# Re-parse the hyphen-joined text (this version also ends with "-") and rebind doc3 to the result.
doc3 = nlp(
    "Welcome my GitHub repo-Thanks for reading my code-If you like please give star for repo-"
)
# Print the detected sentences, one per line.
for sent in doc3.sents:
    print(sent)

Welcome my GitHub repo-
Thanks for reading my code-
If you like please give star for repo-


<IPython.core.display.Javascript object>

# Visualization

In [19]:
from spacy import displacy

<IPython.core.display.Javascript object>

In [20]:
doc

Apple is looking at buying U.K startup for $1 billion

<IPython.core.display.Javascript object>

In [22]:
displacy.render(doc, style="dep")

<IPython.core.display.Javascript object>

In [23]:
displacy.render(doc, style="dep", options={"compact": True, "distance": 100})

<IPython.core.display.Javascript object>

Entity recognition visualization

In [24]:
displacy.render(doc, style="ent")

<IPython.core.display.Javascript object>