In [1]:
# Standard imports
import spacy

# Name of the spaCy pipeline loaded for the cells below.
MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(MODEL_NAME)

In [2]:
# Recap from spaCy Basics: a Doc exposes its sentences through `doc.sents`.
basic_text = u"This is the first sentence. This is another sentence. This is the last sentence."
doc = nlp(basic_text)

for sentence in doc.sents:
    print(sentence)

This is the first sentence.
This is another sentence.
This is the last sentence.


In [3]:
# A Doc can be indexed directly to get a single token.
second_token = doc[1]
print(second_token)

is


In [4]:
# `doc.sents` is a generator, so unlike the Doc itself it cannot be indexed.
# The TypeError is the point of this cell: catch it and display it so that
# "Restart & Run All" continues on to the cells below instead of halting here.
try:
    print(doc.sents[1])
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: '_cython_3_2_0.generator' object is not subscriptable

In [5]:
# Materialise the sentence generator into a list so it can be indexed.
doc_sents = [sentence for sentence in doc.sents]
doc_sents

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [6]:
# Alternative to the comprehension above: list() consumes the generator directly.
list(doc.sents)

[This is the first sentence.,
 This is another sentence.,
 This is the last sentence.]

In [7]:
# With the sentences held in a list, individual ones can be picked by index.
second_sentence = doc_sents[1]
print(second_sentence)

This is another sentence.


#### Adding Rules

In [8]:
# Sentence-start flags are assigned to tokens while the text runs through the nlp pipeline.
doc2 = nlp(u"This is a sentence. This is a sentence. This is a sentence")

for tok in doc2:
    # Two spaces between flag and text, matching print(flag, " " + text).
    print(f"{tok.is_sent_start}  {tok.text}")

True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence
False  .
True  This
False  is
False  a
False  sentence


In [9]:
# spaCy's default behaviour: the semicolon does not start a new sentence.
default_quote = u'"Management is doing things right; leadership is doing the right things." -Peter Druker'
doc3 = nlp(default_quote)

for sentence in doc3.sents:
    print(sentence)

"Management is doing things right; leadership is doing the right things."
-Peter Druker


In [10]:
# Adding a new rule to the pipeline
from spacy.language import Language


@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    """Mark the token that follows every semicolon as a sentence start.

    Args:
        doc: The Doc being processed by the pipeline.

    Returns:
        The same Doc, with `is_sent_start` set to True on each token that
        immediately follows a ";" token.
    """
    # Skip the final token: it has no following token to flag.
    for token in doc[:-1]:
        if token.text == ";":
            doc[token.i + 1].is_sent_start = True
    return doc


# The component is inserted before the parser so the boundaries are already
# set when the parser runs. Guard the call so that re-running this cell does
# not fail because the component is already in the pipeline.
if "set_custom_boundaries" not in nlp.pipe_names:
    nlp.add_pipe("set_custom_boundaries", before="parser")
nlp.pipe_names


['tok2vec',
 'tagger',
 'set_custom_boundaries',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [11]:
# Re-run the Doc object creation so the text passes through the updated pipeline:
custom_quote = u'"Management is doing things right; leadership is doing the right things." -Peter Druker'
doc4 = nlp(custom_quote)

for sentence in doc4.sents:
    print(sentence)

"Management is doing things right;
leadership is doing the right things."
-Peter Druker
