In [1]:
import spacy

In [2]:
nlp = spacy.load ("en_core_web_sm")

In [10]:
doc = nlp ("This is first sentence. This is second sentence.This is last sentence")

In [11]:
doc.sents       # A Generator Object, so we will need to iterate over it

<generator at 0x7fe36bad2210>

In [12]:
for sent in doc.sents:
    print (sent)

This is first sentence.
This is second sentence.
This is last sentence


In [13]:
# Custom Sentence Segmenting

In [18]:
doc = nlp(u'"Management is doing the right things; Leadership is doing the right things;." - Peter Druker')

In [19]:
for sent in doc.sents:
    print (sent)

"
Management is doing the right things; Leadership is doing the right things;." - Peter Druker


In [24]:
def set_custom_boundaries (doc):
    """Mark the token following every semicolon as the start of a sentence.

    Args:
        doc: Indexable, sliceable sequence of tokens; each token exposes
            ``text``, its position ``i`` and a writable ``is_sent_start``.

    Returns:
        The same ``doc`` object, modified in place.
    """
    # The last token is left out of the scan: nothing follows it to flag.
    for tok in doc[:-1]:
        if tok.text != ';':
            continue
        following = doc[tok.i + 1]
        following.is_sent_start = True
    return doc

In [35]:
# Insert the custom boundary setter ahead of the parser, so the is_sent_start
# flags it writes are already in place when the parser runs.
nlp.add_pipe (set_custom_boundaries, before='parser')

In [36]:
nlp.pipe_names

['tagger', 'set_custom_boundaries', 'parser', 'ner']

In [37]:
doc = nlp(u'"Management is doing the right things; Leadership is doing the right things;." - Peter Druker')

In [38]:
for sent in doc.sents:
    print (sent)

"
Management is doing the right things;
Leadership is doing the right things;." - Peter Druker


In [1]:
## Changing Segmentation Rules

In [4]:
import spacy
# Load the model again so `nlp` is rebound to a newly loaded pipeline
# (presumably to drop the set_custom_boundaries component added earlier — confirm).
nlp = spacy.load ("en_core_web_sm")

In [5]:
mystring = "This is a sentence. This is another.\n\nThis is a\n third one."

In [6]:
print (mystring)

This is a sentence. This is another.

This is a
 third one.


In [7]:
from spacy.pipeline import SentenceSegmenter

In [31]:
def split_on_newlines (doc):
    """Yield sentence spans of ``doc``, breaking after each newline token.

    A token whose text starts with ``'\\n'`` marks the end of a sentence. The
    next sentence begins at the first following token that is not pure
    whitespace, so a run of whitespace tokens stays attached to the sentence
    it terminates instead of opening a new one.

    Args:
        doc: Iterable, sliceable sequence of tokens supporting ``len()``;
            each token exposes ``text`` and its position ``i``.

    Yields:
        Consecutive, non-empty slices of ``doc`` that together cover every
        token exactly once. Nothing is yielded for an empty ``doc``.
    """
    start = 0
    seen_nl = False
    for word in doc:
        # Only a non-whitespace token may open the next sentence; otherwise a
        # second whitespace token would start a sentence and its own newline
        # would be ignored.
        if seen_nl and not word.text.isspace():
            yield doc[start:word.i]
            start = word.i
            seen_nl = False
        elif word.text.startswith ('\n'):
            seen_nl = True

    # Emit the remainder only if there is one (avoids an empty span for an
    # empty doc).
    if start < len(doc):
        yield doc[start:]
    

In [32]:
segmt = SentenceSegmenter (nlp.vocab, strategy = split_on_newlines)

In [36]:
nlp.pipe_names

['tagger', 'parser', 'ner', 'sentencizer']

In [39]:
# Drop a previously added 'sentencizer' component if one is present, so this
# cell runs cleanly both on a fresh pipeline (where removing it would raise
# E001) and on a re-run where the component already exists.
if 'sentencizer' in nlp.pipe_names:
    nlp.remove_pipe ('sentencizer')

In [40]:
nlp.add_pipe (segmt)

In [41]:
mystring = "This is a sentence. This is another.\n\nThis is a\n third one."

In [42]:
doc4 = nlp (mystring)

In [43]:
for sent in doc4.sents:
    print (sent)

This is a sentence. This is another.


This is a
 
third one.
