In [1]:
# Perform Standard imports
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
doc = nlp("This is the first sentence. This is the another sentence. This one is the last sentence.")

In [3]:
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is the another sentence.
This one is the last sentence.


In [4]:
doc[0]

This

```python
doc.sents[0]
# TypeError: 'generator' object is not subscriptable
```

In [6]:
 list(doc.sents)[0]

This is the first sentence.

In [7]:
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [8]:
doc1 = nlp('Management is doing the right things; leadership is doing the right thigns."  -Peter Drucker')

In [9]:
doc1.text

'Management is doing the right things; leadership is doing the right thigns."  -Peter Drucker'

In [10]:
for sent in doc1.sents:
    print(sent)

Management is doing the right things; leadership is doing the right thigns."  
-Peter Drucker


In [11]:
# Upto but not including last token
doc1[:-1]

Management is doing the right things; leadership is doing the right thigns."  -Peter

## Add a sengmentation Rule

In [13]:
# import spacy
from spacy.language import Language

# nlp = spacy.load("en_core_web_sm")

# ✅ Register the function as a spaCy component
@Language.component("custom_segmentation")
def custom_segmentation(doc):
    for i, token in enumerate(doc[:-1]):
        if token.text in ("e.g.", "Note:"):
            doc[i + 1].is_sent_start = True
    return doc

# ✅ Add the custom component before the parser
nlp.add_pipe("custom_segmentation", before="parser")

# Test text
text = "This is a test. e.g. we test custom splitting. Note: this should also split."

doc = nlp(text)

# Print the segmented sentences
for sent in doc.sents:
    print("👉", sent.text)


👉 This is a test.
👉 e.g.
👉 we test custom splitting.
👉 Note: this should also split.


## 🔍 Why does spaCy tokenize "Note:" into two tokens?
- Even though "e.g." is a single token, "Note:" is two tokens:
    1. "Note"
    2.  ":"
- This is by design, due to how spaCy’s tokenizer handles punctuation and known abbreviations.
---

# ✅ spaCy Tokenizer Logic
- spaCy’s tokenizer uses rules based on:
    1. prefixes (e.g., $, (, ")
    2. suffixes (e.g., ., !, ?, :)
    3. infixes (e.g., -, /, ')
---

In [15]:
# import spacy
# nlp = spacy.load("en_core_web_sm")

doc = nlp("Note:")
for token in doc:
    print(repr(token.text))


'Note'
':'


# 🧠 Why?
- : is a suffix punctuation, so it gets split.

- spaCy doesn't treat "Note:" as an abbreviation like "e.g.", "i.e.", or "Dr.".

# ✅ Why is "e.g." a Single Token Then?

In [17]:
doc = nlp("e.g.")
for token in doc:
    print(repr(token.text))


'e.g.'


- Because: 
    1. "e.g." is in spaCy’s abbreviation exceptions
    2. spaCy doesn't split on periods inside known abbreviations
---
# 🛠️ How to Treat "Note:" as a Single Token?
- You can override the tokenizer if you really want "Note:" as one token: 

In [19]:
from spacy.symbols import ORTH
from spacy.tokens import Doc

# Add custom tokenizer rule to treat "Note:" as a single token
nlp.tokenizer.add_special_case("Note:", [{ORTH: "Note:"}])

doc = nlp("Note: this is a test.")

for token in doc:
    print(token.text)


Note:
this
is
a
test
.


### Now "Note:" is treated like a single token ✅
---
# ✅ Summary
| Phrase    | Default Tokens  | Why?                          |
| --------- | --------------- | ----------------------------- |
| `"e.g."`  | `'e.g.'`        | Known abbreviation, not split |
| `"Note:"` | `'Note'`, `':'` | Colon is suffix punctuation   |
| `"Dr."`   | `'Dr.'`         | Known abbreviation            |

---

# 🚀 When to Use Special Cases?
- Use add_special_case() if:
    1. You want to treat "Note:", "PS:", "FYI:", etc., as atomic units
    2. You're customizing sentence segmentation or entity recognition

# 🔍 Explanation for Custom Segementation Component
| Step                          | Purpose                                                       |
| ----------------------------- | ------------------------------------------------------------- |
| `@Language.component("name")` | Registers your function in spaCy's component registry         |
| `"custom_segmentation"`       | Name of your component when adding with `nlp.add_pipe(...)`   |
| `before="parser"`             | Ensures sentence boundaries are set before dependency parsing |

# ⚠️ Note on Pipeline Order
 - Custom sentence segmenters must be placed before components like _**"parser"**_ or _**"ner"**_ because:
 - Sentence boundaries affect how parsing and NER behave
 - Use <code>nlp.add_pipe(custom_component, before="parser")</code> for best results


In [22]:
# import spacy
# from spacy.language import Language

# # Load model
# nlp = spacy.load("en_core_web_sm")

# Register custom component
@Language.component("custom_segmentation1")
def custom_segmentation(doc):
    # Iterate and detect pattern: Note :
    for i in range(len(doc) - 2):
        if doc[i].text == "Note" and doc[i + 1].text == ":":
            # Set "Note" as sentence start (optional)
            doc[i].is_sent_start = True
            # ✅ Force next token after ":" as sentence start
            doc[i + 2].is_sent_start = True
    return doc

# Add before parser
nlp.add_pipe("custom_segmentation1", before="parser")

# Text
text = "This is a test. e.g. we test custom splitting. Note: this should also split."

# Apply
doc = nlp(text)

# Output
for sent in doc.sents:
    print("👉", sent.text)


👉 This is a test.
👉 e.g.
👉 we test custom splitting.
👉 Note:
👉 this should also split.


In [23]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'custom_segmentation',
 'custom_segmentation1',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [57]:
doc = nlp('This is the first. \n\n This is the second\n\n. This is the third.\n This is the fourth.')
for sent in doc.sents:
    print(sent)

This is the first. 

 
This is the second

.
This is the third.
 
This is the fourth.


# Use Custom Sentence Boundary Detection 
- via retokenizer and custom_sentencizer
- You can write your own function to decide sentence boundaries using custom rules:

In [72]:
import spacy
from spacy.language import Language

@Language.component("custom_sentencizer")
def custom_sentencizer(doc):
    for i, token in enumerate(doc[:-1]):
        if token.text == "Note:":
            doc[i+1].is_sent_start = True
    return doc

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")  # Ensure basic sentence splitting
nlp.add_pipe("custom_sentencizer", after="sentencizer")

doc = nlp("This is a sentence. Note: this should start a new sentence.")
for sent in doc.sents:
    print(sent.text)


This is a sentence.
Note: this should start a new sentence.
