# Purpose: modify the rules of the default spaCy tokenizer to fix common tokenization issues in clinical text

In [1]:
import spacy

import re
from spacy.tokenizer import Tokenizer

# Record the spaCy version so the tokenizer behavior shown below can be reproduced.
print('Spacy version:', spacy.__version__)

Spacy version: 2.0.12


In [2]:
# Sample note text used throughout the notebook. A raw string is used so the
# backslash in "c\o" is kept as a literal character rather than read as an escape.
example_text = r"Pt c\o heart#burn"

In [3]:
# First, let's see how the default tokenizer handles the example text

In [4]:
# Load the stock English pipeline; its tokenizer is the baseline for comparison.
default_nlp = spacy.load('en')

In [5]:
# Run the example through the unmodified pipeline and collect each token's text.
default_doc = default_nlp(example_text)
default_token_texts = [tok.text for tok in default_doc]

# Show one token per line under a header naming the tokenizer that produced them.
print('Tokens in default tokenizer')
for token_text in default_token_texts:
    print(token_text)

Tokens in default tokenizer
Pt
c\o
heart#burn


In [6]:
# NOTE: This was originally found by looking at GitHub issues on spacy:
# https://github.com/explosion/spaCy/issues/1494

# now let's try to modify the infix behavior
def my_tokenizer(nlp):
    """Build a tokenizer whose infix pattern is any non-alphanumeric ASCII character.

    The tokenizer is constructed with the pipeline's vocab, an empty dict as
    the second positional argument (the special-case rules in spaCy's
    ``Tokenizer`` API), and only an infix matcher; no prefix or suffix
    handling is passed in.

    Args:
        nlp: A loaded spaCy pipeline; only ``nlp.vocab`` is read.

    Returns:
        A ``spacy.tokenizer.Tokenizer`` using the custom infix matcher.
    """
    # Treat every character that is not an ASCII letter or digit as an infix.
    # Both cases must be listed: a lowercase-only class ([^a-z0-9]) also matches
    # uppercase letters, so the 'H' in 'mmHg' or the 'A' in 'HbA1c' would be
    # reported as infix separators.
    infix_re = re.compile(r'''[^a-zA-Z0-9]''')

    return Tokenizer(nlp.vocab,
                     {},
                     infix_finditer=infix_re.finditer
    )

# Load a second pipeline object so the default one stays untouched for comparison.
custom_nlp = spacy.load('en')
# Replace this pipeline's tokenizer with the one built by my_tokenizer.
custom_nlp.tokenizer = my_tokenizer(custom_nlp)

In [7]:
# Tokenize the same example text with the pipeline that uses the custom tokenizer.
custom_doc = custom_nlp(example_text)

# The header names the custom tokenizer, since these tokens come from custom_nlp
# (not from the default pipeline).
print('Tokens in custom tokenizer')
for token in custom_doc:
    print(token.text)

Tokens in default tokenizer
Pt
c
\
o
heart
#
burn
