In [1]:
import spacy
import medspacy

from medspacy.util import DEFAULT_PIPENAMES

# Overview
This notebook shows how to enable the default medspacy tokenizer and compares its output with that of
the default English tokenizer on a representative example of short clinical text.

In [2]:
# Only one tokenizer can be active at a time, so opt in to the medspacy
# tokenizer, which handles infixes (e.g. 'h/o', 'chf+cp', etc).

# Work on a copy so the imported DEFAULT_PIPENAMES object itself is not mutated.
medspacy_pipes = DEFAULT_PIPENAMES.copy()

# Drop the pre- and post-processing components from the enabled set.
# discard() does nothing when the name is absent, so no membership test is needed.
medspacy_pipes.discard('preprocessor')
medspacy_pipes.discard('postprocessor')

# Request the medspacy tokenizer; add() is a no-op if it is already present.
medspacy_pipes.add('tokenizer')

# Show exactly which component names will be enabled.
print(medspacy_pipes)

# Build the medspacy pipeline with only the selected components.
nlp = medspacy.load(enable=medspacy_pipes)

{'context', 'parser', 'sectionizer', 'tokenizer', 'sentencizer', 'target_matcher', 'tagger'}


In [3]:
# Display the tokenizer object attached to the medspacy pipeline built above.
nlp.tokenizer

<spacy.tokenizer.Tokenizer at 0x11dfc6dd0>

In [4]:
# List the names of the pipeline components. Note that 'tokenizer' is not in
# the listed output even though it was in the enabled set -- presumably because
# the tokenizer is exposed as `nlp.tokenizer` rather than as a pipe (TODO confirm).
nlp.pipe_names

['sentencizer', 'tagger', 'parser', 'target_matcher', 'sectionizer', 'context']

# Process our document with both the default spaCy pipeline and the medspacy pipeline

In [5]:
# Short clinical-style snippet used for the comparison. It is a raw string, so
# the backslashes are kept as literal characters.
# NOTE(review): the pipeline-setup comment mentions 'h/o' (forward slash) while
# this text uses 'h\o' -- confirm the backslashes are intentional.
example_text = r'Pt c\o n;v;d h\o chf+cp'

# Baseline pipeline for the comparison: spaCy's `en_core_web_sm` model.
default_nlp = spacy.load('en_core_web_sm')

In [6]:
# Run the same text through both pipelines so their tokenizations can be
# compared side by side in the following cells.
default_doc = default_nlp(example_text)

medspacy_doc = nlp(example_text)

In [7]:
# Print each token produced by the default spaCy pipeline, one per line.
print('Tokens in default tokenizer')
for token in default_doc:
    print(token.text)

Tokens in default tokenizer
Pt
c\o
n;v;d
h\o
chf+cp


In [8]:
# Print each token produced by the medspacy pipeline, one per line, for
# comparison with the default tokenizer's listing.
print('Tokens in medspacy tokenizer')
for token in medspacy_doc:
    print(token.text)

Tokens in medspacy tokenizer
Pt
c
\
o
n
;
v
;
d
h
\
o
chf
+
cp


In [9]:
# Sanity check: the two pipelines must split the example text into different
# token sequences; this fails if they tokenize it identically.
assert [token.text for token in default_doc] != [token.text for token in medspacy_doc]

In [10]:
tokenizer = nlp.tokenizer