In [1]:
import sys
sys.path.insert(0, "..")

In [2]:
import spacy
import medspacy

from medspacy.util import DEFAULT_PIPENAMES

# Overview
Example of how to enable the default medspacy tokenizer and compare it to the default English tokenizer on 
some representative examples from short clinical text.

In [3]:
# we can only use one of the following tokenizers, so let's use the medspacy tokenizer 
# which handles infixes (e.g. 'h/o', 'chf+cp', etc)

medspacy_pipes = DEFAULT_PIPENAMES.copy()

if 'preprocessor' in medspacy_pipes: 
    medspacy_pipes.remove('preprocessor')

if 'postprocessor' in medspacy_pipes: 
    medspacy_pipes.remove('postprocessor')
    
if 'tokenizer' not in medspacy_pipes: 
    medspacy_pipes.add('tokenizer')
    
print(medspacy_pipes)
    
nlp = medspacy.load(enable = medspacy_pipes)

{'medspacy_pyrush', 'tokenizer', 'medspacy_context', 'medspacy_tokenizer', 'medspacy_target_matcher'}




In [4]:
nlp.tokenizer

<spacy.tokenizer.Tokenizer at 0x7ff8b0c98280>

In [5]:
nlp.pipe_names

['medspacy_pyrush', 'medspacy_target_matcher', 'medspacy_context']

# Process our document with both default and medspacy

In [6]:
example_text = r'Pt c\o n;v;d h\o chf+cp'

default_nlp = spacy.load('en_core_web_sm')

  reader(path / key)
  reader(path / key)
  reader(path / key)


In [7]:
default_doc = default_nlp(example_text)

medspacy_doc = nlp(example_text)

In [8]:
print('Tokens in default tokenizer')
for token in default_doc:
    print(token.text)

Tokens in default tokenizer
Pt
c\o
n;v;d
h\o
chf+cp


In [9]:
print('Tokens in medspacy tokenizer')
for token in medspacy_doc:
    print(token.text)

Tokens in medspacy tokenizer
Pt
c
\
o
n
;
v
;
d
h
\
o
chf
+
cp


In [10]:
assert [token.text for token in default_doc] != [token.text for token in medspacy_doc]

In [11]:
tokenizer = nlp.tokenizer