In [1]:
from __future__ import print_function

<H1>This is a very short introduction to spaCy.</H1>
For further reading, please navigate to:
https://alpha.spacy.io/docs/usage/spacy-101

<h4>What is spaCy? </h4>
<p>
<ul>
<li>spaCy is a free, open-source library for advanced Natural Language Processing (NLP) in Python.</li>
<li>
spaCy is designed specifically for <b><em>production use</em></b> and helps you build applications that process and "understand" large volumes of text. It can be used to build information extraction or natural language understanding systems, or to pre-process text for deep learning.</li>
</ul></p>
<h4>What does it do?</h4>
<ul>
<li>Tokenization<p>   Segmenting text into words, punctuation marks, etc.</p></li>
<li>Part-of-speech (POS) Tagging <p>Assigning word types to tokens, like verb or noun.</p>	</li>
<li>Dependency Parsing<p>	Assigning syntactic dependency labels, describing the relations between individual tokens, like subject or object.</p>	</li>
<li>Lemmatization<p>	Assigning the base forms of words. For example, the lemma of "was" is "be", and the lemma of "rats" is "rat".</p>	</li>
<li>Sentence Boundary Detection (SBD)<p>	Finding and segmenting individual sentences.</p>	</li>
<li>Named Entity Recognition (NER)	<p>Labelling named "real-world" objects, like persons, companies or locations.</p>	</li>
<li>Similarity<p>	Comparing words, text spans and documents and how similar they are to each other.</p>	</li>
<li>Text classification	<p>Assigning categories or labels to a whole document, or parts of a document.	</p></li>
<li>Rule-based Matching	<p>Finding sequences of tokens based on their texts and linguistic annotations, similar to regular expressions.</p>	</li>
<li>Model Training	<p>Updating and improving all statistical models.</p>	</li>
<li>Language Data<p> Comes packed with language models (and data) for various languages (English, German, Spanish, French).</p></li>
</ul>
<h4>Architecture Overview</h4>
<img src='architecture.svg'></img>


<h4>Let's see it in action</h4>

In [2]:
# Load spaCy's English language model.
# This step is a little slow because a lot of data has to be loaded.
import spacy

nlp = spacy.load('en')

# Summarise what was loaded: language code, vocabulary size and the
# components of the default processing pipeline, one per line.
print('Language:', nlp.lang)
print('Vocabulary size:', nlp.vocab.length)
print("Default NLP Pipeline:")
for component in nlp.pipeline:
    print('\t', component)

Language: en
Vocabulary size: 742225
Default NLP Pipeline:
	 <spacy.tagger.Tagger object at 0x10f878050>
	 <spacy.pipeline.DependencyParser object at 0x107a183c0>
	 <spacy.matcher.Matcher object at 0x107912578>
	 <spacy.pipeline.EntityRecognizer object at 0x107a189f0>


<h4>Default Pipeline</h4>
<img src = 'pipeline.svg'></img>

In [3]:
# Input text: a series of short clinical-style statements followed by one
# general sentence containing a place, an organisation and a date.
# Note that many sentences are not followed by a space after the period;
# the text is passed to spaCy exactly as written here.
txt =u"""Prescribing sick days due to diagnosis of influenza.Jane complains about flu-like symptoms.Jane may be experiencing some sort of flu episode.Jane’s RIDT came back negative for influenza.
Jane is at high risk for flu if she’s not vaccinated.Jane’s older brother had the flu last month.Jane had a severe case of flu last year.Joe expressed concerns about the risks of bird flu.
Joe shows no signs of stroke, except for numbness.Nausea, vomiting and ankle swelling negative.Patient denies alcohol abuse. Allergies: Penicillin, Dust, Sneezing.
There's an outbreak of happiness in New York organized by O'Reilly Media, today, September 26, 2017, involving thousands of people."""

# Call spaCy on the input text.
# This runs the standard NLP pipeline (the `nlp` object loaded earlier) on
# the input text; the result is kept in `doc` for the cells that follow.
doc = nlp(txt) 
# Echo the text held by the processed document.
print(doc.text)

Prescribing sick days due to diagnosis of influenza.Jane complains about flu-like symptoms.Jane may be experiencing some sort of flu episode.Jane’s RIDT came back negative for influenza.
Jane is at high risk for flu if she’s not vaccinated.Jane’s older brother had the flu last month.Jane had a severe case of flu last year.Joe expressed concerns about the risks of bird flu.
Joe shows no signs of stroke, except for numbness.Nausea, vomiting and ankle swelling negative.Patient denies alcohol abuse. Allergies: Penicillin, Dust, Sneezing.
There's an outbreak of happiness in New York organized by O'Reilly Media, today, September 26, 2017, involving thousands of people.


## Sentence detection

In [4]:
# One (start, end, text) row per detected sentence; newlines inside the
# sentence text are removed so each sentence displays on a single line.
data = [
    (sentence.start, sentence.end, sentence.text.replace('\n', ''))
    for sentence in doc.sents
]

# For display purposes only, the sentence boundary information is put in a
# pandas DataFrame.
import pandas as pd
sents = pd.DataFrame(data=data, columns=['Start', 'End', 'Sentence Text'])
sents

Unnamed: 0,Start,End,Sentence Text
0,0,9,Prescribing sick days due to diagnosis of infl...
1,9,17,Jane complains about flu-like symptoms.
2,17,27,Jane may be experiencing some sort of flu epis...
3,27,37,Jane’s RIDT came back negative for influenza.
4,37,50,Jane is at high risk for flu if she’s not vacc...
5,50,60,Jane’s older brother had the flu last month.
6,60,70,Jane had a severe case of flu last year.
7,70,81,Joe expressed concerns about the risks of bird...
8,81,92,"Joe shows no signs of stroke, except for numbn..."
9,92,100,"Nausea, vomiting and ankle swelling negative."


### Part of speech tagging and Named Entity extraction

In [5]:
# Build one row of annotations per token, walking the document sentence by
# sentence. The order of the fields matches `column_names` below.
data = [
    [
        token.idx,
        token.text,
        token.lex_id,
        token.lemma_,
        token.pos_,
        token.head,       # the token this one depends on
        token.dep_,
        token.ent_type_,  # empty string when the token is not part of an entity
    ]
    for sentence in doc.sents
    for token in sentence
]

column_names = [
    'Index', 'Token', 'Id_in_vocab', 'Lemma',
    'POS', 'Depends_on', 'Dependency_type', 'Entity_Type',
]
tokens = pd.DataFrame(data=data, columns=column_names)
tokens

Unnamed: 0,Index,Token,Id_in_vocab,Lemma,POS,Depends_on,Dependency_type,Entity_Type
0,0,Prescribing,102853,prescribe,VERB,Prescribing,ROOT,DATE
1,12,sick,1057,sick,ADJ,days,amod,DATE
2,17,days,420,day,NOUN,Prescribing,dobj,DATE
3,22,due,688,due,ADJ,days,amod,
4,26,to,4,to,ADP,due,prep,
5,29,diagnosis,403187,diagnosi,NOUN,to,pobj,
6,39,of,7,of,ADP,diagnosis,prep,
7,42,influenza,66251,influenza,NOUN,of,pobj,
8,51,.,1,.,PUNCT,Prescribing,punct,
9,52,Jane,355245,jane,PROPN,complains,nsubj,PERSON


### Using the syntactic dependencies

In [6]:
from spacy.symbols import nsubj, VERB

# Find verbs that have a subject: keep every token whose dependency label is
# nsubj and whose head is tagged as a verb, paired with that head.
pairs = [
    (token, token.head)
    for token in doc
    if token.dep == nsubj and token.head.pos == VERB
]

for subject, verb in pairs:
    print('Subject: ', subject, ' verb: ', verb)

Subject:  Jane  verb:  complains
Subject:  Jane  verb:  experiencing
Subject:  RIDT  verb:  came
Subject:  Jane  verb:  is
Subject:  she  verb:  ’s
Subject:  brother  verb:  had
Subject:  Jane  verb:  had
Subject:  Joe  verb:  expressed
Subject:  Joe  verb:  shows
Subject:  Patient  verb:  denies


### Custom pipeline... Adding negation detection.

In [7]:
# Load NegEx and its trigger rules.
from negex import *

# Read the trigger file inside a context manager so the file handle is closed
# as soon as the rules have been read (the original left it open for the
# lifetime of the kernel). `irules` is consumed later by negation_tag.
with open(r'negex_triggers.txt') as rfile:
    irules = sortRules(rfile.readlines())

# A new pipeline component, based on NegEx.
# Every pipeline component receives a Doc object and needs to return one.
# To store custom data, spaCy currently has a document-level variable: doc.user_data.
# Negated words are stored there as a set of token indices (token.i) under the 'negated' key.

def negation_tag(doc):
    """Record the indices of negated tokens in ``doc.user_data['negated']``.

    Each sentence is handed to a NegEx ``negTagger`` (with the module-level
    ``irules``) together with the texts of its tokens, excluding those tagged
    ADP or PUNCT. The scopes returned by the tagger are stripped of the
    ``[NEGATED]`` marker, periods and commas, then split into words; every
    token of the sentence whose text equals one of those words has its index
    added to the set.

    Args:
        doc: the document to annotate; it must expose ``sents`` and
            ``user_data``.

    Returns:
        The same ``doc`` object, with ``user_data['negated']`` replaced by a
        fresh set of token indices.
    """
    negated = doc.user_data['negated'] = set()
    for sentence in doc.sents:
        # Candidate phrases: every token text except adpositions and punctuation.
        candidates = {
            token.text for token in sentence
            if token.pos_ not in ('ADP', 'PUNCT')
        }
        neg_tagger = negTagger(sentence=sentence.text, phrases=list(candidates),
                               rules=irules, negP=False)

        negated_texts = set()
        for scope in neg_tagger.getScopes():
            cleaned = scope.replace('[NEGATED]', '').replace('.', '').replace(',', '')
            # split(' ') returns [cleaned] when there is no space, so
            # single-word scopes are handled by the same call.
            negated_texts.update(cleaned.split(' '))

        # Matching is by token text within the sentence.
        negated.update(
            token.i for token in sentence if token.text in negated_texts
        )
    return doc

# Define a new pipeline that includes the negation_tag component.
def custom_pipeline(nlp):
    """Return the pipeline components to run, in order.

    The tagger and parser are taken from the given ``nlp`` object and are
    followed by the ``negation_tag`` component.
    """
    components = (nlp.tagger, nlp.parser, negation_tag)
    return components

# spaCy needs to be re-initialized for the new pipeline to be used.
nlp_neg = spacy.load('en', create_pipeline=custom_pipeline)

In [8]:

# Run the custom pipeline on the same input text, then print every sentence
# that contains at least one negated token, followed by those tokens.
doc2 = nlp_neg(txt)
for sentence in doc2.sents:
    negs = [
        token for token in sentence
        if token.i in doc2.user_data['negated']
    ]
    if negs:
        print(sentence)
        print('Negated words: ', negs)
        print()

Jane is at high risk for flu if she’s not vaccinated.
Negated words:  [vaccinated]

Joe shows no signs of stroke, except for numbness.
Negated words:  [stroke]

Patient denies alcohol abuse.
Negated words:  [alcohol, abuse]

