In [1]:
import pandas as pd
import os

In [2]:
def ann_to_dict(nct_id): #took out file path arg
    '''
    nct_id: file name before .ann or .txt
    file_path: pathname to where .ann and .txt files are stored
    return: formatted dictionary
    '''
    
    # change directory
    #os.chdir(file_path)
    
    # read in files
    with open(nct_id+'.ann') as f1:
        ann=f1.read()
        f1.close()
    with open(nct_id+'.txt') as f2:
        txt=f2.read()
        f2.close()
        
    # ANN FILE MANIPULATION
    lines=ann.split('\n')
    ents=[]
    for i in range(len(lines)): 
        line=lines[i].split('\t') 
        if 'T' in line[0]:
            try: 
                start=txt.index(line[2])
                end= start+len(line[2])
                label=line[1].split(' ')[0]
                ents.append((start, end, label))
            except:
                None
            
    content={'entities': ents, 'text': txt}
    
    return content

In [6]:
import scispacy #
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.load("en_core_sci_sm") #
doc_bin_train = DocBin() # create a DocBin object for train data
doc_bin_test=DocBin() # create a new DocBin object for test data



In [7]:
os.chdir('/Users/meldrumapple/Desktop/Capstone/chia_with_scope') # change to wherever you have the chia_with_scope folder downloaded
doc_list=os.listdir()
for i in range(len(doc_list)):
    doc_list[i]=doc_list[i][0:-4]
doc_list=[x for x in doc_list if ('NCT' and 'inc') in x] # change this for LCT 

In [8]:
from spacy.util import filter_spans
import random
import re

In [11]:
l=len(doc_list)
test_idx=random.sample(doc_list, l//6)
train_idx = [x for x in doc_list if x not in test_idx]

print('Train Set:')
for d in train_idx:
    #print(d)
    try:
        doc_dict=ann_to_dict(d)
        text = doc_dict['text']
        labels = doc_dict['entities']
        doc = nlp.make_doc(text) 
        ents = []
        for start, end, label in labels:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is not None:
                #print(span)
                ents.append(span)
            else:
                None
        filtered_ents=filter_spans(ents)
        doc.ents=filtered_ents
        doc_bin_train.add(doc)
        
    except: 
        train_idx.remove(d) #
print(train_idx) #

print('Test Set:')
for d in test_idx:
    try:
        doc_dict=ann_to_dict(d)
        text = doc_dict['text']
        labels = doc_dict['entities']
        doc = nlp.make_doc(text) 
        ents = []
        for start, end, label in labels:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            #print(span)
            if span is not None:
                ents.append(span)
            else:
                None
        filtered_ents=filter_spans(ents)
        doc.ents=filtered_ents
        doc_bin_test.add(doc)
    except: 
        test_idx.remove(d) #
print(test_idx) #

Train Set:
['NCT03233880_inc', 'NCT03463564_inc', 'NCT02364648_inc', 'NCT02222272_inc', 'NCT01352598_inc', 'NCT02678377_inc', 'NCT01051414_inc', 'NCT02056288_inc', 'NCT03026465_inc', 'NCT03089086_inc', 'NCT02802644_inc', 'NCT02957305_inc', 'NCT03364036_inc', 'NCT01531257_inc', 'NCT02704754_inc', 'NCT01866800_inc', 'NCT02746900_inc', 'NCT03318874_inc', 'NCT02637076_inc', 'NCT02243553_inc', 'NCT02678663_inc', 'NCT01909934_inc', 'NCT02920177_inc', 'NCT03297021_inc', 'NCT00312429_inc', 'NCT01997112_inc', 'NCT02205931_inc', 'NCT02112734_inc', 'NCT03004209_inc', 'NCT02973035_inc', 'NCT02564471_inc', 'NCT03297944_inc', 'NCT02596555_inc', 'NCT01228279_inc', 'NCT00917891_inc', 'NCT03639519_inc', 'NCT02777580_inc', 'NCT03131050_inc', 'NCT03228498_inc', 'NCT00396734_inc', 'NCT02863120_inc', 'NCT02944292_inc', 'NCT02566226_inc', 'NCT02607748_inc', 'NCT02664558_inc', 'NCT02965443_inc', 'NCT02312960_inc', 'NCT01907230_inc', 'NCT02833623_inc', 'NCT03194074_inc', 'NCT02118467_inc', 'NCT02668016_inc', 

In [12]:
# make yourself a new folder somewhere to store model information 
os.chdir('/Users/meldrumapple/Desktop/Capstone/chia_mod') #change your directory to this folder
doc_bin_train.to_disk('training_data.spacy'); # save the docbin objects to this folder
doc_bin_test.to_disk('test_data.spacy');


Next you need to make a new file with the following text (just create a new text file, copy it in). 

Do this in your chia_mod folder or wherever you saved the training data. 

rename the txt file to 'base_config.cfg'

Next in the train and test lines at the top, you need to replace 'null' with your pathname (as a string!) to your training and testing files, so mine looks like: 

Then you need to open a terminal in jupyter lab, and change your directory to where you stored those files and your config files: 

Then you need to run the following line in your terminal for this notebook, so start a jupyter lab terminal. 

Then if that works run this in the same terminal (this step takes a long time): 

python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./test_data.spacy 

If all the terminal stuff works, try running this to test it: 

In [13]:
import spacy 
import os
os.chdir('/Users/meldrumapple/Desktop/Capstone/chia_mod')
nlp_ner = spacy.load("model-last")

doc= nlp_ner(
    '''
    Patients with biopsy-proven metastatic carcinoid tumors or other neuroendocrine tumors (Islet cell, Gastrinomas and VIPomas) with at least one measurable lesion (other than bone) that has either not been previously irradiated or if previously irradiated has demonstrated progression since the radiation therapy 
The patient has no major impairment of renal or hepatic function, as defined by the following laboratory parameters: total bilirubin <1.5 X ULN; AST, ALT<2.5X ULN (<5 X ULN if liver metastases are present) 
Patients on Sandostatin Lar (long acting somatostatin analogue) must be on a stable dose for 30 days prior to study entry and short acting somatostatin analogues must be judged to be on a clinically stable dose by the investigator prior to study entry 
Must have a life expectancy of greater than three (3) months 
Karnofsky Performance Status > 60 
Female patients must have a negative serum pregnancy test at screening. (Not applicable to patients with bilateral oophorectomy and/or hysterectomy or to those patients who are postmenopausal.) 
'''
)

spacy.displacy.render(doc, style="ent", jupyter=True)


In [14]:
doc1=nlp_ner('''
             Inclusion Criteria:
  -  Patients of both sexes with a diagnosis of stable CAD as follows: acute myocardial infarction or percutaneous coronary revascularization or coronary artery bypass graft surgery occured at least 12 months before entering the study or chronic stable angina.
Exclusion Criteria:
  -  Mental illness limiting the capacity of self-care or any condition limiting seriously the life expectancy less than 12 months.''')

spacy.displacy.render(doc1, style="ent", jupyter=True)

In [15]:
doc2=nlp_ner('''
Inclusion Criteria:
  -  BMI 30-40 kg/m2
  -  BMI 18-25 kg/m2; those with BMI up to 28kg/m2 will be included if waist circumference is <96cm.
  -  Males
  -  Aged 18-60yrs
Exclusion Criteria:
  -  Acute illness in the preceding 6 weeks
  -  Taking regular medication
  -  History of deep vein thrombosis or clotting disorders
  -  Hypertension
  -  Diabetes
  -  Any clinically significant findings at screening
  -  History of substance abuse
  -  Demonstrating factors precluding safe MRI
  -  History of gastrointestinal motility disorders (e.g. gastroesophageal reflux disease -irritable bowel syndrome, gastroparesis, sphincter of Oddi dysfunction, etc.)
  -  Previous thoracic or abdominal surgery.
  -  Those who report having ≤3 bowel movements/week or >2/day.)
''')
spacy.displacy.render(doc2, style="ent", jupyter=True)

In [16]:
doc4=nlp_ner('''
Inclusion Criteria:
  -  Participants will be male and female subjects who suffered total acute Achilles tendon rupture, and which underwent surgical repair. In addition, to participate in this study all volunteers will need to present medical and/or physiotherapeutic release for physical/sports activities practice.
Exclusion Criteria:
  -  Volunteers that did not have Achilles tendon surgical reconstruction, that did not present medical and/or physiotherapeutic release for physical/sports activities, who have participated in strength training program for the plantar flexors in the last 6 months, patients with diabetic diseases, as well as those with difficulty for understanding 
  and/or executing the test and training protocols in the isokinetic dynamometer will be excluded.
''')
spacy.displacy.render(doc4, style="ent", jupyter=True)

In [17]:
doc5=nlp_ner('''
Inclusion Criteria (ESRD group):
  -  Age ≥ 18 years,
  -  History of biopsy-proven lupus nephritis
  -  All classes of lupus nephritis
  -  Written informed consent
  -  Affiliation to a social security regime
Exclusion Criteria (ESRD group):
  -  Past-history of kidney transplantation
  -  Active infection
  -  Active allergy (such as hay fever)
  -  Pregnant or breastfeeding women
  -  Protected adults (individuals under guardianship by court order)
Inclusion criteria (non ESRD group):
  -  Age ≥ 18 years,
  -  Biopsy proven ACTIVE lupus nephritis
  -  First event of lupus nephritis
  -  No immunosuppressive therapy (including corticosteroids)
  -  Written informed consent
  -  Affiliation to a social security regime
Exclusion criteria (non ESRD group) :
  -  Immunosuppressive treatment
  -  Relapse of lupus nephritis under treatment
  -  Pregnant or breastfeeding women
  -  Protected adults (individuals under guardianship by court order)
''')
spacy.displacy.render(doc5, style="ent", jupyter=True)

In [18]:
# model-last with this run has 92% accuracy on the test set