In [43]:
import pandas as pd
import os
import random

In [44]:
# Define ANN to Dict function to convert ANN files, works for both corpera
def ann_to_dict(nct_id): #took out file path arg
    '''
    nct_id: file name before .ann or .txt
    file_path: pathname to where .ann and .txt files are stored
    return: formatted dictionary
    '''
    
    # change directory
    #os.chdir(file_path)
    
    # read in files
    with open(nct_id+'.ann') as f1:
        ann=f1.read()
        f1.close()
    with open(nct_id+'.txt') as f2:
        txt=f2.read()
        f2.close()
        
    # ANN FILE MANIPULATION
    lines=ann.split('\n')
    ents=[]
    for i in range(len(lines)): 
        line=lines[i].split('\t') 
        if 'T' in line[0]:
            try: 
                start=txt.index(line[2])
                end= start+len(line[2])
                label=line[1].split(' ')[0]
                ents.append((start, end, label))
            except:
                None
            
    content={'entities': ents, 'text': txt}
    
    return content

In [45]:
# Get list of LCT docs
os.chdir('/Users/meldrumapple/Desktop/Capstone/lct_corpus') 
doc_list_lct=os.listdir()
for i in range(len(doc_list_lct)):
    doc_list_lct[i]=doc_list_lct[i][0:-4]
doc_list_lct=list(set([x for x in doc_list_lct if ('NCT') in x])) 
len(doc_list_lct)

1006

In [46]:
# Get List of Chia Docs
os.chdir('/Users/meldrumapple/Desktop/Capstone/chia_noscope_corpus') 
doc_list_chia=os.listdir()
for i in range(len(doc_list_chia)):
    doc_list_chia[i]=doc_list_chia[i][0:-8]
doc_list_chia=list(set([x for x in doc_list_chia if ('NCT') in x])) 
len(doc_list_chia)

1000

In [47]:
import scispacy 
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.load("en_core_sci_sm") #Load in scispacy small as initial model
doc_bin_train = DocBin() # create a DocBin object for train data
doc_bin_test=DocBin() # create a new DocBin object for test data

In [48]:
#define helper functions that maps lct/chia labels to our labels
fail_list_chia=[] #so we can see which entities aren't being included
def chia_map(label):
    '''
    label (str): the chia label to be mapped to our label system
    '''
    
    map_dict={
    'Person':'Person',
    'Condition':'Condition',
    'Drug':'Drug',
    'Observation':'Observation',
    'Measurement':'Observation',
    'Procedure':'Procedure',
    'Device':'Procedure',
    'Visit': 'Encounter',
    'Negation':'Negation',
    'Qualifier':'Qualifier',
    'Temporal':'Temporal',
    'Value':'Value',
    'Multiplier':'Qualifier',
    'Reference_point':'Reference_point',
    'Mood':'Indication',
    'Post-eligibility':'Consent',
    'Pregnancy_considerations':'Demographic',
    'Informed_consent':'Consent'
    }
    
    try: 
        new_lab=map_dict[label]
    except:
        new_lab='fail'
        if label not in fail_list_chia:
            fail_list_chia.append(label)
    return new_lab

fail_list_lct=[]
def lct_map(label):
    '''
    label (str): the lct label to be mapped to our label system
    '''
    
    map_dict={
    'Allergy':'Condition',
    'Condition':'Condition',
    'Contraindication':'Indication',
    'Drug':'Drug',
    'Encounter':'Encounter',
    'Indication':'Indication',
    'Immunization':'Drug',
    'Observation':'Observation',
    'Procedure':'Procedure',
    'Age':'Demographic',
    'Birth':'Observation',
    'Death':'Observation',
    'Ethnicity':'Demographic',
    'Family-member':'Observation',
    'Language':'Demographic',
    'Life-Stage-And-Gender':'Demographic',
    'Exception':'Negation',
    'Negation':'Negation',
    'Acuteness':'Qualifier',
    'Assertion':'Indication',
    'Modifier':'Qualifier',
    'Polarity':'Qualifier',
    'Risk':'Qualifier',
    'Severity':'Qualifier',
    'Stability':'Qualifier',
    'Eq-Comparison':'Value',
    'Eq-Temporal-Period':'Temporal',
    'Eq-Temporal-Recency':'Temporal',
    'Eq-Temporal-Unit':'Temporal',
    'Eq-Unit':'Value'
    }
    
    try: 
        new_lab=map_dict[label]
    except:
        new_lab='fail'
        if label not in fail_list_lct:
            fail_list_lct.append(label)
    return new_lab

In [49]:
#define helper function to add docs to bin
def doc_to_bin(doc, bin_obj, chia=False):
    '''
    doc (str): name of doc to add to bin
    bin_obj (spacy bin object): the bin object to add doc spans to
    chia (bool): is this a chia doc
    '''
    doc_dict=ann_to_dict(doc)
    text = doc_dict['text']
    labels = doc_dict['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        if chia==False:
            label=lct_map(label)
        if chia==True: 
            label=chia_map(label)
        if label != 'fail':
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is not None:
                #print(span)
                ents.append(span)
    #print(ents)
    filtered_ents=spacy.util.filter_spans(ents)
    doc.ents=filtered_ents
    bin_obj.add(doc)
    return None

In [50]:
# add lct trials to doc bins
os.chdir('/Users/meldrumapple/Desktop/Capstone/lct_corpus') 
n_lct=len(doc_list_lct)
test_idx_lct=random.sample(doc_list_lct, n_lct//6)
train_idx_lct = [x for x in doc_list_lct if x not in test_idx_lct]

print('Train Set LCT:')
for d in train_idx_lct:
    try: 
        doc_to_bin(d, doc_bin_train)
    except: 
        train_idx_lct.remove(d)
print(train_idx_lct)

print('Test Set LCT:')
for d in test_idx_lct:
    try: 
        doc_to_bin(d, doc_bin_test)
    except: 
        test_idx_lct.remove(d)
print(test_idx_lct)

print('Failed LCT Labels')
print(fail_list_lct)

# add chia trials to doc bins
os.chdir('/Users/meldrumapple/Desktop/Capstone/chia_noscope_corpus') 
n_chia=len(doc_list_chia)
test_idx_chia=random.sample(doc_list_chia, n_chia//6)
train_idx_chia = [x for x in doc_list_chia if x not in test_idx_chia]

print('Train Set Chia:')
for d in train_idx_chia:
    #try: 
    doc_to_bin(d+'_inc', doc_bin_train, chia=True)
    doc_to_bin(d+'_exc', doc_bin_train, chia=True)
    #except: 
        #train_idx_chia.remove(d)
print(train_idx_chia)

print('Test Set Chia:')
for d in test_idx_chia:
    #try: 
    doc_to_bin(d+'_inc', doc_bin_test, chia=True)
    doc_to_bin(d+'_exc', doc_bin_test, chia=True)
    #except: 
        #test_idx_chia.remove(d)
print(test_idx_chia)

print('Failed Chia Labels:')
print(fail_list_chia)

Train Set LCT:
['NCT03862027', 'NCT03865108', 'NCT03922568', 'NCT03929744', 'NCT03921112', 'NCT03864965', 'NCT03924999', 'NCT03927391', 'NCT03924375', 'NCT03921177', 'NCT03920020', 'NCT03860974', 'NCT03869515', 'NCT03861182', 'NCT03863535', 'NCT03867500', 'NCT03868371', 'NCT03862495', 'NCT03867552', 'NCT03930901', 'NCT03866655', 'NCT03923075', 'NCT03926637', 'NCT03862235', 'NCT03860454', 'NCT03921710', 'NCT03861468', 'NCT03869112', 'NCT03921606', 'NCT03868527', 'NCT03921970', 'NCT03862846', 'NCT03862456', 'NCT03925168', 'NCT03923192', 'NCT03862534', 'NCT03923478', 'NCT03868813', 'NCT03926806', 'NCT03925402', 'NCT03922230', 'NCT03869008', 'NCT03929692', 'NCT03920540', 'NCT03866291', 'NCT03928366', 'NCT03928769', 'NCT03921190', 'NCT03922581', 'NCT03863678', 'NCT03922113', 'NCT03920917', 'NCT03861325', 'NCT03869723', 'NCT03861923', 'NCT03925584', 'NCT03920033', 'NCT03928834', 'NCT03869086', 'NCT03861195', 'NCT03928613', 'NCT03864289', 'NCT03927066', 'NCT03929315', 'NCT03925103', 'NCT03860

In [51]:
# make yourself a new folder somewhere to store model information 
os.chdir('/Users/meldrumapple/Desktop/Capstone/mod_lct_chia') #change your directory to this folder
doc_bin_train.to_disk('training_data.spacy'); # save the docbin objects to this folder
doc_bin_test.to_disk('test_data.spacy');

In [10]:
import spacy 
import os
os.chdir('/Users/meldrumapple/Desktop/Capstone/lct_chia_mod')
nlp_ner = spacy.load("model-best")

doc= nlp_ner(
    '''
    Patients with biopsy-proven metastatic carcinoid tumors or other neuroendocrine tumors (Islet cell, Gastrinomas and VIPomas) with at least one measurable lesion (other than bone) that has either not been previously irradiated or if previously irradiated has demonstrated progression since the radiation therapy 
The patient has no major impairment of renal or hepatic function, as defined by the following laboratory parameters: total bilirubin <1.5 X ULN; AST, ALT<2.5X ULN (<5 X ULN if liver metastases are present) 
Patients on Sandostatin Lar (long acting somatostatin analogue) must be on a stable dose for 30 days prior to study entry and short acting somatostatin analogues must be judged to be on a clinically stable dose by the investigator prior to study entry 
Must have a life expectancy of greater than three (3) months 
Karnofsky Performance Status > 60 
Female patients must have a negative serum pregnancy test at screening. (Not applicable to patients with bilateral oophorectomy and/or hysterectomy or to those patients who are postmenopausal.) 
'''
)

spacy.displacy.render(doc, style="ent", jupyter=True)
