In [2]:
import pandas as pd
import os

In [3]:
def ann_to_dict(nct_id): #file_path
    '''
    nct_id: file name before .ann or .txt
    file_path: pathname to where .ann and .txt files are stored
    return: formatted dictionary
    '''
    
    # change directory
    #os.chdir(file_path)
    
    # read in files
    with open(nct_id+'.ann') as f1:
        ann=f1.read()
        f1.close()
    with open(nct_id+'.txt') as f2:
        txt=f2.read()
        f2.close()
        
    # ANN FILE MANIPULATION
    lines=ann.split('\n')
    ents=[]
    for i in range(len(lines)): 
        lines[i]=lines[i].split('\t') 
        if 'T' in lines[i][0]: 
            lines[i]=lines[i][1:] 
            x=lines[i][0].split(' ')
            x.append(lines[i][1])
            x= [k for k in x if ';' not in k]
            x=tuple([int(x[1]), int(x[2]), x[0]])
            ents.append(x)

    # put together into dict: 
    content={'entities': ents, 'text': txt}
        
    return content

In [4]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

nlp = spacy.blank("en") # load a new spacy model
doc_bin = DocBin() # create a DocBin object

In [5]:
os.chdir('/Users/meldrumapple/Desktop/Capstone/chia_with_scope') # change to wherever you have the chia_with_scope folder downloaded
doc_list=os.listdir()
for i in range(len(doc_list)):
    doc_list[i]=doc_list[i][0:-4]
doc_list=[x for x in doc_list if 'NCT' in x]

In [6]:
from spacy.util import filter_spans

for doc in doc_list[0:10]:
    print(doc)
    doc_dict=ann_to_dict(doc)
    text = doc_dict['text']
    labels = doc_dict['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

NCT01352598_exc
NCT00894712_inc
NCT00344318_inc
NCT01051414_exc
NCT02056288_exc
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
NCT03233880_inc
NCT03463564_inc
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
NCT02678377_exc
Skipping entity
Skipping entity
NCT02242188_inc
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
NCT02167022_inc


In [None]:
os.chdir('/Users/meldrumapple/Desktop/Capstone/chia_mod') # make yourself a new folder somewhere to store model information 
doc_bin.to_disk("training_data.spacy"); # save the docbin object

Next you need to make a new file with the following text (just create a new text file, copy it in). 

Do this in your chia_mod folder or wherever you saved the training data

rename the txt file to 'base_config.cfg'

Next in the line 'train=', you need to replace null with your pathname (as a string!) to your base config file, so mine looks like: 

Then you need to run the following line in your anaconda terminal, so click on 'environments' on the far left  of your anaconda app and then on pymc3_env or whatever your current environment with spacy installed is. Hit the green play button and click 'open terminal'. Run this line there: 

Then if that works run this in the same terminal: 

python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy 

If all the terminal stuff works, try running this to test it: 

In [None]:
nlp_ner = spacy.load("model-best")

doc= nlp_ner(
    '''
    Inclusion Criteria:
  -  Age 13-17 years or 18-20 if still living at home with parent.
  -  Agreement of at least one responsible adult parent/caregiver to participate in treatment.
  -  Agreement to take part in assessments, videotaping/audiotaping and coding of their sessions by research personnel.
  -  Agreement to pay for mental health services at the DBT- RU on a sliding scale, and to participate in research assessments as volunteers.
  -  Residence within commuting distance of clinic (< 45 minutes).
  -  Agreement to discontinue other forms of talk therapy for duration of DBT program (does not refer to AA/NA programs or psychotropic medication management).
  -  Exhibits dysregulation within the past six months as evidenced by 1) intentional self-injury and/or suicide attempt and/or 2) substance use disorder.
  -  Meets two additional criteria for borderline personality disorder (BPD).
Exclusion Criteria:
  -  Clients who need mental health services not available at the DBT- RU, such as treatment for schizophrenia or life-threatening anorexia, or who are currently obtaining optimum professional treatment that should not be ended.
  -  Non-English speaking.
  -  IQ < 70.
  -  Unable to understand research consent forms.
  -  Court-ordered to participate in treatment.'''
)

spacy.displacy.render(doc, style="ent", jupyter=True)
