# Converting Brat ANN files to SpaCy-format dictionaries

Author: Kate Meldrum kmm4ap@virginia.edu

Description: A function that converts a brat standoff annotation (.ann) file, together with its source .txt file, into a dictionary in the spaCy training format.

Need to know: 
- each .ann file and its matching .txt file share the same base name, which is an NCT ID (e.g. NCT03860038.ann and NCT03860038.txt)
- the .ann and .txt files must be in the same folder

In [11]:
import pandas as pd
import os

In [15]:
def ann_to_dict(nct_id, file_path):
    '''
    Convert one brat .ann/.txt file pair into a spaCy-style training dictionary.

    nct_id: file name before .ann or .txt
    file_path: pathname to where .ann and .txt files are stored
    return: formatted dictionary of the form
            {'entities': [(start, end, label), ...], 'text': <contents of the .txt file>}

    Only text-bound annotations (lines whose ID starts with 'T') are converted;
    every other annotation line is ignored. For a discontinuous annotation
    (offset fragments separated by ';'), the entity runs from the start of the
    first fragment to the end of the last fragment.

    Raises FileNotFoundError if either file is missing, and ValueError if a
    text-bound annotation has offsets that are not integers.
    '''

    # Build explicit paths instead of calling os.chdir(): changing the working
    # directory is a process-wide side effect, and it made a second call with a
    # relative file_path resolve against the wrong folder.
    ann_path = os.path.join(file_path, nct_id + '.ann')
    txt_path = os.path.join(file_path, nct_id + '.txt')

    # read in files (the with-statement closes them; the encoding is pinned so
    # character offsets do not depend on the machine's locale)
    with open(ann_path, encoding='utf-8') as f1:
        ann = f1.read()
    with open(txt_path, encoding='utf-8') as f2:
        txt = f2.read()

    # ANN FILE MANIPULATION
    ents = []
    for line in ann.split('\n'):
        # columns: ID <tab> "Label start end[;start end ...]" <tab> covered text
        fields = line.split('\t')
        if len(fields) < 2 or not fields[0].startswith('T'):
            continue
        parts = fields[1].split()
        # parts[1] is the first start offset and parts[-1] the last end offset,
        # which also collapses ';'-separated fragments into one covering span.
        ents.append((int(parts[1]), int(parts[-1]), parts[0]))

    # put together into dict:
    return {'entities': ents, 'text': txt}

In [16]:
# Example: convert one annotated trial stored in the batch1 folder.
pathname = '/Users/meldrumapple/Desktop/Capstone/lct_corpus/batch1'
nct_id = 'NCT03860038'
ann_to_dict(nct_id=nct_id, file_path=pathname)

{'entities': [(580, 600, 'Procedure'),
  (580, 600, 'Procedure-Name'),
  (637, 657, 'Procedure'),
  (637, 657, 'Procedure-Name'),
  (186, 205, 'Condition'),
  (147, 154, 'Condition'),
  (502, 520, 'Procedure'),
  (502, 520, 'Procedure-Name'),
  (899, 911, 'Procedure'),
  (899, 911, 'Procedure-Name'),
  (839, 846, 'Procedure'),
  (839, 846, 'Procedure-Name'),
  (43, 49, 'Life-Stage-And-Gender'),
  (35, 39, 'Life-Stage-And-Gender'),
  (207, 209, 'Condition'),
  (29, 30, 'Eq-Operator'),
  (254, 266, 'Eq-Operator'),
  (658, 664, 'Eq-Operator'),
  (925, 931, 'Eq-Operator'),
  (269, 275, 'Eq-Temporal-Unit'),
  (667, 673, 'Eq-Temporal-Unit'),
  (934, 939, 'Eq-Temporal-Unit'),
  (25, 28, 'Age'),
  (31, 33, 'Eq-Value'),
  (267, 268, 'Eq-Value'),
  (373, 374, 'Eq-Value'),
  (665, 666, 'Eq-Value'),
  (932, 933, 'Eq-Value'),
  (943, 944, 'Eq-Value'),
  (29, 33, 'Eq-Comparison'),
  (254, 275, 'Eq-Comparison'),
  (658, 673, 'Eq-Comparison'),
  (925, 939, 'Eq-Comparison'),
  (89, 91, 'Condition'),
  

In [17]:
# Example: convert one annotated trial stored in the batch20 folder.
pathname = '/Users/meldrumapple/Desktop/Capstone/lct_corpus/batch20'
nct_id = 'NCT03920748'
ann_to_dict(nct_id=nct_id, file_path=pathname)

{'entities': [(97, 107, 'Procedure'),
  (97, 107, 'Procedure-Name'),
  (158, 165, 'Eq-Comparison'),
  (158, 165, 'Eq-Temporal-Period'),
  (207, 214, 'Eq-Comparison'),
  (207, 214, 'Eq-Temporal-Period'),
  (279, 286, 'Eq-Comparison'),
  (279, 286, 'Eq-Temporal-Period'),
  (59, 69, 'Eq-Comparison'),
  (59, 69, 'Eq-Temporal-Period'),
  (25, 32, 'Life-Stage-And-Gender'),
  (37, 43, 'Life-Stage-And-Gender'),
  (79, 86, 'Procedure'),
  (79, 86, 'Procedure-Name'),
  (70, 78, 'Modifier'),
  (114, 132, 'Procedure'),
  (114, 132, 'Procedure-Name'),
  (180, 200, 'Condition'),
  (180, 200, 'Condition-Name'),
  (169, 179, 'Modifier'),
  (218, 240, 'Condition'),
  (218, 240, 'Condition-Name'),
  (242, 247, 'Condition'),
  (242, 247, 'Condition-Name'),
  (249, 266, 'Condition'),
  (249, 266, 'Condition-Name'),
  (304, 310, 'Observation'),
  (304, 310, 'Observation-Name'),
  (299, 303, 'Modifier'),
  (290, 294, 'Modifier'),
  (312, 320, 'Condition'),
  (312, 320, 'Condition-Name'),
  (322, 330, 'Condi