In [None]:
"""
!pip install spacy
!pip install numpy
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
import spacy.cli
spacy.cli.download("en_core_web_sm")
!pip list
"""
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz

In [None]:
import csv
import numpy as np
import re
import spacy
#import en_core_web_sm
from spacy.lang.en import English
import os
import pickle

In [None]:
#Retrieve all relations present in a file
def file_reader(filename):
    """
    Read a brat standoff .ann file and group its annotations by kind.

    filename : path to the tab-separated .ann file, including the .ann suffix

    Returns a dict with the keys
      "T_list" : one {"Tag", "Value", "Start_Span", "End_Span"} dict per text-bound (T) line;
                 the spans are kept as strings exactly as they appear in the file
      "E_list" : one {"Tag", "Value"} dict per event (E) line
      "R_list" : one {"Tag", "relationType", "arg1", "arg2"} dict per relation (R) line
      "stats"  : counts of the three lists plus the number of relations whose type is 'ADE'

    Blank lines are skipped. Lines starting with any other letter are reported with
    "Something went wrong" and otherwise ignored.
    """
    num_ADE_Relations = 0

    T_list = []
    E_list = []
    R_list = []

    # utf-8 matches how tokenize_file reads the paired .txt file.
    with open(filename, newline='', encoding='utf-8') as csvfile:
        # QUOTE_NONE: the annotated text is free text, so a '"' in it must be kept literally
        # instead of being interpreted as the start of a quoted (possibly multi-line) CSV field.
        reader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)

        for row in reader:
            # csv yields [] for an empty line; row[0][0] would raise IndexError on it
            if not row or not row[0]:
                continue

            tag = row[0]
            firstLetter = tag[0]

            if firstLetter == "T":
                typeAndOffsets = row[1].split()
                start_span = typeAndOffsets[1]
                end_span = typeAndOffsets[2]
                if len(typeAndOffsets) > 3:
                    # Discontinuous annotation ("Type 10 15;20 25"): keep the overall extent,
                    # i.e. the first start offset and the last end offset.
                    end_span = typeAndOffsets[-1]

                value = row[2] if len(row) > 2 else ""
                T_list.append({"Tag": tag, "Value": value, "Start_Span": start_span, "End_Span": end_span})

            elif firstLetter == "E":
                E_list.append({"Tag": tag, "Value": row[1]})

            elif firstLetter == "R":
                relTypeandArgs = row[1].split()
                relationType = relTypeandArgs[0]
                if relationType == 'ADE':
                    num_ADE_Relations += 1
                R_list.append({"Tag": tag, "relationType": relationType, "arg1": relTypeandArgs[1], "arg2": relTypeandArgs[2]})

            else:
                print("Something went wrong")

    stats = {"num_E_tags": len(E_list), "num_T_tags": len(T_list), "num_R_tags": len(R_list), "num_ADE_Relations": num_ADE_Relations}

    return {"T_list": T_list, "E_list": E_list, "R_list": R_list, "stats": stats}



#extract the spans for each ADE relation
def get_ADE_spans(finalList : dict):
    """
    Collect the character span of the trigger text for every ADE relation.

    finalList : the dict returned by file_reader (keys "T_list", "E_list", "R_list")

    For each relation whose relationType is "ADE", the first argument ("Arg1:E<n>") names an
    event; the first T-tag mentioned in that event's value names the text-bound annotation
    whose span is reported.

    Returns a list of {"Start_Span": int, "End_Span": int} dicts, in relation order. Relations
    whose event or T-tag cannot be found contribute nothing.
    """
    T_list = finalList.get("T_list")
    E_list = finalList.get("E_list")
    R_list = finalList.get("R_list")

    # Index both lists by tag once instead of rescanning them for every relation.
    # Lists are kept per tag so repeated tags still yield one span per match, in file order.
    events_by_tag = {}
    for event in E_list:
        events_by_tag.setdefault(event.get("Tag"), []).append(event)

    entities_by_tag = {}
    for entry in T_list:
        entities_by_tag.setdefault(entry.get("Tag"), []).append(entry)

    # \d+ rather than \d{1,3}: a capped pattern would read "T1234" as "T123" and pick the wrong entity
    t_tag_pattern = re.compile(r'T\d+')

    ADE_spans = []
    for relation in R_list:
        if relation.get("relationType") != "ADE":
            continue

        # the relation arguments are E tags written as "Arg1:E<n>"
        arg1 = relation.get("arg1").split(":")[1]

        for event in events_by_tag.get(arg1, []):
            match = t_tag_pattern.search(event.get("Value"))
            if match is None:
                continue
            for entry in entities_by_tag.get(match.group(0), []):
                ADE_spans.append({"Start_Span": int(entry.get("Start_Span")), "End_Span": int(entry.get("End_Span"))})
    return ADE_spans



#Use Spacy to tokenize the file
def tokenize_file(text_file_name : str, spacy_tokenizer_name : str):
    """
    Split a text file into sentences with spaCy.

    text_file_name : the filename of the file being tokenized as a string without the .txt suffix
    spacy_tokenizer_name: the name of the spacy tokenizer being used (e.g. "en_core_web_sm")

    Returns a numpy object array built from three parallel lists, one entry per sentence:
    row 0 holds the sentence text including trailing whitespace, row 1 the character offset
    where the sentence starts and row 2 the character offset where it ends.
    """
    nlp = _get_spacy_pipeline(spacy_tokenizer_name)

    # newline='\n' turns off newline translation, so the text is passed to spaCy with its
    # original line endings and the character offsets refer to the file as stored.
    with open(text_file_name+".txt", 'r', encoding='utf-8', newline='\n') as text_file:
        doc = nlp(text_file.read())

    sentence_texts = []
    sentence_starts = []
    sentence_ends = []
    for sentence in doc.sents:
        sentence_texts.append(sentence.text_with_ws)
        sentence_starts.append(sentence.start_char)
        sentence_ends.append(sentence.end_char)

    return np.array([sentence_texts, sentence_starts, sentence_ends], dtype='object')


# spaCy pipelines that have already been loaded, keyed by model name.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(spacy_tokenizer_name):
    """Return the spaCy pipeline for the given model name, loading it only on first use."""
    # NOTE(review): scispacy is imported but not referenced; presumably it must be imported
    # before a scispaCy model is loaded - confirm before removing.
    import scispacy
    import spacy

    if spacy_tokenizer_name not in _SPACY_PIPELINES:
        # Loading a model is slow; doing it once instead of once per file keeps batch runs fast.
        _SPACY_PIPELINES[spacy_tokenizer_name] = spacy.load(spacy_tokenizer_name)
    return _SPACY_PIPELINES[spacy_tokenizer_name]



#Remove duplicate ADE spans, then put the remaining spans in order, numerically, from lowest to highest start span
def order_list(ADE_spans):
    """
    Return the distinct ADE spans sorted by ascending "Start_Span".

    ADE_spans : list of span dicts; two dicts count as duplicates when all their items are equal
    """
    # Key each span by its (hashable) item set; repeated spans collapse onto one key.
    distinct_spans = {}
    for span in ADE_spans:
        distinct_spans[frozenset(span.items())] = span

    ordered_spans = list(distinct_spans.values())
    ordered_spans.sort(key=lambda span: span['Start_Span'])
    return ordered_spans
 
#validates retrieved strings from get_strings
#checks if there are any elements in the original spacy_arr that aren't present in the ADE strings or noADE strings (or the other way around)
#checks if there are any elements that are present in both ADE strings and noADE strings
#NOTE: the combined length of ADE strings + noADE strings is not compared against the length of the spacy_arr
def validate_get_strings(spacy_arr, ADE_spans, ADE_list, noADE_list):
    """
    Print a report of inconsistencies between the tokenized sentences and their ADE / noADE labels.

    spacy_arr : parallel sequences from tokenize_file; only spacy_arr[0] (the sentence strings) is read
    ADE_spans : unused, kept so existing callers keep working
    ADE_list : list of dicts with a "string" key, the sentences labelled ADE
    noADE_list : list of dicts with a "string" key, the sentences labelled noADE

    Two checks are made, and nothing is printed for a check that passes:
      1. after stripping surrounding whitespace, the sentences in spacy_arr[0] and the labelled
         sentences must be the same set
      2. no sentence string may be labelled both ADE and noADE

    Returns None. The inputs are not modified.
    """
    ADE_strings = [entry.get("string") for entry in ADE_list]
    noADE_strings = [entry.get("string") for entry in noADE_list]

    # Compare whitespace-stripped copies; stripping spacy_arr[0] in place would silently
    # change the caller's array.
    labelled = {str(text).strip() for text in set(ADE_strings) | set(noADE_strings)}
    original = {str(text).strip() for text in spacy_arr[0]}

    # symmetric difference: sentences present on only one side
    mismatched = original ^ labelled
    if len(mismatched) > 0:
        print("===================")
        print("Items in the original set that were not detected as ADE or noADE: ")
        print(len(mismatched))
        for item in mismatched:
            print('\n>>>>>')
            print(item)

    # the overlap check uses the unstripped strings
    in_both = set(ADE_strings).intersection(set(noADE_strings))
    if len(in_both) > 0:
        print("===================")
        print("Items that are in both ADE and noADE sets:")
        print(len(in_both))
        for item in in_both:
            print('\n>>>>>')
            print(item)

            
            
#uses ADE spans to retrieve the full sentences from the spacy tokenized array
def get_strings(spacy_arr, ADE_spans, debug):
    """
    Label every tokenized sentence as ADE or noADE using the ADE character spans.

    spacy_arr : three parallel sequences as returned by tokenize_file; spacy_arr[0] holds the sentence
                strings, spacy_arr[1] their start offsets and spacy_arr[2] their end offsets
    ADE_spans : list of {"Start_Span", "End_Span"} dicts; the walk below assumes they are in
                ascending Start_Span order (see order_list)
    debug : if true, prints a trace of every decision and a summary at the end

    The sentences and the spans are walked in parallel:
      * a span that starts and ends inside the current sentence labels that sentence ADE; every
        following span that also ends inside the same sentence appends the sentence again
      * a span that starts inside the current sentence but ends later labels each sentence it
        passes ADE, up to but not including the sentence in which it ends
      * a sentence in which the pending span does not start is labelled noADE
      * once all spans are consumed, the remaining sentences are labelled noADE

    Returns a dict with
      "ADE_strings"   : list of {"string", "doc_start_span", "doc_end_span"} dicts
      "noADE_strings" : list of dicts with the same three keys
      "num_Multi_Token_ADE_Relations" : number of sentence boundaries crossed while following
                                        spans that do not end in the sentence they start in

    Raises IndexError if a span starts inside a sentence but its end is not found in any
    later sentence.
    """
    ADE_spans_counter = 0
    ADE_strings = []
    noADE_strings = []

    multi_jaunt_counter = 0

    total_sentences = len(spacy_arr[1])
    i = 0
    loop = 0
    while i < total_sentences:

        if ADE_spans_counter == len(ADE_spans):
            #if all ADE strings have been fetched, label the remaining as noADE
            #(same keys as every other entry, so consumers see one dict shape)
            for z in range(i, total_sentences):
                noADE_strings.append({"string":str(spacy_arr[0][z]), "doc_start_span":spacy_arr[1][z], "doc_end_span":spacy_arr[2][z]})
            break

        #spacy_arr[0][i] is the documents ith sentence
        #spacy_arr[1][i] is the documents ith start span
        #spacy_arr[2][i] is the documents ith end span
        ADE_start_span = int(ADE_spans[ADE_spans_counter].get("Start_Span"))
        ADE_end_span = int(ADE_spans[ADE_spans_counter].get("End_Span"))

        doc_start_span = spacy_arr[1][i]
        doc_end_span = spacy_arr[2][i]

        if debug:
            print("\n\nADE Start and end:")
            print(ADE_start_span)
            print(ADE_end_span)
            print("Doc start and end:")
            print(doc_start_span)
            print(doc_end_span)
            print(spacy_arr[0][i])

        if ADE_start_span >= doc_start_span and ADE_start_span <= doc_end_span:

            #if the current ADE span DOES end in the current doc span, then:
            ###Add the current doc span, move to the next ADE span
            if ADE_end_span <= doc_end_span and ADE_end_span >= doc_start_span:
                ADE_dict = {"string":str(spacy_arr[0][i]), "doc_start_span":spacy_arr[1][i], "doc_end_span":spacy_arr[2][i]}
                ADE_strings.append(ADE_dict)
                ADE_spans_counter += 1
                if debug:
                    print("Added ADE span due to beginning and ending in same token")

                #while the next ADE span also ends inside this sentence, add the sentence
                #again and consume that span as well
                while ADE_spans_counter < len(ADE_spans):
                    ADE_start_span = int(ADE_spans[ADE_spans_counter].get("Start_Span"))
                    ADE_end_span = int(ADE_spans[ADE_spans_counter].get("End_Span"))
                    if not (ADE_end_span <= doc_end_span and ADE_end_span >= doc_start_span):
                        break
                    ADE_spans_counter += 1
                    ADE_strings.append(ADE_dict)
                    if debug:
                        print("Added additional ADE span due to ending in the same token")
                i += 1
                continue

            #if the current ADE span DOES NOT end in the current doc span
            ###Add each doc span and increment until the current ADE span ends.
            elif ADE_end_span > doc_end_span:
                multi_jaunt_counter += 1

                while True:
                    doc_start_span = spacy_arr[1][i]
                    doc_end_span = spacy_arr[2][i]
                    ADE_strings.append({"string":str(spacy_arr[0][i]), "doc_start_span":doc_start_span, "doc_end_span":doc_end_span})
                    if debug:
                        print("Added ADE span due to NOT ending in same span")

                        print("doc")
                        print(doc_start_span)
                        print(doc_end_span)
                        print("ADE")
                        print(ADE_start_span)
                        print(ADE_end_span)

                    i += 1
                    loop += 1
                    if i >= total_sentences:
                        #the span's end was not found in any sentence; fail with an explicit
                        #message instead of an anonymous out-of-range index error
                        raise IndexError(
                            "Error with a multi-token ADE span: span "
                            + str(ADE_start_span) + "-" + str(ADE_end_span)
                            + " does not end inside any remaining sentence"
                        )
                    doc_start_span = spacy_arr[1][i]
                    doc_end_span = spacy_arr[2][i]

                    if ADE_end_span <= doc_end_span and ADE_end_span >= doc_start_span:
                        ADE_spans_counter += 1
                        break
                #NOTE(review): the sentence in which the span ends (index i) has not been added
                #here; the outer loop re-examines it against the next ADE span - confirm intended.
                continue

        #if the current span is not an ADE span, add it as a noADE span
        else:
            #NOTE(review): a pending span that starts before this sentence (or between two
            #sentences) can never match, so all remaining sentences land here and "Error!" is
            #printed below - confirm how such spans should be handled.
            noADE_strings.append({"string":str(spacy_arr[0][i]), "doc_start_span":spacy_arr[1][i], "doc_end_span":spacy_arr[2][i]})
        i += 1

    #if the amount of ADE spans collected is not the same as the amount of ADE R tags in the original file
    if len(ADE_spans) != len(ADE_strings):
        #if the amount of ADE spans collected is not the same as the amount of ADE spans found originally
        if len(ADE_spans) != ADE_spans_counter:
            print("Error!")

    if debug:
        print("==============")
        print("ADE Strings:")
        for k in ADE_strings:
            string = k.get("string")
            print("\n\nelement length:")
            print(len(string))
            print("\nelement:")
            print(string)
        print("==============")
        print("Non-ADE Strings:")
        for k in noADE_strings:
            string = k.get("string")
            print("\n\nelement length:")
            print(len(string))
            print("\nelement:")
            print(string)

        print("Total Strings:")
        print(len(spacy_arr[1]))
        print("Total ADE strings collected: ")
        print((len(ADE_strings)))
        print("Total nonADE strings collected: ")
        print(len(noADE_strings))
        print("Total original ADE Relations: ")
        print(len(ADE_spans))
        print("Loops:")
        print(loop)

    validate_get_strings(spacy_arr, ADE_spans, ADE_strings, noADE_strings)
    print(multi_jaunt_counter)
    return({"ADE_strings":ADE_strings, "noADE_strings":noADE_strings, "num_Multi_Token_ADE_Relations":loop})

In [None]:
def BRAT_parse(filename: str, debug : bool, spacy_tokenizer_name : str):
    """
    Run the whole pipeline for one .ann/.txt file pair.

    filename: path to the ann/txt file pair to parse, without any suffix (.txt or .ann)
    debug: passed on to get_strings; if true, status is printed during execution
    spacy_tokenizer_name: name of the spaCy model handed to tokenize_file

    Returns the dict produced by get_strings.
    """
    # 1. annotations -> ordered ADE character spans
    annotations = file_reader(filename + ".ann")
    ordered_spans = order_list(get_ADE_spans(annotations))

    # 2. raw text -> sentences with their character offsets
    sentences = tokenize_file(filename, spacy_tokenizer_name)

    # 3. label each sentence as ADE / noADE
    return get_strings(sentences, ordered_spans, debug)

def export_data(data, path):
    """
    Pickle data to the file path + ".pkl".

    data : any picklable object
    path : target file path without the .pkl suffix

    The parent directory is created when it does not exist yet. An existing file is overwritten.
    """
    target = path + ".pkl"

    parent_directory = os.path.dirname(target)
    if parent_directory:
        os.makedirs(parent_directory, exist_ok=True)

    # the with-block closes the file even when pickling fails part-way
    with open(target, "wb") as f:
        pickle.dump(data, f)

def parse_files(directory, foldername, train_or_validation, debug, spacy_tokenizer_name : str):
    """
    Parse every .ann/.txt pair in one folder and export the result as a pickle file.

    directory : path of the split folder that contains foldername (the value parse_folders passes on)
    foldername : the name of the specific folder being parsed (e.g. physician)
    train_or_validation : name of the split, i.e. the last component of directory
    debug : passed on to BRAT_parse
    spacy_tokenizer_name : name of the spaCy model, also used in the export file name

    The list [results, loop_counter, foldername] is exported to
    <parent of the split folder>/processed_data/<split>3_<foldername>_<tokenizer>.pkl

    Returns the list of per-file result dicts (keys "ADE_strings" and "noADE_strings").
    """
    importPath = os.path.join(directory, foldername)
    ADE_noADE_dict_list = []
    loop_counter = 0

    # sorted: os.listdir order is arbitrary, sorting keeps the exported list reproducible
    for file in sorted(os.listdir(importPath)):
        # splitext only splits off the last suffix, so names such as "note.v2.ann" are kept
        base_name, extension = os.path.splitext(file)
        if extension != '.ann':
            continue

        print(base_name)
        parse_result = BRAT_parse(importPath+'/'+base_name, debug, spacy_tokenizer_name)
        loop_counter += parse_result.pop("num_Multi_Token_ADE_Relations")
        ADE_noADE_dict_list.append(parse_result)

    # Drop only the trailing split folder from the path. A plain str.replace would also remove
    # the split name from any other part of the path (e.g. ".../trainer/train/").
    trimmed_directory = directory.rstrip('/')
    if train_or_validation and os.path.basename(trimmed_directory) == train_or_validation:
        output_root = os.path.join(os.path.dirname(trimmed_directory), '')
    else:
        # directory does not end in the split name: keep the previous behaviour
        output_root = directory.replace(train_or_validation, '')

    exportPath = output_root+'processed_data/'+train_or_validation+'3_' + foldername +'_' + spacy_tokenizer_name
    export_data([ADE_noADE_dict_list, loop_counter, foldername], exportPath)
    print("Total loops in "+ foldername+": "+ str(loop_counter))
    return(ADE_noADE_dict_list)

        
    
#input: a filepath to a folder that contains a /train (or /test) folder holding named folders (e.g. "Physician") of ann txt file pairs
#output: one pkl file per named folder, written to a folder named processed_data at the same level as /train (e.g. processed_data/train3_Physician_<tokenizer name>.pkl)
#each pkl file contains a python list of shape: [ [ {ADE_strings: [...], noADE_strings: [...]}, ... ], loop_counter, foldername ]
def parse_folders(directory, train_or_validation, debug, spacy_tokenizer_name : str):
    """
    Run parse_files on every folder inside directory + train_or_validation + '/'.

    directory : path (ending in a path separator) to the folder that contains the split folder
    train_or_validation : name of the split folder inside directory (e.g. "train"); it is also
                          passed to parse_files, which uses it to build the export path
    debug : passed on to parse_files
    spacy_tokenizer_name : name of the spaCy model, passed on to parse_files

    Returns a dict mapping each folder name to the list returned by parse_files for it.
    """
    split_directory = directory +train_or_validation+'/'
    final_dict = {}

    # sorted for a reproducible processing order
    for foldername in sorted(os.listdir(split_directory)):
        # stray files next to the folders (e.g. .DS_Store) cannot be listed by parse_files
        if not os.path.isdir(os.path.join(split_directory, foldername)):
            continue

        print('\n=============\nFolder name:')
        print(foldername)
        final_dict[foldername] = parse_files(split_directory, foldername, train_or_validation, debug, spacy_tokenizer_name)

    return final_dict


In [None]:
def main(debug: bool, spacy_tokenizer_name : str, data_directory : str = '#should equal a local folder'):
    """
    Process the 'train' and the 'test' split found under data_directory.

    debug : passed on to parse_folders
    spacy_tokenizer_name : name of the spaCy model used for sentence splitting
    data_directory : folder that contains the train/ and test/ folders; parse_folders appends the
                     split name directly, so the path has to end with a path separator.
                     The default is a placeholder and must be replaced by a real local folder.
    """
    for split_name in ('train', 'test'):
        parse_folders(data_directory, split_name, debug, spacy_tokenizer_name)

# Run the pipeline without debug output, using the "en_core_sci_md" model installed in the first cell.
# Alternative with the general-purpose model: main(False, "en_core_web_sm")
main(False, "en_core_sci_md")