**Trigger Identification**

- Trigger identification is the main task in the Reinforcement Learning part of the implementation
- This step reads the sentences in the text, breaks each sentence into tokens, and uses the embeddings generated by the KB Augmented Representation module to detect whether a given token in a sentence is a trigger.
- The set of possible actions includes all identified event types plus an additional 'None' type for tokens that are not triggers.
- We will use a few classes that read and parse the input data and prepare the text and the provided annotations for the RL environment.


In [1]:
class TextFileReader:
    """Load the complete contents of a UTF-8 encoded text file."""

    def __init__(self, filePath):
        # Only the path is stored here; the file is opened on each readText().
        self.filePath = filePath

    def readText(self):
        """Return everything in the file at ``self.filePath`` as one string."""
        with open(self.filePath, encoding='utf-8') as source:
            contents = source.read()
        return contents

In [2]:
class AnnotationReader:
    """Read and parse a standoff annotation (``.ann``) file.

    Two kinds of lines are understood; every other line is ignored:

    * ``T`` lines: ``<id>\\t<type> <start> <end>\\t<text>``
    * ``E`` lines: ``<id>\\t<type>:<triggerId> [<role>:<argId> ...]``
    """

    def __init__(self, filePath):
        # Only the path is stored; the file is opened by readAnnotations().
        self.filePath = filePath

    def readAnnotations(self):
        """Return the whole annotation file as a single UTF-8 decoded string."""
        with open(self.filePath, 'r', encoding='utf-8') as file:
            return file.read()

    def parseAnnotations(self, content):
        """Parse annotation text into entity and event dictionaries.

        Malformed ``T``/``E`` lines are skipped instead of raising, and
        events are kept whatever their number of arguments (zero, one, two
        or more).

        Args:
            content: Full text of an annotation file.

        Returns:
            A ``(entities, events)`` tuple where

            * ``entities[id]`` is ``{'type': str, 'span': (start, end)}``;
              for a discontinuous annotation (``;``-separated fragments) the
              span runs from the first fragment's start to the last
              fragment's end.
            * ``events[id]`` is ``{'type': str, 'trigger': str,
              'args': [[role, argId], ...]}``.
        """
        entities = {}
        events = {}

        for rawLine in content.strip().split('\n'):
            # Tolerate Windows line endings so ids never carry a stray '\r'.
            line = rawLine.rstrip('\r')
            if line.startswith('T'):
                parsed = self._parseEntityLine(line)
                if parsed is not None:
                    entities[parsed[0]] = parsed[1]
            elif line.startswith('E'):
                parsed = self._parseEventLine(line)
                if parsed is not None:
                    events[parsed[0]] = parsed[1]
        return entities, events

    @staticmethod
    def _parseEntityLine(line):
        """Return ``(entityId, entityDict)`` for a ``T`` line, or None if malformed."""
        parts = line.split('\t')
        if len(parts) != 3:
            return None
        entityId, entityInfo, _ = parts
        entityType, _, offsets = entityInfo.partition(' ')
        try:
            fragments = [
                tuple(int(value) for value in fragment.split())
                for fragment in offsets.split(';')
            ]
        except ValueError:
            # Non-numeric offset.
            return None
        if any(len(fragment) != 2 for fragment in fragments):
            # Each fragment must be exactly "<start> <end>".
            return None
        span = (fragments[0][0], fragments[-1][1])
        return entityId, {'type': entityType, 'span': span}

    @staticmethod
    def _parseEventLine(line):
        """Return ``(eventId, eventDict)`` for an ``E`` line, or None if malformed."""
        parts = line.split('\t')
        if len(parts) != 2:
            return None
        eventId, eventInfo = parts
        tokens = eventInfo.split()
        if not tokens:
            return None
        typeAndTrigger = tokens[0].split(':')
        if len(typeAndTrigger) < 2:
            # No "<type>:<triggerId>" head, so there is no trigger to record.
            return None
        eventArgs = [arg.split(':') for arg in tokens[1:]]
        return eventId, {
            'type': typeAndTrigger[0],
            'trigger': typeAndTrigger[1],
            'args': eventArgs,
        }
    
    

In [3]:
# Load and parse the standoff annotations of one sample document (PMID-1869637).
annotationReader = AnnotationReader('../BME Corpora/MLEE-1.0.2-rev1/standoff/full/PMID-1869637.ann')
annotationContent = annotationReader.readAnnotations()
entities, events = annotationReader.parseAnnotations(annotationContent)

# Load the raw text of the same document (same file stem, .txt extension).
textReader = TextFileReader('../BME Corpora/MLEE-1.0.2-rev1/standoff/full/PMID-1869637.txt')
text = textReader.readText()

# Print the parsed dictionaries for a quick visual check.
print(entities)
print(events)

{'T2': {'type': 'Cell', 'span': (46, 62)}, 'T3': {'type': 'Multi-tissue_structure', 'span': (76, 84)}, 'T4': {'type': 'Anatomical_system', 'span': (217, 239)}, 'T5': {'type': 'Cell', 'span': (249, 260)}, 'T6': {'type': 'Multi-tissue_structure', 'span': (275, 288)}, 'T7': {'type': 'Multi-tissue_structure', 'span': (396, 404)}, 'T9': {'type': 'Cell', 'span': (474, 490)}, 'T10': {'type': 'Multi-tissue_structure', 'span': (623, 631)}, 'T11': {'type': 'Tissue', 'span': (659, 676)}, 'T12': {'type': 'Gene_or_gene_product', 'span': (712, 723)}, 'T13': {'type': 'Gene_or_gene_product', 'span': (1075, 1083)}, 'T14': {'type': 'Cell', 'span': (1088, 1099)}, 'T15': {'type': 'Cellular_component', 'span': (1156, 1176)}, 'T16': {'type': 'Cell', 'span': (1181, 1197)}, 'T17': {'type': 'Tissue', 'span': (1275, 1286)}, 'T18': {'type': 'Tissue', 'span': (1314, 1325)}, 'T19': {'type': 'Cell', 'span': (1394, 1411)}, 'T21': {'type': 'Cell', 'span': (1442, 1458)}, 'T22': {'type': 'Gene_or_gene_product', 'span':