**Parsing annotations for Entities and Events from the .ann files**

- This class handles reading the .ann files that are provided in the standoff/full directory
- .ann files have both entities and events
- Each line that describes an entity starts with `T`
- Each line that describes an event starts with `E`


In [1]:
import os
import json

In [2]:
class AnnotationReader:
    """Read standoff ``.ann`` files and extract their entities and events.

    Two kinds of lines are recognised:

    * entity lines, ``T<id><TAB><type> <start> <end><TAB><text>``
    * event lines, ``E<id><TAB><type>:<trigger> <role>:<id> <role>:<id>``

    Every other line, and every line that does not match the shapes above,
    is skipped.
    """

    def __init__(self):
        pass

    def readAnnotations(self, filePath):
        """Return the whole content of ``filePath`` decoded as UTF-8."""
        with open(filePath, 'r', encoding='utf-8') as file:
            return file.read()

    def parseAnnotations(self, content):
        """Parse the text of one ``.ann`` file.

        Args:
            content: Full text of an annotation file.

        Returns:
            A tuple ``(entities, events)``:

            * ``entities`` maps an entity id to
              ``{'type': str, 'span': (start, end), 'entity': text}``.
            * ``events`` maps an event id to
              ``{'type': str, 'trigger': str, 'args': [[role, id], ...]}``.

            Malformed lines are skipped rather than raising, so one bad
            line cannot abort the processing of a whole corpus.
        """
        entities = {}
        events = {}

        for line in content.strip().split('\n'):
            if line.startswith('T'):
                parts = line.split('\t')
                if len(parts) != 3:
                    continue
                entityId, entityInfo, entityName = parts
                try:
                    entityType, start, end = entityInfo.split(' ')
                    span = (int(start), int(end))
                except ValueError:
                    # Raised for discontinuous spans ("0 4;6 10") and for
                    # non-numeric offsets; such lines are treated as
                    # malformed, like lines with the wrong number of tabs.
                    continue
                entities[entityId] = {'type': entityType, 'span': span, 'entity': entityName}
            elif line.startswith('E'):
                parts = line.split('\t')
                if len(parts) != 2:
                    # Same guard as for entity lines: ignore lines that do
                    # not have exactly an id column and an info column.
                    continue
                eventId, eventInfo = parts
                eventInfoParts = eventInfo.split(' ')
                # NOTE(review): only events with exactly two arguments are
                # kept (type:trigger plus two role:id pairs); events with
                # fewer or more arguments are not counted. Confirm that
                # this is the intended filter.
                if len(eventInfoParts) != 3:
                    continue
                eventTypeTrigger = eventInfoParts[0].split(':')
                if len(eventTypeTrigger) < 2:
                    # No "type:trigger" pair in the first token.
                    continue
                eventArgs = [arg.split(':') for arg in eventInfoParts[1:]]
                events[eventId] = {'type': eventTypeTrigger[0], 'trigger': eventTypeTrigger[1], 'args': eventArgs}
        return entities, events

    def countTotalEvents(self, baseDir):
        """Count the parsed events of every ``.ann`` file below ``baseDir``.

        The directory tree is walked recursively. The annotations of the
        first ``.ann`` file encountered are printed as an example.

        Args:
            baseDir: Root directory to search.

        Returns:
            The string ``"Total Trigger count: <n>"`` where ``<n>`` is the
            number of events returned by ``parseAnnotations`` summed over
            all files.
        """
        totalTriggers = 0
        printFirst = True
        for root, _dirs, files in os.walk(baseDir):
            for file in files:
                # Only annotation files are of interest.
                if not file.endswith(".ann"):
                    continue
                # Go through readAnnotations so every file is decoded as
                # UTF-8 regardless of the platform's default encoding.
                content = self.readAnnotations(os.path.join(root, file))
                entities, events = self.parseAnnotations(content)
                if printFirst:
                    self.printFirstFileAnnotations(entities, events)
                    printFirst = False
                totalTriggers += len(events)
        return f"Total Trigger count: {totalTriggers}"

    def printFirstFileAnnotations(self, entities, events):
        """Pretty-print ``entities`` and ``events`` as indented JSON."""
        print("="*100)
        print("Entities - example")
        print("="*100)
        print(json.dumps(entities, indent=2))
        print()
        print("="*100)
        print()
        print("Events - example")
        print("="*100)
        print(json.dumps(events, indent=2))
        print("="*100)
        print()

In [3]:
# Root directory that is searched recursively for .ann files.
baseDir = '../BME Corpora/MLEE-1.0.2-rev1/standoff/full/'
annotationReader = AnnotationReader()
# countTotalEvents prints the first file's annotations as an example and
# returns a summary string of the form "Total Trigger count: <n>".
triggerCount = annotationReader.countTotalEvents(baseDir)
print(triggerCount)

Entities - example
{
  "T1": {
    "type": "Gene_or_gene_product",
    "span": [
      0,
      4
    ],
    "entity": "VEGF"
  },
  "T2": {
    "type": "Gene_or_gene_product",
    "span": [
      25,
      27
    ],
    "entity": "-2"
  },
  "T3": {
    "type": "Gene_or_gene_product",
    "span": [
      6,
      20
    ],
    "entity": "angiopoietin-1"
  },
  "T7": {
    "type": "Multi-tissue_structure",
    "span": [
      98,
      111
    ],
    "entity": "microvascular"
  },
  "T11": {
    "type": "Multi-tissue_structure",
    "span": [
      149,
      162
    ],
    "entity": "microvascular"
  },
  "T12": {
    "type": "Gene_or_gene_product",
    "span": [
      302,
      336
    ],
    "entity": "Vascular endothelial growth factor"
  },
  "T13": {
    "type": "Gene_or_gene_product",
    "span": [
      338,
      342
    ],
    "entity": "VEGF"
  },
  "T19": {
    "type": "Cell",
    "span": [
      399,
      415
    ],
    "entity": "endothelial cell"
  },
  "T22": {
    "t