**Entity type extraction and embedding: STANDOFF format**

- The standoff dataset is formatted differently, and the dataset is larger.
- This module reads the directory, extracts the entity:entity_type mappings, and creates a randomized embedding for each entity type.
- It will also save the list of all unique entities so it can be reused in the KB concept embedding, which uses an external KB to augment the embedding.

In [None]:
import os
import numpy as np
import itertools

In [None]:
class EntityTypeEmbedderSTANDOFF:
    """Map tokens to entity types read from ``.ann`` files and embed the types.

    On construction the directory tree under ``inputDir`` is walked, every
    ``.ann`` file is parsed, and each distinct entity type is given a random
    vector of length ``embeddingDim`` drawn with ``np.random.rand``.

    Attributes:
        embeddingDim: Length of every entity-type vector.
        entityEmbeddings: Dict mapping entity type -> numpy vector.
        tokenEntityMappings: Dict mapping annotated token text -> entity type.
            When the same token text appears more than once, the last
            occurrence in traversal order wins.
        entityTypes: Set of all entity types encountered.
        inputDir: Root directory that was scanned.
    """

    def __init__(self, inputDir, embeddingDim=50):
        """Scan ``inputDir`` for ``.ann`` files and build the mappings.

        Args:
            inputDir: Root directory searched recursively for ``.ann`` files.
            embeddingDim: Length of the random vector created per entity type.
        """
        self.embeddingDim = embeddingDim
        self.entityEmbeddings = {}
        self.tokenEntityMappings = {}
        self.entityTypes = set()
        self.inputDir = inputDir
        self.extractTokenEntityMapping()

    @staticmethod
    def _parseAnnotationLine(line):
        """Return ``(token, entityType)`` for a usable line, else ``None``.

        A usable line has exactly three tab-separated fields, a first field
        starting with ``"T"``, and a non-blank middle field whose first
        whitespace-separated word is taken as the entity type.
        """
        parts = line.split("\t")
        # startswith() is safe on an empty first field, unlike parts[0][0].
        if len(parts) != 3 or not parts[0].startswith("T"):
            return None
        typeFields = parts[1].split()
        if not typeFields:
            # Blank middle field: there is no entity type to record.
            return None
        return parts[2].strip(), typeFields[0]

    def extractTokenEntityMapping(self):
        """Populate the token/type mappings and embeddings from ``inputDir``.

        Directories and files are visited in sorted order so that the
        last-occurrence-wins token mapping and the order of random draws do
        not depend on the order in which the file system lists entries.
        A missing ``inputDir`` is not an error: ``os.walk`` yields nothing and
        the mappings stay empty. Prints a short summary when finished.
        """
        numFiles = 0
        for dirpath, dirnames, filenames in os.walk(self.inputDir):
            # In-place sort steers os.walk's (top-down) descent order.
            dirnames.sort()
            for filename in sorted(filenames):
                if not filename.endswith(".ann"):
                    continue
                numFiles += 1
                filePath = os.path.join(dirpath, filename)
                with open(filePath, 'r', encoding='utf-8') as file:
                    for line in file:
                        parsed = self._parseAnnotationLine(line)
                        if parsed is None:
                            continue
                        token, entityType = parsed
                        self.tokenEntityMappings[token] = entityType
                        if entityType not in self.entityEmbeddings:
                            self.entityTypes.add(entityType)
                            self.entityEmbeddings[entityType] = np.random.rand(
                                self.embeddingDim
                            )
        print(f"Processed {numFiles} .ann files.")
        uniqueTokens = len(self.tokenEntityMappings)
        uniqueEntities = len(self.entityTypes)
        print(f"Found {uniqueTokens} unique tokens which correspond to entities")
        print(f"Found {uniqueEntities} unique entities")

    def get_embedding(self, token):
        """Return the vector for ``token``'s entity type.

        The stored array itself is returned (not a copy). Tokens whose type
        has no embedding get a fresh all-zero vector of length
        ``embeddingDim``.
        """
        entityType = self.get_entity_type(token)
        return self.entityEmbeddings.get(entityType, np.zeros(self.embeddingDim))

    def get_entity_type(self, token):
        """Return the entity type for ``token``, or the string ``'None'``."""
        return self.tokenEntityMappings.get(token, 'None')

    def update_embedding(self, entityType, newEmbedding):
        """Replace the stored vector for an existing entity type.

        ``newEmbedding`` is stored as given; its shape is not checked.

        Raises:
            ValueError: If ``entityType`` has no existing embedding.
        """
        if entityType in self.entityEmbeddings:
            self.entityEmbeddings[entityType] = newEmbedding
        else:
            raise ValueError(f"Entity type {entityType} not in embeddings dictionary")
    
    

In [None]:
# Build the embedder from the MLEE standoff corpus; construction scans the
# directory tree and prints a summary of what was found.
input_dir_path = '../BME Corpora/MLEE-1.0.2-rev1/standoff/full/'
entity_type_embedder = EntityTypeEmbedderSTANDOFF(inputDir=input_dir_path)

In [None]:
# Show every entity type found, then a five-item sample of the token -> type map.
print(entity_type_embedder.entityTypes)
print({
    token: entity_type
    for token, entity_type in itertools.islice(
        entity_type_embedder.tokenEntityMappings.items(), 5
    )
})

In [None]:
# Spot-check the entity type assigned to a few sample tokens.
for probe_token in ("girl", "scar", "BUFFALO"):
    print(entity_type_embedder.get_entity_type(probe_token))

In [None]:
# Spot-check the embedding vector returned for the same sample tokens.
for probe_token in ("girl", "scar", "BUFFALO"):
    print(entity_type_embedder.get_embedding(probe_token))