**Entity type embedding for CONLL format data**

- This class generates an entity-type embedding for a given token.
- The class must be instantiated before use: its constructor scans the dataset and loads all entity types, which is required before an embedding can be generated for a given token.
- Note that this class only handles MLEE/conll files

In [None]:
import os
import numpy as np

In [None]:
class EntityTypeEmbedderCONLL:
    """Look up entity-type labels and per-type embeddings for tokens.

    On construction every ``.conll`` file below ``inputDir`` is scanned.
    Non-blank lines that split into exactly four tab-separated columns are
    used: the first column is the token and the last column (stripped) is its
    entity-type label. Each label other than ``'O'`` is assigned one random
    vector of length ``embeddingDim``.

    Lookups are keyed on the token text alone. When a token occurs with
    several labels, the last non-``'O'`` label seen wins; an ``'O'``
    occurrence never replaces an entity label that was already recorded.
    Files are visited in sorted order so this choice is reproducible.

    Attributes:
        embeddingDim: Length of every embedding vector.
        entityEmbeddings: Maps entity-type label -> embedding array.
        tokenEntityMappings: Maps token -> entity-type label (``'O'`` for
            tokens that were only ever seen with the ``'O'`` label).
        entityTypes: Set of all non-``'O'`` labels found.
        inputDir: Root directory that was scanned.
    """

    def __init__(self, inputDir, embeddingDim=50, seed=None):
        """Scan ``inputDir`` and build the token/label/embedding tables.

        Args:
            inputDir: Directory searched recursively for ``.conll`` files.
            embeddingDim: Length of each entity-type embedding.
            seed: Optional seed for the random embeddings. ``None`` (the
                default) draws from NumPy's global random state.

        Raises:
            FileNotFoundError: If ``inputDir`` is not an existing directory.
        """
        self.embeddingDim = embeddingDim
        self.entityEmbeddings = {}
        self.tokenEntityMappings = {}
        self.entityTypes = set()
        self.inputDir = inputDir
        # The np.random module and a RandomState instance both expose rand().
        self._rng = np.random if seed is None else np.random.RandomState(seed)
        self.extractTokenEntityMapping()

    def _iterConllPaths(self):
        """Yield the path of every ``.conll`` file under ``inputDir``, sorted."""
        for dirpath, dirnames, filenames in os.walk(self.inputDir):
            # Sorting in place steers os.walk's descent, so the visiting order
            # does not depend on the filesystem's directory listing order.
            dirnames.sort()
            for filename in sorted(filenames):
                if filename.endswith(".conll"):
                    yield os.path.join(dirpath, filename)

    def _recordLabel(self, token, entityType):
        """Store one (token, label) observation and create embeddings as needed."""
        if entityType == 'O':
            # Remember the token, but never clobber an entity label with 'O'.
            self.tokenEntityMappings.setdefault(token, 'O')
            return
        self.tokenEntityMappings[token] = entityType
        if entityType not in self.entityEmbeddings:
            self.entityTypes.add(entityType)
            self.entityEmbeddings[entityType] = self._rng.rand(self.embeddingDim)

    def extractTokenEntityMapping(self):
        """Read all ``.conll`` files and populate the lookup tables.

        Prints a short summary of how many files, entity tokens and entity
        types were found.

        Raises:
            FileNotFoundError: If ``inputDir`` is not an existing directory.
        """
        if not os.path.isdir(self.inputDir):
            # os.walk would silently yield nothing, leaving an embedder that
            # returns zeros for every token.
            raise FileNotFoundError(
                f"Input directory not found: {self.inputDir!r}"
            )

        numFiles = 0
        for filePath in self._iterConllPaths():
            numFiles += 1
            with open(filePath, 'r', encoding='utf-8') as file:
                for line in file:
                    if not line.strip():  # blank line
                        continue
                    parts = line.split("\t")
                    if len(parts) != 4:
                        continue
                    self._recordLabel(parts[0], parts[-1].strip())

        print(f"Processed {numFiles} .conll files.")
        # Count only tokens that resolved to a real entity label, as the
        # message below states.
        uniqueTokens = sum(
            1 for label in self.tokenEntityMappings.values() if label != 'O'
        )
        uniqueEntities = len(self.entityTypes)
        print(f"Found {uniqueTokens} unique tokens which correspond to entities != O")
        print(f"Found {uniqueEntities} unique entities")

    def get_embedding(self, token):
        """Return the embedding of ``token``'s entity type.

        Tokens that are unknown or labelled ``'O'`` get a fresh zero vector
        of length ``embeddingDim``.
        """
        entityType = self.tokenEntityMappings.get(token, 'O')
        return self.entityEmbeddings.get(entityType, np.zeros(self.embeddingDim))

    def get_entity_type(self, token):
        """Return the entity-type label recorded for ``token`` (``'O'`` if none)."""
        return self.tokenEntityMappings.get(token, 'O')

    def update_embedding(self, entityType, newEmbedding):
        """Replace the stored embedding of an existing entity type.

        Raises:
            ValueError: If ``entityType`` has no embedding yet.
        """
        if entityType in self.entityEmbeddings:
            self.entityEmbeddings[entityType] = newEmbedding
        else:
            raise ValueError(f"Entity type {entityType} not in embeddings dictionary")

In [None]:
# Build the embedder once; construction scans every .conll file under this
# directory, so the path must point at the CoNLL export of the corpus.
input_dir_path = '../BME Corpora/MLEE-1.0.2-rev1/conll/full/'
entity_type_embedder = EntityTypeEmbedderCONLL(inputDir=input_dir_path)

In [None]:
# print(entity_type_embedder.tokenEntityMappings)

# Two tokens the original notes mark as B-organism; if both resolve to the
# same label, the two printed vectors are identical.
example_embedding_girl, example_embedding_rat = map(
    entity_type_embedder.get_embedding, ('girl', 'rat')
)
print(example_embedding_girl, example_embedding_rat, sep="\n")
print("*********")

# Two tokens the original notes mark as I-Cell.
example_embedding_vein, example_embedding_umbilical = map(
    entity_type_embedder.get_embedding, ('vein', 'umbilical')
)
print(example_embedding_vein, example_embedding_umbilical, sep="\n")
print("*********")

# Three further tokens with no expected label noted.
example_embedding_f, example_embedding_r, example_embedding_e = map(
    entity_type_embedder.get_embedding, ('following', 'rubella', 'eye')
)
print(example_embedding_f, example_embedding_r, example_embedding_e, sep="\n")
