**Entity type extraction and embedding: STANDOFF format**

- The standoff dataset is formatted differently, and the dataset is larger.
- This module reads the directory, extracts the entity:entity_type mappings, and creates a randomized embedding for each entity type.
- It will also save the list of all unique entities so that it can be reused in the KB concept embedding, which uses an external KB to augment the embedding.

In [1]:
import os
import numpy as np
import itertools

In [2]:
class EntityTypeEmbedderSTANDOFF:
    """Token -> entity-type lookup with one random embedding per entity type.

    On construction the input directory is walked recursively and every
    ``.ann`` file is read. A line is used when it has exactly three
    tab-separated fields and its first field starts with ``"T"``; the first
    whitespace-separated word of the second field is taken as the entity
    type and the stripped third field as the token text.

    Attributes:
        embeddingDim: Length of every embedding vector.
        entityEmbeddings: dict mapping entity type -> 1-D ``numpy`` array.
        tokenEntityMappings: dict mapping token text -> entity type. When a
            token occurs with several types, the last occurrence read wins.
        entityTypes: set of all entity types encountered.
        inputDir: Root directory that was scanned.
    """

    def __init__(self, inputDir, embeddingDim=50, seed=None):
        """Scan ``inputDir`` and build the mappings and embeddings.

        Args:
            inputDir: Directory searched recursively for ``.ann`` files.
            embeddingDim: Length of each random embedding vector.
            seed: Optional seed. When given, embeddings are drawn from a
                private ``numpy`` generator so that repeated runs over the
                same files produce identical vectors. When ``None``
                (default), vectors are drawn from NumPy's global random
                state via ``np.random.rand``.
        """
        self.embeddingDim = embeddingDim
        self.entityEmbeddings = {}
        self.tokenEntityMappings = {}
        self.entityTypes = set()
        self.inputDir = inputDir
        self._rng = None if seed is None else np.random.default_rng(seed)
        self.extractTokenEntityMapping()

    @staticmethod
    def _parse_annotation_line(line):
        """Return ``(token, entityType)`` for a usable line, else ``None``."""
        if not line.strip():
            return None
        parts = line.split("\t")
        # startswith() tolerates an empty first field (line beginning with a
        # tab), which would make parts[0][0] raise IndexError.
        if len(parts) != 3 or not parts[0].startswith("T"):
            return None
        typeFields = parts[1].split()
        if not typeFields:
            # Empty type field: nothing to map, skip instead of crashing.
            return None
        return parts[2].strip(), typeFields[0]

    def _new_embedding(self):
        """Draw one vector of length ``embeddingDim`` with values in [0, 1)."""
        rng = getattr(self, "_rng", None)
        if rng is None:
            return np.random.rand(self.embeddingDim)
        return rng.random(self.embeddingDim)

    def extractTokenEntityMapping(self):
        """Read every ``.ann`` file under ``inputDir`` and fill the mappings.

        Populates ``tokenEntityMappings``, ``entityTypes`` and
        ``entityEmbeddings`` in place and prints a short summary.
        """
        numFiles = 0
        for dirpath, dirnames, filenames in os.walk(self.inputDir):
            # Sorting in place fixes the traversal order, so last-wins token
            # mappings and the order in which embeddings are drawn do not
            # depend on the file system.
            dirnames.sort()
            for filename in sorted(filenames):
                if not filename.endswith(".ann"):
                    continue
                numFiles += 1
                filePath = os.path.join(dirpath, filename)
                with open(filePath, 'r', encoding='utf-8') as file:
                    for line in file:
                        parsed = self._parse_annotation_line(line)
                        if parsed is None:
                            continue
                        token, entityType = parsed
                        self.tokenEntityMappings[token] = entityType
                        if entityType not in self.entityEmbeddings:
                            self.entityTypes.add(entityType)
                            self.entityEmbeddings[entityType] = self._new_embedding()
        print(f"Processed {numFiles} .ann files.")
        uniqueTokens = len(self.tokenEntityMappings)
        uniqueEntities = len(self.entityTypes)
        print(f"Found {uniqueTokens} unique tokens which correspond to entities")
        print(f"Found {uniqueEntities} unique entities")

    def get_embedding(self, token):
        """Return the embedding of ``token``'s entity type.

        Tokens without a known entity type get a zero vector of length
        ``embeddingDim``.
        """
        entityType = self.get_entity_type(token)
        return self.entityEmbeddings.get(entityType, np.zeros(self.embeddingDim))

    def get_entity_type(self, token):
        """Return the entity type recorded for ``token``.

        Unknown tokens yield the *string* ``'None'``, not the ``None`` object.
        """
        return self.tokenEntityMappings.get(token, 'None')

    def update_embedding(self, entityType, newEmbedding):
        """Replace the stored embedding of an existing entity type.

        Raises:
            ValueError: If ``entityType`` has no embedding yet.
        """
        if entityType in self.entityEmbeddings:
            self.entityEmbeddings[entityType] = newEmbedding
        else:
            raise ValueError(f"Entity type {entityType} not in embeddings dictionary")
    
    

In [3]:
# Seed NumPy's global random state before the embedder draws its random
# entity-type vectors, so a fresh kernel run over the same files (read in the
# same order) produces the same embeddings.
np.random.seed(42)

# Root of the standoff annotation files; the embedder scans it on construction.
input_dir_path = '../BME Corpora/MLEE-1.0.2-rev1/standoff/full/'
entity_type_embedder = EntityTypeEmbedderSTANDOFF(input_dir_path)

Processed 262 .ann files.
Found 3536 unique tokens which correspond to entities
Found 45 unique entities


In [4]:
# Show every entity type that was discovered, then a five-item sample of the
# token -> entity-type mapping.
print(
    entity_type_embedder.entityTypes,
    dict(itertools.islice(entity_type_embedder.tokenEntityMappings.items(), 5)),
    sep="\n",
)

{'DNA_methylation', 'Gene_expression', 'Planned_process', 'Cellular_component', 'Multi-tissue_structure', 'Drug_or_compound', 'Synthesis', 'Reproduction', 'Localization', 'Death', 'Pathway', 'Cell', 'Organism_subdivision', 'Acetylation', 'Breakdown', 'Positive_regulation', 'Developing_anatomical_structure', 'Translation', 'Organism', 'Blood_vessel_development', 'Phosphorylation', 'Immaterial_anatomical_entity', 'DNA_domain_or_region', 'Regulation', 'Organism_substance', 'Gene_or_gene_product', 'Cell_proliferation', 'Negative_regulation', 'Growth', 'Remodeling', 'Anatomical_system', 'Dissociation', 'Ubiquitination', 'Protein_domain_or_region', 'Transcription', 'Catabolism', 'Tissue', 'Cell_division', 'Binding', 'Organ', 'Protein_processing', 'Dephosphorylation', 'Metabolism', 'Development', 'Pathological_formation'}
{'VEGF': 'Gene_or_gene_product', '-2': 'Gene_or_gene_product', 'angiopoietin-1': 'Gene_or_gene_product', 'microvascular': 'Tissue', 'Vascular endothelial growth factor': 'Ge

In [5]:
# Entity-type lookup for three sample tokens, one result per line.
print(
    *map(entity_type_embedder.get_entity_type, ("girl", "scar", "BUFFALO")),
    sep="\n",
)

Organism
Pathological_formation
None


In [6]:
# Embedding lookup for the same three sample tokens, one vector after another.
print(
    *map(entity_type_embedder.get_embedding, ("girl", "scar", "BUFFALO")),
    sep="\n",
)

[0.90381941 0.05520675 0.38885863 0.00775691 0.03873333 0.64282388
 0.97901437 0.01757973 0.80937793 0.46586224 0.79950041 0.4997252
 0.83540344 0.84451615 0.8190294  0.84905075 0.15398873 0.67135077
 0.74806231 0.04269398 0.38782199 0.61625893 0.00886134 0.49772937
 0.57221149 0.76673392 0.56187645 0.92328973 0.24910555 0.81654194
 0.52101219 0.68948749 0.56536296 0.92562362 0.22437851 0.50551397
 0.72594729 0.20374065 0.16094928 0.53240601 0.66624181 0.96269053
 0.87685808 0.28532431 0.71548562 0.4110614  0.27166422 0.60736476
 0.91969194 0.82832529]
[0.20785141 0.94065014 0.26893111 0.38905696 0.18102632 0.73163603
 0.43044407 0.09821244 0.34613842 0.49660312 0.53020372 0.69133059
 0.68562232 0.94752974 0.31162893 0.71325262 0.79995115 0.89910447
 0.21569636 0.97498468 0.48233304 0.79669675 0.55074161 0.61464306
 0.5333954  0.49599483 0.20194718 0.72077952 0.71257108 0.70508043
 0.16268669 0.57987631 0.54336759 0.84603937 0.30396602 0.26009342
 0.51493521 0.67146664 0.48344693 0.501