In [14]:
from pathlib import Path
import numpy as np
import gensim

# Root directory for every file this notebook reads or writes; all paths
# below are built relative to it.
base_path = Path("go-basic")

In [15]:
# Load the saved embeddings from <base_path>/owl2vec/ontology.embeddings.
# mmap="r" asks gensim to memory-map the stored arrays read-only.
wv = gensim.models.KeyedVectors.load(str(base_path / "owl2vec" / "ontology.embeddings"), mmap="r")

In [16]:
import logging
from rich.logging import RichHandler

# Configure the root logger: INFO and above, handled by rich's RichHandler,
# with the format string reduced to the bare message text.
logging.basicConfig(level="INFO", format="%(message)s", handlers=[RichHandler()])

from pathlib import Path
import numpy as np
import gensim
from nltk import word_tokenize
import re

# Load the model. This repeats the load done in the previous cell (same path,
# same read-only mmap mode) and rebinds `wv` to the freshly loaded object.
wv = gensim.models.KeyedVectors.load(str(base_path / "owl2vec" / "ontology.embeddings"), mmap="r")

# Helper functions from OWL2Vec_Plus.py
def pre_process_words(words):
    """Turn a sequence of raw words into lowercase alphabetic tokens.

    Anything that looks like an http(s) URL is removed from each word, the
    remaining pieces are joined with single spaces and tokenized with
    ``word_tokenize``; only tokens made up entirely of letters are kept,
    lowercased.

    Args:
        words: Iterable of strings.

    Returns:
        list[str]: the surviving tokens, in tokenizer order.
    """
    cleaned_words = []
    for raw_word in words:
        cleaned_words.append(
            re.sub(r'https?:\/\/.*[\r\n]*', '', raw_word, flags=re.MULTILINE)
        )

    alphabetic_tokens = []
    for tok in word_tokenize(' '.join(cleaned_words)):
        # isalpha() drops numbers, punctuation and mixed tokens.
        if tok.isalpha():
            alphabetic_tokens.append(tok.lower())
    return alphabetic_tokens

# Number of in-vocabulary label words found for each embedded instance.
# embed() appends one entry per instance, in order.
n_s = []


def embed(model, instances, uri_label=None):
    """Return one averaged label-word vector per instance.

    For every instance, the words listed for it in ``uri_label`` are looked up
    in the model's vocabulary and the vectors of the words that are found are
    averaged (a word occurring twice in the list counts twice). An instance
    with no entry in ``uri_label``, or none of whose words are in the
    vocabulary, gets an all-zero vector.

    Args:
        model: Object providing ``vector_size``. The keyed vectors are taken
            from ``model.wv`` when that attribute exists, otherwise ``model``
            itself is used as the keyed vectors.
        instances: Iterable of instance keys (looked up in ``uri_label``).
        uri_label: Optional mapping from instance key to a list of words.

    Returns:
        list of 1-D numpy arrays of length ``model.vector_size``, one per
        instance, in the order of ``instances``.

    Side effects:
        Appends the number of in-vocabulary words of each instance to the
        module-level list ``n_s``.
    """
    # Work with either a wrapper exposing `.wv` or the keyed vectors directly.
    keyed_vectors = getattr(model, "wv", model)

    # `index_to_key` is a list, so `word in index_to_key` is a linear scan per
    # word. Use the key->index dict for constant-time membership; fall back to
    # a set built once if the object only exposes the list.
    vocabulary = getattr(keyed_vectors, "key_to_index", None)
    if vocabulary is None:
        vocabulary = set(keyed_vectors.index_to_key)

    def mean_word_vector(inst):
        """Return (mean vector, number of in-vocabulary words) for ``inst``."""
        total = np.zeros(model.vector_size)
        if not (uri_label and inst in uri_label):
            return (total, 0)

        hits = 0
        for word in uri_label.get(inst):
            if word in vocabulary:
                total += keyed_vectors.get_vector(word)
                hits += 1
        # With no hits `total` is still all zeros; avoid dividing by zero.
        return (total / hits if hits > 0 else total, hits)

    feature_vectors = []
    for instance in instances:
        # Only the label-word embedding is used; the URI embedding is not
        # included in the feature vector.
        word_vector, hits = mean_word_vector(instance)
        n_s.append(hits)
        feature_vectors.append(word_vector)

    return feature_vectors

# Load the class URIs, one per line. `with` closes the file once it is read.
with open(base_path / "split" / "classes.txt") as classes_file:
    classes = [line.strip() for line in classes_file]

# Collect the preprocessed preferred-label words of each annotated URI.
# Lines are split on whitespace: field 0 is the URI, field 1 the annotation
# kind, and the remaining fields are the annotation text.
uri_label = {}
with open(base_path / "cache" / "annotations.txt") as annotations_file:
    for line in annotations_file:
        fields = line.strip().split()
        # Blank or truncated lines have no annotation kind to test; skip them
        # instead of failing with an IndexError.
        if len(fields) < 2 or fields[1] != "preferred_label":
            continue
        uri_label.setdefault(fields[0], []).extend(pre_process_words(fields[2:]))

# Calculate embeddings
classes_e = embed(model=wv, instances=classes, uri_label=uri_label)

logging.info(f"Average number of words per class: {np.mean(n_s)}")

# Use the last path segment of each class URI as its ID and map it to the
# class's embedding.
go_ids = [cls.split('/')[-1] for cls in classes]
embeddings_dict = dict(zip(go_ids, classes_e))

# Save embeddings. np.save stores a dict as a pickled 0-d object array, so it
# has to be read back with np.load(path, allow_pickle=True).item().
np.save(str(base_path / "owl2vec" / "ontology.embeddings.npy"), embeddings_dict)

print(f"Saved embeddings for {len(embeddings_dict)} classes")
# Only report a dimension when there is at least one embedding to measure.
if embeddings_dict:
    # The stored vectors are the averaged label-word embeddings only; no URI
    # embedding is concatenated, so the message says so.
    print(f"Each embedding has dimension: {len(next(iter(embeddings_dict.values())))} (word embeddings only)")

Saved embeddings for 51550 classes
Each embedding has dimension: 100 (URI + word embeddings)


In [10]:
# Display the URI -> label-token mapping. The recorded output below shows that
# it also contains non-class URIs (e.g. IAO and oboInOwl properties), some
# with an empty token list.
# NOTE(review): this cell's execution count (10) is lower than the cells
# above it, so the recorded output may come from an earlier run - confirm by
# re-running in order.
uri_label

{'http://purl.obolibrary.org/obo/GO_0005623': ['obsolete',
  'cell',
  'cell',
  'and',
  'encapsulating',
  'structures'],
 'http://purl.obolibrary.org/obo/IAO_0000115': ['definition'],
 'http://purl.obolibrary.org/obo/IAO_0000233': ['term', 'tracker', 'item'],
 'http://purl.obolibrary.org/obo/IAO_0000700': ['has',
  'ontology',
  'root',
  'term'],
 'http://purl.obolibrary.org/obo/IAO_0100001': ['term', 'replaced', 'by'],
 'http://purl.obolibrary.org/obo/go#syngo_official_label': ['label',
  'approved',
  'by',
  'the',
  'syngo',
  'project'],
 'http://purl.obolibrary.org/obo/go#systematic_synonym': ['systematic',
  'synonym'],
 'http://www.geneontology.org/formats/oboInOwl#SubsetProperty': [],
 'http://www.geneontology.org/formats/oboInOwl#SynonymTypeProperty': [],
 'http://www.geneontology.org/formats/oboInOwl#consider': ['consider'],
 'http://www.geneontology.org/formats/oboInOwl#hasAlternativeId': [],
 'http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym': [],
 'http://

In [17]:
# Display the configured root path.
base_path

PosixPath('go-basic')