# Setup

In [1]:
from pathlib import Path
import numpy as np
import gensim

# Root directory of the ontology data, relative to the working directory. The cells
# below read from its "owl2vec", "split" and "cache" sub-directories and write the
# resulting embeddings back into "owl2vec".
base_path = Path("go-full")

In [2]:
# Load the saved embeddings, memory-mapped read-only (mmap="r").
embeddings_file = base_path / "owl2vec" / "ontology.embeddings"
wv = gensim.models.KeyedVectors.load(str(embeddings_file), mmap="r")

In [25]:
import logging
from rich.logging import RichHandler

# Configure root logging: INFO level, message-only format, rendered by rich's handler.
logging.basicConfig(
    level="INFO",
    format="%(message)s",
    handlers=[RichHandler()],
)

from pathlib import Path
import numpy as np
import gensim
from nltk import word_tokenize
import re

# (Re)load the saved embeddings, memory-mapped read-only; this is the same load
# that the earlier cell performs.
model_file = base_path / "owl2vec" / "ontology.embeddings"
wv = gensim.models.KeyedVectors.load(str(model_file), mmap="r")


# Helper functions from OWL2Vec_Plus.py
def pre_process_words(words):
    """Turn a sequence of raw words into lowercase alphabetic tokens.

    Each word first has any ``http://`` / ``https://`` URL (through the end of
    its line) removed. The cleaned words are joined with single spaces,
    tokenized with ``word_tokenize``, and only tokens made up entirely of
    letters are kept, lower-cased.

    Args:
        words: iterable of strings.

    Returns:
        list[str]: the surviving tokens, in order.
    """
    url_pattern = re.compile(r'https?:\/\/.*[\r\n]*', flags=re.MULTILINE)
    cleaned = (url_pattern.sub('', word) for word in words)
    joined = ' '.join(cleaned)

    kept = []
    for token in word_tokenize(joined):
        # Drop numbers, punctuation and mixed tokens; keep pure-letter tokens only.
        if token.isalpha():
            kept.append(token.lower())
    return kept


# Load classes and annotations
# classes.txt is read as one entry per line. Each annotations.txt line is split on
# whitespace into: URI, annotation type, then the annotation words.
with open(base_path / "split" / "classes.txt") as classes_file:
    classes = [line.strip() for line in classes_file]

uri_label = {}  # URI -> pre-processed tokens of its preferred label (last label wins)

label_words = 0      # raw word count over every preferred_label line
available_words = 0  # raw word count over every annotation line
preproc_words = 0    # pre-processed word count, counted once per URI (first label seen)
multilabels = []     # URIs that have more than one preferred_label line, first-seen order
_multilabel_seen = set()  # O(1) companion to `multilabels` for membership tests

with open(base_path / "cache" / "annotations.txt") as annotations_file:
    for line in annotations_file:
        tmp = line.strip().split()
        if len(tmp) < 2:
            # Blank or malformed line: no annotation type to inspect.
            continue

        available_words += len(tmp) - 2
        if tmp[1] != "preferred_label":
            continue

        uri = tmp[0]
        tokens = pre_process_words(tmp[2:])
        # The repeat check must happen BEFORE storing the label; otherwise the URI
        # is always "already present" and every labelled URI is flagged.
        if uri in uri_label:
            if uri not in _multilabel_seen:
                _multilabel_seen.add(uri)
                multilabels.append(uri)
        else:
            preproc_words += len(tokens)
        uri_label[uri] = tokens
        label_words += len(tmp) - 2

logging.info(f"Average number of label words available per class: {label_words / len(classes)}")
logging.info(f"Average number of words available per class: {available_words / len(classes)}")


n_s = []        # number of in-vocabulary label words used for each embedded instance
lost_words = 0  # running count of label words that were not found in the vocabulary
def embed(model, instances, uri_label: dict):
    """Build one averaged word-embedding vector per instance.

    For each instance, the vectors of its label words (looked up in
    ``uri_label``) that exist in the model vocabulary are averaged. Instances
    without a label, or whose label words are all out of vocabulary, get a
    zero vector.

    Side effects on module-level state:
        * ``n_s`` receives, per instance, the number of words that were averaged.
        * ``lost_words`` is incremented once per out-of-vocabulary label word.

    Args:
        model: object exposing ``vector_size`` and either a ``wv`` attribute
            holding the keyed vectors or the keyed-vector API
            (``key_to_index``, ``get_vector``) directly.
        instances: iterable of instance identifiers (keys of ``uri_label``).
        uri_label: mapping from instance identifier to a list of label words.

    Returns:
        list of numpy arrays of length ``model.vector_size``, one per instance,
        in the order of ``instances``.
    """
    # Accept either a full model (with `.wv`) or a bare keyed-vectors object.
    vectors = getattr(model, "wv", model)
    # Dict lookup is O(1); testing membership in `index_to_key` scans a list.
    vocabulary = vectors.key_to_index

    def word_embedding(inst):
        # Without this declaration `lost_words += 1` makes the name local and
        # raises UnboundLocalError on the first out-of-vocabulary word.
        global lost_words
        v = np.zeros(model.vector_size)
        if inst not in uri_label:
            return (v, 0)

        n = 0
        for word in uri_label.get(inst):
            if word in vocabulary:
                v += vectors.get_vector(word)
                n += 1
            else:
                lost_words += 1
        # Average only when at least one word contributed; otherwise keep zeros.
        return (v / n if n > 0 else v, n)

    feature_vectors = []
    for instance in instances:
        v_word, n = word_embedding(inst=instance)
        n_s.append(n)
        # Only the averaged word embedding is used as the feature vector.
        feature_vectors.append(v_word)

    return feature_vectors


# Embed every class from its label words and report the word-coverage counters
# that `embed` maintains (`lost_words`, `n_s`).
classes_e = embed(model=wv, instances=classes, uri_label=uri_label)
logging.info(f"Number of words lost: {lost_words}")
logging.info(f"Average number of words used per class: {np.mean(n_s)}")

# Key each embedding by the last path segment of its class URI.
go_ids = [uri.rsplit('/', 1)[-1] for uri in classes]
embeddings_dict = dict(zip(go_ids, classes_e))

# Persist the whole dict through np.save (stored as a pickled object array).
output_file = base_path / "owl2vec" / "ontology.embeddings.npy"
np.save(str(output_file), embeddings_dict)

sample_embedding = next(iter(embeddings_dict.values()))
print(f"Saved embeddings for {len(embeddings_dict)} classes")
print(f"Each embedding has dimension: {len(sample_embedding)} (word embeddings)")

Saved embeddings for 51550 classes
Each embedding has dimension: 100 (word embeddings)


In [26]:
# Collect every annotation row whose URI is listed in `multilabels`.
# A set gives O(1) membership, so the scan stays linear in the file size.
multilabel_uris = set(multilabels)
duplicates = []
with open(base_path / "cache" / "annotations.txt") as annotations_file:
    for line in annotations_file:
        tmp = line.strip().split()
        # `tmp` is empty for blank lines; skip them instead of indexing into it.
        if tmp and tmp[0] in multilabel_uris:
            duplicates.append(tmp)
# Note: this counts matching annotation rows, not distinct URIs.
logging.info(f"Number of duplicate classes: {len(duplicates)}")


In [27]:
# Show the collected annotation rows as the cell output.
# NOTE(review): this renders the whole list; consider slicing (e.g. duplicates[:10])
# if it is long.
duplicates

[['http://purl.obolibrary.org/obo/GO_0005623',
  'preferred_label',
  'obsolete',
  'cell'],
 ['http://purl.obolibrary.org/obo/GO_0005623',
  'preferred_label',
  'cell',
  'and',
  'encapsulating',
  'structures'],
 ['http://purl.obolibrary.org/obo/IAO_0000115',
  'preferred_label',
  'definition'],
 ['http://purl.obolibrary.org/obo/IAO_0000233',
  'preferred_label',
  'term',
  'tracker',
  'item'],
 ['http://purl.obolibrary.org/obo/IAO_0000700',
  'preferred_label',
  'has',
  'ontology',
  'root',
  'term'],
 ['http://purl.obolibrary.org/obo/IAO_0100001',
  'preferred_label',
  'term',
  'replaced',
  'by'],
 ['http://purl.obolibrary.org/obo/go#syngo_official_label',
  'preferred_label',
  'label',
  'approved',
  'by',
  'the',
  'SynGO',
  'project'],
 ['http://purl.obolibrary.org/obo/go#systematic_synonym',
  'preferred_label',
  'Systematic',
  'synonym'],
 ['http://www.geneontology.org/formats/oboInOwl#SubsetProperty',
  'preferred_label',
  'subset_property'],
 ['http://www.g

In [24]:
# Value of the `preproc_words` counter divided by the number of classes.
# NOTE(review): this cell's execution count (24) is lower than that of the cell that
# defines these names (25), so the stored output may predate the current code — re-run
# in order to confirm.
preproc_words / len(classes)

4.701668283220174