# Setup


In [2]:
import numpy as np
import pandas as pd
import gensim
from pathlib import Path
%load_ext autoreload
%autoreload 2

# Root data directory: the split/*.csv files and each model's embeddings
# (<model>/ontology.embeddings.npy) are read from underneath it.
base_path = Path("go-basic")

In [3]:
# Gather every line (whitespace-stripped) from all CSV files under split/.
all_lines = []
for csv_path in base_path.glob("split/*.csv"):
  with open(csv_path) as handle:
    all_lines.extend(row.strip() for row in handle)

# A term is "known" if it appears in either of the first two comma-separated
# fields of any line.
first_fields = {row.split(',')[0] for row in all_lines}
second_fields = {row.split(',')[1] for row in all_lines}
all_gos = first_fields | second_fields

In [4]:

from embedding_evaluation import EmbeddingDataset

dataset = EmbeddingDataset(base_path)

# (loader, sub-directory) pairs; each model's embeddings are read from
# <base_path>/<sub-directory>/ontology.embeddings.npy, in the order listed.
embedding_loaders = [
  (dataset.load_owl2vec, "owl2vec"),
  (dataset.load_anc2vec, "anc2vec"),
  (dataset.load_biobert, "biobert"),
  (dataset.load_gt2vec, "gt2vec"),
]
for load_embeddings, model_dir in embedding_loaders:
  load_embeddings(base_path / model_dir / "ontology.embeddings.npy")



# Validate embeddings

In [10]:
# One (initially empty) set of missing terms per embedding model, keyed by
# the model's name in dataset.embeddings.
missing = {
  model_name: set()
  for model_name in ('owl2vec', 'anc2vec', 'biobert', 'gt2vec')
}

def is_missing(id, emb):
  """Tell whether `id` lacks a usable vector in `emb`.

  Args:
    id: Key to look up in `emb`.
    emb: Container supporting `in` and `[]` lookups by key.

  Returns:
    True if `id` is not a key of `emb`; otherwise the result of
    `np.all(emb[id] == 0)`, i.e. truthy when every component is zero.
  """
  if id not in emb:
    return True
  # Present but all-zero entries are counted as missing too.
  return np.all(emb[id] == 0)

# Model key in dataset.embeddings / missing -> label used in the report,
# listed in report order.
model_labels = {
  'owl2vec': 'OWL2Vec',
  'anc2vec': 'Anc2Vec',
  'biobert': 'BioBERT',
  'gt2vec': 'GT2Vec',
}

for go in all_gos:
  # Embeddings are looked up by the last '/'-separated segment of the term.
  # Named go_id (not `id`) so the builtin id() is not rebound for the rest
  # of the notebook.
  go_id = go.split('/')[-1]

  for model_name in model_labels:
    if is_missing(go_id, dataset.embeddings[model_name]):
      missing[model_name].add(go)

total_gos = len(all_gos)
print(f"Total GOs: {total_gos} in {base_path}")
for model_name, label in model_labels.items():
  n_missing = len(missing[model_name])
  # Guard the ratio: with no terms loaded (e.g. no CSV files matched) the
  # division would otherwise raise ZeroDivisionError.
  share = n_missing / total_gos if total_gos else 0.0
  print(f"Missing in {label}: {n_missing} / {total_gos} ({share:.2%})")


Total GOs: 47050 in go-basic
Missing in OWL2Vec: 2112 / 47050 (4.49%)
Missing in Anc2Vec: 6111 / 47050 (12.99%)
Missing in BioBERT: 2074 / 47050 (4.41%)
Missing in GT2Vec: 4495 / 47050 (9.55%)
