In [7]:
import wandb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List, Dict

# Initialize wandb
api = wandb.Api()

# Get runs from your project
runs = api.runs("go-embedding-evaluation")  # Replace with your actual project name
runs = [(run.config, run.history()) for run in runs]


In [27]:

# Collect data
data = []
for config, history in runs:
    # Get the last logged metrics
    if not history.empty:
        # Extract metrics for each embedding type and model
        for _, metrics in history.iterrows():
            metrics_dict = metrics.to_dict()
            for key, value in metrics_dict.items():
                if isinstance(value, (int, float)):  # Filter out non-numeric values
                    # Parse the metric key (format: "embedding_type/model/metric")
                    parts = key.split('/')
                    # if len(parts) == 3:
                    #     embedding_type, model, metric = parts
                    #     if model == 'torch_mlp' and metric in ["MRR", "Hits@1", "Hits@3", "Hits@10"]:
                    #         data.append({
                    #             'base_ontology': config.get('base_ontology', 'unknown'),
                    #             'embedding_type': embedding_type,
                    #             'model': model,
                    #             'metric': metric,
                    #             'value': value
                    #         })
                    if len(parts) == 4:
                        base_ontology, embedding_type, model, metric = parts
                        if model == 'torch_mlp' and metric in ["MRR", "Hits@1", "Hits@3", "Hits@10"]:
                            data.append({
                                'base_ontology': config.get('base_ontology', 'unknown'),
                                'embedding_type': embedding_type,
                                'model': model,
                                'metric': metric,
                                'value': value
                            })
# Convert to DataFrame
df = pd.DataFrame(data)

# Create summary table
summary_table = df.pivot_table(
    index=['base_ontology', 'embedding_type', 'model'],
    columns='metric',
    values='value',
    aggfunc='last'
).round(3)

# Display the table
print("\nDetailed Results:")
print(summary_table)

KeyError: 'value'

In [5]:
import numpy as np
from numba import njit

randvec = np.random.randn(768)
%timeit np.repeat(np.expand_dims(randvec, 0), 10000, axis=0)
%timeit np.array([randvec] * 10000)


1.73 ms ± 98.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
6.07 ms ± 473 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [66]:
import numba
import numpy as np
from numba import typed

# Sample data setup
all_classes = np.random.randint(0, 10000, 10000)

@numba.njit
def filter_classes(all_classes, ancestors):
    """
    Create a boolean mask for filtering classes

    Args:
        all_classes: Array of all class IDs
        ancestors: Array of ancestor class IDs to filter out

    Returns:
        Boolean mask where True indicates class should be kept
    """
    mask = np.ones(len(all_classes), dtype=numba.bool_)

    # Numba-friendly loop for checking membership
    for i in range(len(all_classes)):
        for j in range(len(ancestors)):
            if all_classes[i] == ancestors[j]:
                mask[i] = False
                break
    return mask

def numpy_filter(all_classes, ancestors):
    """
    NumPy version for comparison
    """
    return ~np.isin(all_classes, ancestors)

# Setup test data
sub_id = 100
ancestors = np.random.randint(0, 10000, 100)

# Benchmark
%timeit filter_classes(all_classes, ancestors)
%timeit numpy_filter(all_classes, ancestors)

328 μs ± 2.2 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
43.9 μs ± 2.49 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [49]:

from re import A


valid_classes = np.random.randint(0, 10000, 1000)
gt_id = valid_classes[0]
sorted_indices = np.argsort(valid_classes)[::-1]

@numba.njit()
def calculate_metrics(gt_id, valid_classes, sorted_indices):
    # Find rank of ground truth
    gt_rank = np.where(valid_classes[sorted_indices] == gt_id)[0][0] + 1

    # Calculate hits metrics
    hits1 = gt_id == valid_classes[sorted_indices[0]]
    hits5 = gt_id in valid_classes[sorted_indices[:5]]
    hits10 = gt_id in valid_classes[sorted_indices[:10]]

    return 1.0/gt_rank, hits1, hits5, hits10

def numpy_calculate_metrics(gt_id, valid_classes, sorted_indices):
    # Find rank of ground truth
    gt_rank = np.where(valid_classes[sorted_indices] == gt_id)[0][0] + 1
    return 1.0/gt_rank, gt_id == valid_classes[sorted_indices[0]], gt_id in valid_classes[sorted_indices[:5]], gt_id in valid_classes[sorted_indices[:10]]

@numba.njit()
def calculate_metrics_2(gt_id, valid_classes, sorted_indices):
    # Find position of ground truth in valid_classes
    gt_pos = -1
    for i, idx in enumerate(sorted_indices):
        if valid_classes[idx] == gt_id:
            gt_pos = i
            break

    if gt_pos == -1:
        return 0.0, False, False, False

    # Calculate metrics
    gt_rank = gt_pos + 1
    hits1 = gt_pos == 0
    hits5 = gt_pos < 5
    hits10 = gt_pos < 10

    return 1.0/gt_rank, hits1, hits5, hits10


# %timeit calculate_metrics(gt_id, valid_classes, sorted_indices)
# %timeit calculate_metrics_2(gt_id, valid_classes, sorted_indices)
# %timeit numpy_calculate_metrics(gt_id, valid_classes, sorted_indices)

for cls in valid_classes:
    assert np.allclose(calculate_metrics(cls, valid_classes, sorted_indices), calculate_metrics_2(cls, valid_classes, sorted_indices))
    assert np.allclose(calculate_metrics(cls, valid_classes, sorted_indices), numpy_calculate_metrics(cls, valid_classes, sorted_indices))


AssertionError: 

In [64]:
@numba.njit(parallel=True)
def prep_input(sub_v, all_class_v, input_type):
    if input_type == 'concatenate':
        # Create repeated sub_v array
        repeated_sub = np.empty((len(all_class_v), len(sub_v) * 2))
        for i in range(len(all_class_v)):
            repeated_sub[i, :len(sub_v)] = sub_v
            repeated_sub[i, len(sub_v):] = all_class_v[i]

        return repeated_sub
    else:
        # Create repeated sub_v array and subtract
        repeated_sub = np.empty((len(all_class_v), len(sub_v)))
        for i in range(len(all_class_v)):
            repeated_sub[i] = sub_v
        return repeated_sub - all_class_v

def numpy_prep_input(sub_v, all_class_v, input_type):
    if input_type == 'concatenate':
        return np.concatenate([np.repeat(sub_v[None, :], len(all_class_v), axis=0), all_class_v], axis=1)
    else:
        return np.repeat(sub_v[None, :], len(all_class_v), axis=0) - all_class_v

sub_v = np.random.randn(768)
all_class_v = np.random.randn(40000, 768)
input_type = 'concatenate'

%timeit prep_input(sub_v, all_class_v, input_type)
%timeit numpy_prep_input(sub_v, all_class_v, input_type)

6.21 s ± 138 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
105 ms ± 2.63 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
