In [1]:
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from tqdm.notebook import tqdm
import numpy as np

# Load the negative edge list and the abstract embedding table from disk.
negative_edges = pd.read_csv('../../data/training/negative/non_existing.csv')
bert_embeddings = pd.read_csv('../../data/abstracts_bert_embeddings.csv')

# Normalise every identifier column to a whitespace-free string so that edge
# endpoints and embedding keys are compared on equal terms.
for frame, id_column in (
    (bert_embeddings, 'paper_id'),
    (negative_edges, 'source'),
    (negative_edges, 'target'),
):
    frame[id_column] = frame[id_column].astype(str).str.strip()

# Result frame: the two edge endpoints plus a distance column that is filled
# in further down.
negative_edges_euclidean_distance_df = pd.DataFrame(columns=['source', 'target', 'euclidean_distance'])
for endpoint in ('source', 'target'):
    negative_edges_euclidean_distance_df[endpoint] = negative_edges[endpoint]

# Lookup table: paper_id -> {column name: value} for the remaining columns.
embedding_dict = bert_embeddings.set_index('paper_id').to_dict(orient='index')

# Memo of already-converted vectors, keyed by paper id. The same paper shows
# up in many edges, so each row is converted to an array only once.
_vec_cache = {}


def get_vec(pid):
    """Return the embedding vector stored for ``pid`` as a float32 array.

    The row for ``pid`` is looked up in the module-level ``embedding_dict``
    (a mapping of paper id to ``{column name: value}``). Only numeric values
    are kept, in the row's own order; anything else (e.g. strings) is
    skipped. Both Python numbers and NumPy scalar numbers count as numeric.

    Args:
        pid: Paper identifier, in the same form as the ``embedding_dict`` keys.

    Returns:
        A read-only one-dimensional ``np.float32`` array, or ``None`` when
        ``pid`` has no entry. The array is cached and shared between calls,
        so it is marked non-writeable; copy it before modifying.
    """
    cached = _vec_cache.get(pid)
    if cached is not None:
        return cached

    row = embedding_dict.get(pid)
    if row is None:
        # Misses are not cached: the lookup itself is already cheap.
        return None

    # np.integer / np.floating are listed explicitly because NumPy scalars
    # such as np.int64 or np.float32 are not subclasses of int / float and
    # would otherwise be dropped from the vector.
    numeric_types = (float, int, np.floating, np.integer)
    vec = np.array(
        [value for value in row.values() if isinstance(value, numeric_types)],
        dtype=np.float32,
    )
    # Guard the shared copy against accidental in-place modification.
    vec.setflags(write=False)
    _vec_cache[pid] = vec
    return vec


def fast_euclidean(source, target):
    """Return the Euclidean (L2) distance between two papers' vectors.

    Both identifiers are resolved through ``get_vec``. If either lookup
    yields ``None`` the distance is undefined and ``np.nan`` is returned;
    otherwise the result is the norm of the element-wise difference.
    """
    src_vec, tgt_vec = get_vec(source), get_vec(target)
    if src_vec is None or tgt_vec is None:
        return np.nan
    return np.linalg.norm(src_vec - tgt_vec)

# Compute the distance for every (source, target) row, in row order, while
# showing a progress bar sized to the number of rows.
edge_pairs = zip(
    negative_edges_euclidean_distance_df['source'],
    negative_edges_euclidean_distance_df['target'],
)
edge_count = len(negative_edges_euclidean_distance_df)
negative_edges_euclidean_distance_df['euclidean_distance'] = [
    fast_euclidean(src, tgt) for src, tgt in tqdm(edge_pairs, total=edge_count)
]

# Write the result next to the input edge list, without the index column.
negative_edges_euclidean_distance_df.to_csv(
    '../../data/training/negative/negative_edges_euclidean_distance.csv',
    index=False,
)

  0%|          | 0/1091861 [00:00<?, ?it/s]