# In [1]:
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from tqdm.notebook import tqdm
import numpy as np

# Score each candidate edge (source, target) with the dot product of the two
# papers' embedding rows and write the scored edge list to CSV.
predict_edges = pd.read_csv('../../data/test.txt', sep=',', header=None)
predict_edges.columns = ['source', 'target']
bert_embeddings = pd.read_csv('../../data/abstracts_bert_embeddings.csv')

# Normalise every ID to a stripped string so the join keys compare equal
# regardless of how pandas inferred their dtype in each file.
bert_embeddings['paper_id'] = bert_embeddings['paper_id'].astype(str).str.strip()
predict_edges['source'] = predict_edges['source'].astype(str).str.strip()
predict_edges['target'] = predict_edges['target'].astype(str).str.strip()

# Every column except the ID is treated as an embedding dimension.
embedding_cols = [col for col in bert_embeddings.columns if col != 'paper_id']

# Two views of the embedding table, keyed by the edge column they join on.
source_emb = bert_embeddings.rename(columns={'paper_id': 'source'})
target_emb = bert_embeddings.rename(columns={'paper_id': 'target'})

# Left joins keep exactly one output row per edge, in the original order, so
# the result can be assigned back to predict_edges positionally. (An inner
# join silently drops edges whose paper has no embedding, which makes the
# assignment below fail with a length mismatch.) validate='many_to_one'
# raises if paper_id is duplicated in the embedding table, which would
# otherwise multiply edge rows.
merged = (
    predict_edges
    .merge(source_emb, on='source', how='left', validate='many_to_one')
    .merge(target_emb, on='target', how='left', validate='many_to_one',
           suffixes=('_src', '_tgt'))
)

# Row-wise dot product of the source and target embedding matrices.
# Edges with a missing embedding on either side get NaN.
src_matrix = merged[[f"{col}_src" for col in embedding_cols]].to_numpy()
tgt_matrix = merged[[f"{col}_tgt" for col in embedding_cols]].to_numpy()
dot_products = np.einsum('ij,ij->i', src_matrix, tgt_matrix)

predict_edges['dot_product'] = dot_products

predict_edges.to_csv('../../data/training/predict/predict_edges_dot_product.csv', index=False)
