In [None]:
import pandas as pd


# Each record in the abstracts file has the form "<paper_id>|--|<abstract>".
# A separator longer than one character is interpreted by pandas as a regular
# expression, hence the escaped pipes and the explicit Python parsing engine.
abstracts = pd.read_csv(
    'data/abstracts.txt',
    sep=r'\|\-\-\|',
    header=None,        # first line is data, not column names
    index_col=False,    # keep paper ids as a regular column, not the index
    engine='python',
)
# Name the columns after parsing; this raises if a row did not split into
# exactly two fields.
abstracts.columns = ['paper_id', 'abstract']


import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np

# Tokenizer and encoder are loaded from the same pre-trained checkpoint so the
# vocabulary ids produced by one match the embedding table of the other.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# eval() returns the module itself and puts it in inference mode.
model = BertModel.from_pretrained('bert-base-uncased').eval()

def get_bert_embedding(text: str, max_length: int = 128) -> np.ndarray:
    """Return the BERT [CLS] embedding of a single text.

    Relies on the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to encode.
        max_length: Token length handed to the tokenizer. Longer inputs are
            truncated and shorter ones are padded up to this length.

    Returns:
        A 1-D NumPy array holding the last-layer hidden state at sequence
        position 0 (the [CLS] token).
    """
    with torch.no_grad():
        inputs = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            max_length=max_length,
            padding='max_length',
        )
        # Run on whatever device the weights live on, so the function keeps
        # working unchanged if the model is moved to a GPU.
        device = next(model.parameters()).device
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        outputs = model(**inputs)
        # Position 0 is the [CLS] token. squeeze(0) removes only the batch
        # axis, never a size-1 feature axis.
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze(0)
    # .numpy() is only valid on CPU tensors; .cpu() is a no-op when already there.
    return cls_embedding.cpu().numpy()

# Generate embeddings for all abstracts.
# A missing abstract is read as NaN, and str(NaN) is the literal text 'nan',
# which would be embedded as if it were a real word. Encode missing abstracts
# as empty text instead.
abstract_texts = abstracts['abstract'].fillna('')
embeddings = [
    get_bert_embedding(str(text))
    for text in tqdm(abstract_texts, desc="Generating BERT embeddings")
]

# Convert embeddings to a DataFrame: one row per abstract, one column per
# embedding dimension.
embeddings_df = pd.DataFrame(embeddings)
# Assign ids by position. Assigning the Series directly would align on index
# labels and silently produce NaN / shifted ids if `abstracts` does not carry
# a default 0..n-1 index.
embeddings_df['paper_id'] = abstracts['paper_id'].to_numpy()

# Save embeddings to a CSV file (optional)
embeddings_df.to_csv('data/bert_abstract_embeddings.csv', index=False)
