# In [None]:
import pandas as pd

# Parse the abstracts file: each record is split on the literal '|--|'
# delimiter (given as an escaped regex, hence the python parsing engine) and
# the file has no header row.
abstracts = pd.read_csv(
    '../../data/abstracts.txt',
    sep=r'\|\-\-\|',
    header=None,
    index_col=False,
    engine='python',
)
abstracts.columns = ['paper_id', 'abstract']

# Missing abstracts become empty strings, so their word count below is 0.
abstracts['abstract'] = abstracts['abstract'].fillna('')
# Whitespace-delimited word count of every abstract.
abstracts['abstract_length'] = abstracts['abstract'].apply(
    lambda text: len(text.split())
)

# Generate BERT embeddings
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# Pre-trained 'bert-base-uncased' tokenizer (vocabulary).
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# Pre-trained 'bert-base-uncased' model, switched to evaluation mode for
# feed-forward use; eval() returns the module itself, so the call can be
# chained onto the load.
model = AutoModel.from_pretrained('bert-base-uncased').eval()
# Tokenize the input, run the model, and mean-pool the last hidden layer
def get_bert_embedding(text):
    """Return a mean-pooled embedding of ``text`` from the model's last layer.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), passed through the module-level ``model`` without
    gradient tracking, and the last-layer hidden states are averaged over the
    token axis. Padding positions are excluded from the average.

    Args:
        text: A string, or a list of strings to embed as one padded batch.

    Returns:
        A NumPy array with singleton dimensions squeezed out: 1-D
        ``(hidden_size,)`` for a single text (or a one-element list), 2-D
        ``(batch, hidden_size)`` for a list of several texts.
    """
    # Encode text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    # Run on whatever device the model currently lives on.
    inputs = inputs.to(model.device)
    # Forward pass, get hidden states
    with torch.no_grad():
        outputs = model(**inputs)
    # Only take the output embeddings from the last layer
    last_hidden_states = outputs.last_hidden_state
    # Weight every position by the attention mask so that padding tokens
    # (present when texts of different lengths are batched) do not dilute the
    # mean. For a single text the mask is all ones, i.e. a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
    # clamp guards against division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (last_hidden_states * mask).sum(dim=1) / token_counts
    # .cpu() is a no-op for CPU tensors and is required before .numpy()
    # when the model runs on another device.
    return pooled.squeeze().cpu().numpy()


# Generate embeddings for all abstracts.
# Iterate over the column values directly rather than looking each one up by
# index label, so the loop does not depend on `abstracts` having a default
# 0..n-1 index. tqdm takes the total from the Series length.
embeddings = [
    get_bert_embedding(text)
    for text in tqdm(abstracts['abstract'], desc="Generating BERT embeddings")
]
# Convert to DataFrame: one row per abstract, one column per embedding dimension
embeddings_df = pd.DataFrame(embeddings)
# Attach the ids by position. Assigning the Series itself would align on index
# labels and silently yield NaN ids if `abstracts` did not have a default index.
embeddings_df['paper_id'] = abstracts['paper_id'].to_numpy()
# Save to CSV
embeddings_df.to_csv('../../data/abstracts_bert_embeddings.csv', index=False)