In [1]:
import requests
import xmltodict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from dotenv import load_dotenv
import os
import torch
import time
from tqdm import tqdm

In [2]:
print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 4060
Using device: cuda


In [3]:
from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import torch  # Required for handling tensors

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_base')
model = AutoAdapterModel.from_pretrained('allenai/specter2_base')

# Load the adapter for Specter2
model.load_adapter("allenai/specter2", source="hf", load_as="specter2", set_active=True)

  from .autonotebook import tqdm as notebook_tqdm
BertAdapterModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Fetching 4 files: 100%|██████████| 4/4 [00:00<?, ?it/s]
  state_dict = torch.load(weights_file, map_location="cpu")


'specter2'

In [4]:
df = pd.read_json('Papers/all_papers.json', orient='records', lines=True)
len(df)

7169

In [5]:
from transformers import AutoTokenizer
import torch


# Function to handle long texts with chunking
def embed_long_text(title, abstract, tokenizer, model, max_length=512):
    chunk_size = max_length - 10 
    chunks = [abstract[i:i + chunk_size] for i in range(0, len(abstract), chunk_size)]

    # Create tokenized inputs for each chunk
    embeddings = []
    for chunk in chunks:
        text = title + tokenizer.sep_token + chunk
        inputs = tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors="pt",
            return_token_type_ids=False,
            max_length=max_length
        )
        with torch.no_grad():  # Avoid gradient calculations for inference
            output = model(**inputs)
        # Take the first token in the chunk as the embedding
        embeddings.append(output.last_hidden_state[:, 0, :])

    # Aggregate embeddings (e.g., average)
    aggregated_embedding = torch.mean(torch.stack(embeddings), dim=0)
    return aggregated_embedding

batches = 50
batch_embeddings = []

embedding_file = "Data/embeddings.pt"
if not os.path.exists(embedding_file) or os.path.getsize(embedding_file) == 0:
    torch.save([], embedding_file)  # Save an empty list initially

for idx, paper in tqdm(df.iterrows(), desc='Embedding Papers'):
    embedding = embed_long_text(
        paper['title'],
        paper['summary'],
        tokenizer,
        model
    )
    batch_embeddings.append(embedding)
    if len(batch_embeddings) == batches:
        existing_embeddings = torch.load(embedding_file)
        existing_embeddings.extend(batch_embeddings)
        torch.save(existing_embeddings, embedding_file)
        batch_embeddings = []

# Save any remaining embeddings in the batch
if batch_embeddings:
    existing_embeddings = torch.load(embedding_file)
    existing_embeddings.extend(batch_embeddings)
    torch.save(existing_embeddings, embedding_file)
    print(f"Final total embeddings saved: {len(existing_embeddings)}")

  existing_embeddings = torch.load(embedding_file)
Embedding Papers: 7169it [30:00,  3.98it/s]
  existing_embeddings = torch.load(embedding_file)


Final total embeddings saved: 9833
