In [1]:
import datetime
import hashlib
import os

import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm

from neurovlm.data import get_data_dir
from neurovlm.models import Specter

# Text Encoding

This notebook encodes (title, abstract) pairs using Specter. Specter was trained on scientific (title, abstract) pairs, suggesting it is likely to perform well with medium-length queries. MiniLM-L6 is expected to better handle short-form queries.


Specter is used to encode (title, abstract) pairs into a 768-dimensional space.

> A. Singh, M. D'Arcy, A. Cohan, D. Downey, and S. Feldman, “SciRepEval: A Multi-Format Benchmark for Scientific Document Representations,” in Proc. Conf. Empirical Methods in Natural Language Processing (EMNLP), 2022. [Online]. Available: https://api.semanticscholar.org/CorpusID:254018137


In [2]:
# Load publications dataframe
data_dir = get_data_dir()
df_pubs = pd.read_parquet(data_dir / "publications_more.parquet")

# Configuration
batch_size = 4     # papers encoded (and saved) per batch file
overwrite = False  # True: ignore earlier results and encode every paper
suffix = ""        # appended to batch file names; stays empty on a full run

# Final latents written by the last cell of this notebook on an earlier run
path_latent_adhoc = data_dir / "latent_specter2_adhoc.pt"
path_latent_neuro = data_dir / "latent_specter2_neuro.pt"

# Check for the files that are actually loaded below. The "specter" batch
# directory is created before any latent file is written, so its existence
# alone does not guarantee that earlier results are available.
if not overwrite and path_latent_adhoc.exists() and path_latent_neuro.exists():
    # Append to existing results
    # Over development, more papers were added and this saved compute

    # NOTE: weights_only=False unpickles arbitrary objects. These files are
    # produced by this notebook; do not point this at untrusted files.
    existing_adhoc = torch.load(path_latent_adhoc, weights_only=False)
    existing_neuro = torch.load(path_latent_neuro, weights_only=False)

    # Read by key rather than by position so the result does not depend on
    # the order in which the dict entries were saved.
    latent_text_adhoc_exist = existing_adhoc["latent"]
    pmids_text_adhoc_exist = existing_adhoc["pmid"]
    latent_text_neuro_exist = existing_neuro["latent"]
    pmids_text_neuro_exist = existing_neuro["pmid"]

    # array_equal also fails cleanly when the two files differ in length
    assert np.array_equal(pmids_text_neuro_exist, pmids_text_adhoc_exist), \
        "existing neuro and adhoc latents cover different PMIDs"

    # Keep only the papers that have not been encoded yet
    mask = ~df_pubs["pmid"].isin(pmids_text_neuro_exist)
    df_pubs = df_pubs[mask]

    # Unique identifier derived from the current timestamp, so batch files
    # from this run do not collide with those of earlier runs
    suffix = "_" + hashlib.sha256(
        datetime.datetime.now().isoformat().encode("utf-8")
    ).hexdigest()[:8]

# Load specter
specter_neuro = Specter("allenai/specter2", adapter="Jerjes/neuro-specter2-multi-pool")
specter_adhoc = Specter("allenai/specter2_aug2023refresh", adapter="adhoc_query")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
There are adapters available but none are activated for the forward pass.


In [6]:
# Encode text in batches
#
# Every batch is written to its own pair of files under data_dir / "specter"
# (one per Specter variant), named by the batch's start offset.
os.makedirs(data_dir / "specter", exist_ok=True)

# One string per paper: title and abstract joined by a literal "[SEP]"
papers = [title + "[SEP]" + abstract
          for title, abstract in zip(df_pubs['name'], df_pubs['description'])]
pmids = df_pubs["pmid"].values

# batch_size is set once in the configuration cell; the stacking cell must
# use the same value because batch files are looked up by start offset.
# tqdm infers the total from len(range(...)), which also counts a final
# partial batch (len(papers) // batch_size would miss it).
for i in tqdm(range(0, len(papers), batch_size)):

    batch_papers = papers[i:i+batch_size]
    batch_pmids = pmids[i:i+batch_size]

    # Inference only: no gradients needed
    with torch.no_grad():
        latent_specter_neuro = specter_neuro(batch_papers)
        latent_specter_adhoc = specter_adhoc(batch_papers)

    torch.save(
        {"embeddings": latent_specter_neuro, "pmid": batch_pmids},
        data_dir / "specter" / f"encoded_text_specter2_neuro_{str(i).zfill(5)}{suffix}.pt",
        pickle_protocol=5
    )

    torch.save(
        {"embeddings": latent_specter_adhoc, "pmid": batch_pmids},
        data_dir / "specter" / f"encoded_text_specter2_adhoc_{str(i).zfill(5)}{suffix}.pt",
        pickle_protocol=5
    )

  0%|          | 0/628 [00:00<?, ?it/s]

In [7]:
# Stack batched vectors
#
# Reads back the per-batch files written by the encoding cell and assembles
# them into one matrix per Specter variant. batch_size (configuration cell)
# must match the value used while encoding: files are named by start offset.
latent_text_neuro = torch.zeros((len(df_pubs), 768), dtype=torch.float32)
latent_text_adhoc = torch.zeros((len(df_pubs), 768), dtype=torch.float32)

pmids_text_neuro = np.zeros(len(df_pubs), dtype=int)
pmids_text_adhoc = np.zeros(len(df_pubs), dtype=int)

for idx in range(0, len(df_pubs), batch_size):

    rows = slice(idx, idx + batch_size)

    # Read entries by key so the result does not depend on dict ordering.
    # NOTE: weights_only=False unpickles arbitrary objects; only load batch
    # files produced by this notebook.
    batch_neuro = torch.load(
        data_dir / "specter" / f"encoded_text_specter2_neuro_{str(idx).zfill(5)}{suffix}.pt", weights_only=False
    )
    latent_text_neuro[rows] = batch_neuro["embeddings"]
    pmids_text_neuro[rows] = batch_neuro["pmid"]

    batch_adhoc = torch.load(
        data_dir / "specter" /  f"encoded_text_specter2_adhoc_{str(idx).zfill(5)}{suffix}.pt", weights_only=False
    )
    latent_text_adhoc[rows] = batch_adhoc["embeddings"]
    pmids_text_adhoc[rows] = batch_adhoc["pmid"]

# Scale every row to unit L2 norm
latent_text_adhoc = latent_text_adhoc / torch.norm(latent_text_adhoc, dim=1)[:, None]
latent_text_neuro = latent_text_neuro / torch.norm(latent_text_neuro, dim=1)[:, None]

# Both variants must describe the same papers in the same order
assert np.array_equal(pmids_text_neuro, pmids_text_adhoc), \
    "neuro and adhoc batches cover different PMIDs"

In [9]:
# Merge the newly encoded papers with the previously saved results.
# A non-empty suffix is only set when existing latents were loaded earlier.
# NOTE: this cell rebinds its own inputs; run it once per session, since a
# second run would stack the existing rows again.
if suffix:
    # Existing rows first, new rows after, for each Specter variant
    latent_text_adhoc = torch.vstack([latent_text_adhoc_exist, latent_text_adhoc])
    pmids_text_adhoc = np.concatenate([pmids_text_adhoc_exist, pmids_text_adhoc])

    latent_text_neuro = torch.vstack([latent_text_neuro_exist, latent_text_neuro])
    pmids_text_neuro = np.concatenate([pmids_text_neuro_exist, pmids_text_neuro])

    # The two variants must still line up row for row
    assert np.all(pmids_text_adhoc == pmids_text_neuro)

In [11]:
# Sort and save
inds = np.argsort(pmids_text_neuro)
latent_text_adhoc = latent_text_adhoc[inds]
latent_text_neuro = latent_text_neuro[inds]
pmids_text_neuro = pmids_text_neuro[inds]
pmids_text_adhoc = pmids_text_adhoc[inds]

In [12]:
# Write the final embeddings together with their PMIDs, one file per Specter
# variant. These are the files reloaded at the top of the notebook when
# appending to existing results.
torch.save(
    {"latent": latent_text_adhoc, "pmid": pmids_text_adhoc},
    data_dir / "latent_specter2_adhoc.pt",
)
torch.save(
    {"latent": latent_text_neuro, "pmid": pmids_text_neuro},
    data_dir / "latent_specter2_neuro.pt",
)