In [1]:
import os
from tqdm.notebook import tqdm
from hashlib import sha256
import pandas as pd
import torch
from neurovlm.data import data_dir
from neurovlm.models import Specter

In [2]:
# Load the NeuroWiki table and derive a short, content-based id for every row.
# NOTE: the input is read from an absolute, machine-specific path; point it at
# your own copy of neurowiki.parquet when running this elsewhere.
df = pd.read_parquet("/Users/ryanhammonds/Desktop/wiki_and_cogatlas/neurowiki.parquet")

# A missing title/summary would turn the whole concatenation into NaN (a float),
# which then fails on .encode() below; treat missing fields as empty strings.
text = (df["title"].fillna("") + " [SEP] " + df["summary"].fillna("")).tolist()

# id = first 8 hex characters of the SHA-256 digest of the UTF-8 encoded text.
# `hash` shadows the builtin, but the name is kept because the batch-encoding
# cell slices it (`hash[i:i+batch_size]`).
hash = [sha256(t.encode("utf-8")).hexdigest()[:8] for t in text]
df["id"] = hash

# 8 hex characters are only 32 bits, and identical texts always share an id,
# so report duplicates instead of silently treating `id` as a unique key.
n_duplicate_ids = int(df["id"].duplicated().sum())
if n_duplicate_ids:
    import warnings

    warnings.warn(
        f"{n_duplicate_ids} rows share an id with an earlier row; "
        "'id' is not a unique key for this table"
    )

In [3]:
# Encode text in batches, writing one checkpoint file per batch so the
# embeddings can be reassembled later without re-running the model.
specter_wiki_dir = data_dir / "specter_wiki"
os.makedirs(specter_wiki_dir, exist_ok=True)

specter_adhoc = Specter("allenai/specter2_aug2023refresh", adapter="adhoc_query")

batch_size = 4

# tqdm takes its total from len(range(...)), i.e. ceil(len(text) / batch_size).
# Passing len(text)//batch_size undercounts by one whenever the last batch is
# partial, so no explicit total is given.
for i in tqdm(range(0, len(text), batch_size)):

    batch_text = text[i:i+batch_size]
    ids = hash[i:i+batch_size]  # plain list slicing, no need for no_grad

    with torch.no_grad():
        latent_specter = specter_adhoc(batch_text)

    # The file name is keyed by the batch start offset `i`. zfill(4) only pads
    # offsets below 1000; the format is left as-is because the cells that
    # reload these files build the name the same way.
    torch.save(
        {"embeddings": latent_specter, "id": ids},
        specter_wiki_dir / f"encoded_text_specter2_adhoc_{str(i).zfill(4)}.pt",
        pickle_protocol=5
    )

There are adapters available but none are activated for the forward pass.


  0%|          | 0/9602 [00:00<?, ?it/s]

In [3]:
# Reassemble the per-batch checkpoint files into a single (len(text), 768)
# tensor plus a flat list of ids, in batch order.
# NOTE(review): nothing below calls `specter_adhoc`; the model is still built
# here only so the name stays defined when this cell is run on its own.
specter_adhoc = Specter("allenai/specter2_aug2023refresh", adapter="adhoc_query")
batch_size = 4
latent_specter = torch.zeros((len(text), 768))
ids_specter = []

# No explicit total: tqdm reads len(range(...)), which also counts a partial
# final batch (len(text)//batch_size does not).
for i in tqdm(range(0, len(text), batch_size)):

    # weights_only=False unpickles arbitrary objects; these files are expected
    # to be the ones written by the batch-encoding cell.
    enc = torch.load(data_dir / "specter_wiki" / f"encoded_text_specter2_adhoc_{str(i).zfill(4)}.pt", weights_only=False)

    with torch.no_grad():
        # Slice assignment clips at the end of the tensor, so a shorter final
        # batch fits as long as the stored batch has the matching row count.
        latent_specter[i:i+batch_size] = enc['embeddings']

    ids_specter.extend(enc["id"])

There are adapters available but none are activated for the forward pass.


  0%|          | 0/9602 [00:00<?, ?it/s]

In [4]:
# Persist the stacked embeddings next to the ids they belong to.
specter_wiki_payload = {"latent": latent_specter, "id": ids_specter}
torch.save(specter_wiki_payload, data_dir / "latent_specter_wiki.pt")

In [5]:
def _load_proj_head(filename):
    """Load a pickled projection head from ``data_dir``, move it to CPU and set eval mode.

    ``weights_only=False`` unpickles arbitrary Python objects, so this must only
    be pointed at trusted checkpoint files.
    """
    return torch.load(data_dir / filename, weights_only=False).to("cpu").eval()


# Plain string literals: the original f-strings had no placeholders.
proj_head_mse = _load_proj_head("proj_head_mse_sparse_adhoc.pt")
proj_head_infonce = _load_proj_head("proj_head_text_infonce.pt")

In [6]:
# Project every stored batch through both heads and L2-normalise each row.
# Relies on `batch_size` matching the value used when the batch files were written.
latent_specter_aligned_mse = torch.zeros((len(text), 384))
latent_specter_aligned_infonce = torch.zeros((len(text), 384))
ids_specter = []

# (output tensor, projection head) pairs, so the project-and-normalise step is
# written once instead of being copy-pasted per head.
aligned_targets = (
    (latent_specter_aligned_mse, proj_head_mse),
    (latent_specter_aligned_infonce, proj_head_infonce),
)

# No explicit total: tqdm reads len(range(...)), which also counts a partial
# final batch (len(text)//batch_size does not).
for i in tqdm(range(0, len(text), batch_size)):

    # weights_only=False unpickles arbitrary objects; these files are expected
    # to be the ones written by the batch-encoding cell.
    enc = torch.load(data_dir / "specter_wiki" / f"encoded_text_specter2_adhoc_{str(i).zfill(4)}.pt", weights_only=False)

    with torch.no_grad():
        for aligned, proj_head in aligned_targets:
            projected = proj_head(enc['embeddings'])
            # Divide each row by its own L2 norm; keepdim=True keeps the norm
            # as a column so it broadcasts across the feature axis.
            aligned[i:i+batch_size] = projected / projected.norm(dim=1, keepdim=True)

    ids_specter.extend(enc["id"])

  0%|          | 0/9602 [00:00<?, ?it/s]

In [7]:
# Write each aligned embedding matrix to its own file, paired with the ids.
# The dict preserves insertion order, so the MSE file is written first.
aligned_outputs = {
    "latent_specter_wiki_aligned_adhoc_mse.pt": latent_specter_aligned_mse,
    "latent_specter_wiki_aligned_adhoc_infonce.pt": latent_specter_aligned_infonce,
}

for aligned_filename, aligned_latent in aligned_outputs.items():
    torch.save(
        {"latent": aligned_latent, "id": ids_specter},
        data_dir / aligned_filename
    )

In [8]:
# Sanity-check the saved artifacts from disk. The imports are repeated so this
# cell can also be run on its own in a fresh kernel.
import torch
import pandas as pd
from neurovlm.data import data_dir

# NOTE: neurowiki_with_ids.parquet is written by the `df.to_parquet(...)` cell
# at the end of this notebook; on a first run that file must exist before this
# check can pass.
df = pd.read_parquet(data_dir / "neurowiki_with_ids.parquet")

# Check the two aligned files this notebook actually writes. The previous
# target, "latent_specter_wiki_aligned.pt", is not produced by any cell here.
for aligned_filename in (
    "latent_specter_wiki_aligned_adhoc_mse.pt",
    "latent_specter_wiki_aligned_adhoc_infonce.pt",
):
    wiki_specter = torch.load(data_dir / aligned_filename)
    latent, ids = wiki_specter["latent"], wiki_specter["id"]

    # one id per embedding row
    assert len(ids) == latent.shape[0], aligned_filename
    # every id in df has an entry among the stored ids (membership only, not row order)
    assert df["id"].isin(ids).all(), aligned_filename
    # some floating point error, row norm should be approx 1
    assert (torch.abs(latent.norm(dim=1) - 1) < 1e-6).all(), aligned_filename

In [9]:
# Store the table, including its `id` column, alongside the embedding files.
neurowiki_ids_path = data_dir / "neurowiki_with_ids.parquet"
df.to_parquet(neurowiki_ids_path)