In [2]:
from tqdm.notebook import tqdm
import re
from math import ceil
import numpy as np
import pandas as pd
import torch
from sklearn.feature_extraction.text import CountVectorizer

from neurovlm.retrieval_resources import (
    _load_dataframe, _load_specter, _load_latent_text
)
from neurovlm.data import data_dir

# Corpus Extraction
Extract n-grams for the training corpus. N-grams are weighted by cosine similarity to article embeddings, i.e., if an n-gram is highly similar to the articles it gets a value near 1; otherwise it gets a value near 0.

In [16]:
def extract_ngrams(docs, ngram_range, min_count=100):
    """Count n-grams in ``docs`` and keep only the frequent ones.

    Parameters
    ----------
    docs : iterable of str
        Raw documents handed to ``CountVectorizer``.
    ngram_range : tuple of (int, int)
        Smallest and largest n-gram size, forwarded to ``CountVectorizer``.
    min_count : int, optional
        Minimum number of occurrences, summed over all documents, that an
        n-gram needs in order to be kept. Defaults to 100.

    Returns
    -------
    X : np.ndarray, shape (n_docs, n_kept)
        Dense per-document counts of the retained n-grams.
    feature_names : np.ndarray, shape (n_kept,)
        The n-gram string belonging to each column of ``X``.
    """
    vectorizer = CountVectorizer(
        ngram_range=ngram_range,
        stop_words="english",
        min_df=1
    )

    # Learn the vocabulary and count in one pass, so the corpus is only
    # tokenized once. Result shape: (n_docs, n_features), sparse.
    X = vectorizer.fit_transform(docs)

    feature_names = vectorizer.get_feature_names_out()

    # Column sums are total occurrences per n-gram; flatten to a 1-D boolean
    # mask regardless of whether the sparse sum comes back 1-D or 2-D.
    mask = np.asarray(X.sum(axis=0)).ravel() >= min_count

    # Densify only the columns that survive the frequency filter.
    X = X[:, mask].toarray()
    feature_names = feature_names[mask]

    return X, feature_names

# manual cleaning
# Plain-text fragments. They are regex-escaped below and therefore behave as
# substrings when the combined pattern is searched: "mri" also hits "fmri",
# "show" also hits "shown", "result" also hits "results".
DROP_SUBSTRINGS = [
    # study-like language
    "study", "studies", "result", "indicate", "show", "related",
    "differences", "significant", "effect", "role", "measure",
    "displayed", "involved", "examined", "associated", "altered",
    "performed", "demonstrated", "conclus", "correlate", "individuals",
    "common", "prior",
    # too general
    "brain", "neural", "neuroimaging", "mri", "fmri", "connectivity",
    "diagnosed", "patients", "little", "known", "activation", "blood",
    "alterations", "neuroscience", "people",
]

# Raw regular expressions, inserted into the combined pattern unescaped.
# Entries anchored with ^...$ only hit an n-gram that equals the term exactly.
DROP_REGEXES = [
    # anything that starts with "cortex"
    r"^cortex",
    # general single terms
    r"^ventral$", r"^frontal$", r"^neuronal$", r"^cognitive$",
    # NOTE(review): "^resting_state$" needs a literal underscore; a bigram is
    # presumably spelled "resting state" and would not be hit - confirm.
    r"^cerebral$", r"^resting_state$",  r"^disorder$",
    r"^neuropsychological$", r"^cognition$", r"^stimulus$",
    r"^dysfunction$", r"^imaging$", r"^functional$",
    r"^functional imaging$", r"^task performance$", r"^impairments$",
    r"^traits$", r"^cognitive abilities$", r"^imaging dti$",
    # [SEP] token
    r"\bsep\b",
]

# One alternation covering every drop rule; a term is rejected when any
# alternative is found in it.
pattern = "|".join(
    [re.escape(s) for s in DROP_SUBSTRINGS] +  # plain terms
    DROP_REGEXES                               # regex terms
)

In [None]:
# load text
df = _load_dataframe()
# One document per row: name and description joined by a " [SEP] " marker.
text = df["name"] + " [SEP] " + df["description"]

# extract n-grams
ngram_matrix_path = data_dir / "ngram_matrix.npy"
ngram_labels_path = data_dir / "ngram_labels.npy"

# Recompute unless both arrays that the cached branch loads are on disk.
if not (ngram_matrix_path.exists() and ngram_labels_path.exists()):
    X_uni, features_uni = extract_ngrams(text, (1, 1))
    X_bi, features_bi = extract_ngrams(text, (2, 2))
    X_tri, features_tri = extract_ngrams(text, (3, 3))
    X = np.hstack((X_uni, X_bi, X_tri))
    # np.concatenate works on every NumPy release; np.concat needs NumPy >= 2.0.
    features = np.concatenate((features_uni, features_bi, features_tri))

    # Keep only the n-grams that match none of the manual drop rules.
    mask = ~pd.Series(features).str.contains(pattern, case=False, na=False, regex=True).to_numpy()
    features = features[mask]
    X = X[:, mask]
    # Reorder rows into ascending "pmid" order
    # (presumably to line up with the latent text loaded below - confirm).
    X = X[df["pmid"].argsort().to_numpy()]
    np.save(ngram_matrix_path, X)
    np.save(ngram_labels_path, features.astype(str))
    np.save(data_dir / "ngram_mask.npy", mask)
else:
    # load pre-computed
    X = np.load(ngram_matrix_path)
    features = np.load(ngram_labels_path)

# load latent text
latent, pmids = _load_latent_text()

In [None]:
# specter embeddings for ngrams
specter = _load_specter()
specter.specter = specter.specter.eval()

ngram_emb_path = data_dir / "ngram_emb.pt"

if not ngram_emb_path.exists():
    batch_size = 512
    # True division before ceil so a final partial batch is counted.
    n_batches = ceil(len(features) / batch_size)
    ngram_emb = []
    # Inference only: no gradients are needed for any batch.
    with torch.no_grad():
        for i in tqdm(range(0, len(features), batch_size), total=n_batches):
            ngram_emb.append(specter(features[i:i+batch_size].tolist()))
    ngram_emb = torch.vstack(ngram_emb)
    ngram_emb = ngram_emb / ngram_emb.norm(dim=1)[:, None] # unit vector
    torch.save(ngram_emb, ngram_emb_path)
else:
    # load pre-computed so `ngram_emb` is defined on a fresh kernel as well
    ngram_emb = torch.load(ngram_emb_path)

There are adapters available but none are activated for the forward pass.
