In [22]:
import json
import os
import re

import numpy as np
import pandas as pd
import psycopg2
import torch
import torch.nn.functional as F
from adapters import AutoAdapterModel
from transformers import AutoTokenizer, AutoModel

### Load Dataset

In [23]:
# Open the database connection. The password is taken from the PGPASSWORD
# environment variable when it is set, so real credentials never have to be
# written into the notebook; the fallback is the local development value.
conn = psycopg2.connect(
    dbname="scientilla",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "pwd"),
    host="localhost",
    port=5444,
)
# This cursor is also used by the UPDATE helpers further down, so it is left
# open at the end of this cell.
cur = conn.cursor()

# All publications and patents, in a stable (id-ordered) sequence.
query = """
SELECT ri.id, ri.data
FROM research_item AS ri
JOIN research_item_type AS rit
    ON ri.research_item_type_id = rit.id
WHERE rit.type IN ('publication', 'patent')
ORDER BY ri.id;
"""

# Fetch through the DB-API cursor and build the frame ourselves:
# pandas.read_sql only officially supports SQLAlchemy connectables (and
# sqlite3) and emits a UserWarning when handed a raw psycopg2 connection.
cur.execute(query)
df = pd.DataFrame(cur.fetchall(), columns=["id", "data"])
ids = df["id"].tolist()
data_array = df["data"].tolist()

  df = pd.read_sql(query, conn)


### Preprocessing

In [24]:
PATTERN = re.compile(
    r"^(Editorial|Preface|Erratum|Corrigendum|Introduction|Foreword|Guest Editorial)( to)?:?\s*",
    re.IGNORECASE
)

def clean_title(title: str) -> str:
    t = (title or "").strip()
    t = PATTERN.sub("", t).strip()
    return t

def build_clean_text_and_flags(item, sep_token: str | None = None):
    """
    Returns:
      - text (string for the embedding) or None if to be filtered out
      - has_abstract (bool)
      - title_word_count (int)
    """
    title_raw = item.get("title") or ""
    abstract_raw = item.get("abstract") or ""

    title_clean = clean_title(title_raw)
    title_clean = title_clean.strip()

    title_words = title_clean.split()
    title_word_count = len(title_words)

    abstract_clean = (abstract_raw or "").strip()
    has_abstract = bool(abstract_clean)

    if title_word_count <= 3 and not has_abstract:
        return None, has_abstract, title_word_count

    if has_abstract:
        if sep_token:
            text = f"{title_clean} {sep_token} {abstract_clean}"
        else:
            text = f"{title_clean}. {abstract_clean}"
    else:
        text = f"{title_clean}."

    text = text.strip()
    if not text:
        return None, has_abstract, title_word_count

    return text, has_abstract, title_word_count


In [25]:
# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and backbone are both loaded from the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
doc_model = AutoAdapterModel.from_pretrained("allenai/specter2_base")

# Load the adapter under the local name "proximity" and activate it right away.
adapter_name = doc_model.load_adapter(
    "allenai/specter2", source="hf", load_as="proximity", set_active=True
)
print(doc_model.active_adapters)

# Move the weights to the chosen device and switch to inference mode; the
# returned module is the cell's displayed value.
doc_model.to(device).eval()

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 59283.45it/s]
There are adapters available but none are activated for the forward pass.


Stack[proximity]


BertAdapterModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31090, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttentionWithAdapters(
              (query): LoRALinearTorch(
                in_features=768, out_features=768, bias=True
                (shared_parameters): ModuleDict()
                (loras): ModuleDict()
              )
              (key): LoRALinearTorch(
                in_features=768, out_features=768, bias=True
                (shared_parameters): ModuleDict()
                (loras): ModuleDict()
              )
              (value): LoRALinearTorch(
             

In [26]:
# Parallel lists describing the items that will receive an embedding ...
cleaned_texts, filtered_ids = [], []
has_abstract_flags, title_word_counts = [], []
# ... and the ids of the items that were filtered out.
bad_ids = []

separator = tokenizer.sep_token

for record_id, payload in zip(ids, data_array):
    embed_text, abstract_present, n_title_words = build_clean_text_and_flags(
        payload, sep_token=separator
    )
    if embed_text is not None:
        cleaned_texts.append(embed_text)
        filtered_ids.append(record_id)
        has_abstract_flags.append(abstract_present)
        title_word_counts.append(n_title_words)
    else:
        bad_ids.append(record_id)

print(f"Number of docs before filtering: {len(ids)}")
print(f"Number of docs after filtering (with embeddings): {len(filtered_ids)}")
print(f"Number of docs set to NULL embedding: {len(bad_ids)}")

Number of docs before filtering: 52968
Number of docs after filtering (with embeddings): 52536
Number of docs set to NULL embedding: 432


## Dataset Embedding Generation

In [None]:
def embed_input(model, text_batch):
    """Embed a batch of texts and L2-normalise the result.

    The texts are tokenised with the module-level ``tokenizer`` (padded,
    truncated to 512 tokens), run through ``model`` without gradient
    tracking, and the hidden state at position 0 of each sequence is taken
    as the embedding.

    Args:
        model: Model to run. It must return an output exposing
            ``last_hidden_state`` and provide ``parameters()`` and
            ``config.hidden_size``.
        text_batch: List of input strings.

    Returns:
        np.ndarray with one unit-norm row per input text. An empty batch
        yields an array of shape ``(0, hidden_size)``.
    """
    # Nothing to tokenise: return an empty matrix of the right width.
    if not isinstance(text_batch, str) and len(text_batch) == 0:
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)

    # Send the inputs to wherever this model's weights actually live instead
    # of trusting a global device: a model that was (re)loaded without being
    # moved would otherwise fail with a device mismatch on CUDA machines.
    model_device = next(model.parameters()).device

    inputs = tokenizer(
        text_batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
        return_token_type_ids=False,
    )
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    with torch.no_grad():
        output = model(**inputs)
        # First-token hidden state of every sequence in the batch.
        embeddings = output.last_hidden_state[:, 0, :]
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

    return embeddings.cpu().numpy()

def embedding_to_pgvector_str(vec):
    """Serialise a NumPy vector as the text literal '[v1,v2,...]' for a ::vector column."""
    components = [str(float(value)) for value in vec.tolist()]
    return f"[{','.join(components)}]"

def update_embedding(record_id, embedding):
    """Store one embedding in ``research_item.embedding_specter2``.

    Runs the UPDATE on the module-level cursor ``cur``; committing is left
    to the caller.

    Args:
        record_id: Value matched against ``research_item.id``.
        embedding: Vector passed through ``embedding_to_pgvector_str``.
    """
    params = (embedding_to_pgvector_str(embedding), record_id)
    cur.execute(
        """
        UPDATE research_item
        SET embedding_specter2 = %s
        WHERE id = %s
        """,
        params,
    )

def set_null_embedding(record_id):
    """Reset ``research_item.embedding_specter2`` to NULL for one row.

    Runs the UPDATE on the module-level cursor ``cur``; committing is left
    to the caller.

    Args:
        record_id: Value matched against ``research_item.id``.
    """
    params = (record_id,)
    cur.execute(
        """
        UPDATE research_item
        SET embedding_specter2 = NULL
        WHERE id = %s
        """,
        params,
    )

In [28]:
batch_size = 16

# Embed the cleaned texts in consecutive slices of `batch_size` items.
all_embeds = [
    embed_input(doc_model, cleaned_texts[start : start + batch_size])
    for start in range(0, len(cleaned_texts), batch_size)
]

# With no batches there is nothing to stack, so fall back to an empty
# (0, 768) float32 matrix.
embeds = (
    np.vstack(all_embeds)
    if all_embeds
    else np.zeros((0, 768), dtype=np.float32)
)


#### Save to db

In [29]:
# zip() silently stops at the shorter input, so a stale `embeds` or
# `filtered_ids` would leave rows un-updated without any error. Check the
# alignment before touching the database.
if len(filtered_ids) != len(embeds):
    raise ValueError(
        f"ids/embeddings mismatch: {len(filtered_ids)} ids vs {len(embeds)} embeddings"
    )

try:
    # Items that passed the filter get their freshly computed vector ...
    for rec_id, vector in zip(filtered_ids, embeds):
        update_embedding(record_id=rec_id, embedding=vector)

    # ... and filtered-out items have their embedding cleared.
    for rec_id in bad_ids:
        set_null_embedding(rec_id)

    conn.commit()
except Exception:
    # Leave the table untouched if any statement fails part-way through.
    conn.rollback()
    raise
finally:
    # Release the cursor and connection whether or not the write succeeded.
    cur.close()
    conn.close()

## Testing

#### From db

In [30]:
# Fresh connection for the search tests (the previous one was closed after
# saving). The password comes from PGPASSWORD when set; the fallback is the
# local development value.
conn = psycopg2.connect(
    dbname="scientilla",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "pwd"),
    host="localhost",
    port=5444,
)
cur = conn.cursor()

tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE: this rebinds `doc_model` to a model carrying the ad-hoc *query*
# adapter. Re-running the document-embedding cells after this one would
# therefore use the query adapter; restart the kernel before regenerating
# document embeddings.
doc_model = AutoAdapterModel.from_pretrained("allenai/specter2_base")

doc_model.load_adapter(
    "allenai/specter2_adhoc_query",
    source="hf",
    load_as="query",
    set_active=True,
)

# The reloaded model must live on the same device the inputs are sent to,
# and must be in inference mode so dropout cannot perturb the embeddings.
doc_model.to(device)
doc_model.eval()


def embedding_to_pgvector_str(vec):
    """Render a NumPy vector as the text form '[v1,v2,...]' used with a ::vector cast."""
    # NOTE(review): a helper with this exact name is defined earlier in the
    # notebook; this definition shadows it.
    body = ",".join(map(str, map(float, vec.tolist())))
    return "[" + body + "]"


def clean_item(item):
    """Build a plain display string from an item payload.

    Args:
        item: Dict with optional "title" and "abstract" keys, or a
            JSON-encoded string of such a dict.

    Returns:
        "title. abstract" when the abstract is non-empty after stripping,
        otherwise just the stripped title.
    """
    record = json.loads(item) if isinstance(item, str) else item

    title, abstract = (
        (record.get(field) or "").strip() for field in ("title", "abstract")
    )

    combined = f"{title}. {abstract}" if abstract else title
    return combined.strip()


def search_specter2(query, top_k=5):
    """Print the ``top_k`` publications nearest to a free-text query.

    The query is embedded with ``embed_input`` using the module-level
    ``doc_model``, rows are ranked by the ``<=>`` distance to
    ``embedding_specter2``, and for each hit the id, a score
    (``1 - distance``) and a preview of at most 200 characters are printed.

    Args:
        query: Free-text search string.
        top_k: Maximum number of rows to fetch and print.
    """
    sql = """
    WITH q AS (
        SELECT %s::vector AS emb
    )
    SELECT
        ri.id,
        ri.data,
        ri.embedding_specter2 <=> q.emb AS distance
    FROM research_item AS ri
    JOIN research_item_type AS rit
        ON ri.research_item_type_id = rit.id
    JOIN q
        ON TRUE
    WHERE rit.type = 'publication'
      AND ri.embedding_specter2 IS NOT NULL
    ORDER BY ri.embedding_specter2 <=> q.emb
    LIMIT %s;
    """

    # Embed the single query and serialise it for the ::vector cast above.
    query_vector = embedding_to_pgvector_str(embed_input(doc_model, [query])[0])

    # A short-lived cursor keeps this read separate from the module-level one.
    # Optional tuning, currently disabled: SET hnsw.ef_search = 100;
    with conn.cursor() as cursor:
        cursor.execute(sql, (query_vector, top_k))
        rows = cursor.fetchall()

    print(f"\n=== Query: {query!r} ===")
    for rank, (doc_id, data_json, distance) in enumerate(rows, start=1):
        score = 1.0 - float(distance)

        text = clean_item(data_json)
        # Long texts are cut at 200 characters and marked with an ellipsis.
        preview = text if len(text) <= 200 else text[:200] + "..."

        print(f"\n[{rank}] id={doc_id}  score={score:.3f}")
        print(f"     text: {preview}")

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 81442.80it/s]
There are adapters available but none are activated for the forward pass.


In [31]:
# One full paper title followed by progressively broader keyword queries.
test_queries = [
    'Multiphoton Microscopy Advances Toward Super Resolution',
    'renewable energies',
    'heart',
    'cardiology',
    'nature',
    'biology',
]

for query_text in test_queries:
    search_specter2(query_text, top_k=5)


=== Query: 'Multiphoton Microscopy Advances Toward Super Resolution' ===

[1] id=2148  score=0.864
     text: Multiphoton Microscopy Advances Toward Super Resolution

[2] id=17490  score=0.825
     text: The development of microscopy for super-resolution: Confocal microscopy, and image scanning microscopy. Optical methods of super-resolution microscopy, such as confocal microscopy, structured illumina...

[3] id=36633  score=0.815
     text: Near infrared super-resolution microscopy.

[4] id=27261  score=0.815
     text: Super-resolution fluorescence microscopy. Super resolution microscopy circumvents what seemed to be a fundamental limit in optical microscopy. Today we can say that optical microscopy is todaly, in pr...

[5] id=4626  score=0.801
     text: The 2015 super-resolution microscopy roadmap. Far-field optical microscopy using focused light is an important tool in a number of scientific disciplines including chemical, (bio)physical and biomedic...

=== Query: 'renewable ener