In [28]:
import psycopg2
import pandas as pd
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from adapters import AutoAdapterModel
import torch.nn.functional as F
import re

### Connect to DB and load adapter for the query

In [29]:
# NOTE(review): connection settings (including the password) are hardcoded;
# consider reading them from environment variables or a secrets store.
conn = psycopg2.connect(
    dbname="scientilla",
    user="postgres",
    password="pwd",
    host="localhost",
    port=5444,
)
# Module-level cursor used by the DB update helpers defined further down.
cur = conn.cursor()

tokenizer = AutoTokenizer.from_pretrained("allenai/specter2_base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
doc_model = AutoAdapterModel.from_pretrained("allenai/specter2_base")

# Load the SPECTER2 ad-hoc query adapter under the name "query" and activate it.
adapter_name = doc_model.load_adapter(
    "allenai/specter2_adhoc_query",
    source="hf",
    load_as="query",
    set_active=True,
)

# Put the model (base weights + freshly loaded adapter) on `device` so that
# inference runs on the GPU when one is available and the model and its
# inputs live on the same device. This must happen after load_adapter so the
# adapter weights are moved as well.
doc_model.to(device)
# Inference only: make sure dropout is disabled.
doc_model.eval()

# Keep the adapter name as the cell's displayed value.
adapter_name

Fetching 4 files: 100%|██████████| 4/4 [00:00<00:00, 27103.74it/s]
There are adapters available but none are activated for the forward pass.


'query'

In [None]:

def embed_input(model, text_batch):
    """Compute L2-normalized embeddings for a batch of texts.

    The texts are tokenized with the module-level ``tokenizer`` (padded to the
    longest item, truncated to 512 tokens, no token type ids) and run through
    ``model`` with gradient tracking disabled. The hidden state at the first
    token position of every sequence is taken as its embedding.

    Args:
        model: Transformer model with SPECTER2 adapter.
        text_batch: List of input strings.

    Returns:
        np.ndarray of shape (batch, hidden) whose rows are L2-normalized.
    """
    # Send the inputs to the device the model actually lives on, so the call
    # works even if the model was never moved to the global `device`. Fall
    # back to the global `device` for objects that expose no parameters.
    params = getattr(model, "parameters", None)
    first_param = next(params(), None) if callable(params) else None
    target_device = first_param.device if first_param is not None else device

    encoded = tokenizer(
        text_batch,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
        return_token_type_ids=False,
    )
    encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    with torch.no_grad():
        output = model(**encoded)
        # First-token hidden state of each sequence -> (batch, hidden).
        first_token = output.last_hidden_state[:, 0, :]
        normalized = torch.nn.functional.normalize(first_token, p=2, dim=1)

    return normalized.cpu().numpy()

def embedding_to_pgvector_str(vec):
    """Serialize a NumPy vector as the text literal '[v1,v2,...]' for a ::vector cast."""
    components = [str(float(value)) for value in vec.tolist()]
    return "[{}]".format(",".join(components))

# DB update functions
def update_embedding(record_id, embedding):
    """Write `embedding` into `research_item.embedding_specter2` for one row.

    The vector is serialized with `embedding_to_pgvector_str` and the UPDATE
    is executed on the module-level cursor `cur`. No commit is issued here.

    Args:
        record_id: Value matched against `research_item.id`.
        embedding: Vector accepted by `embedding_to_pgvector_str`.
    """
    params = (embedding_to_pgvector_str(embedding), record_id)
    statement = """
        UPDATE research_item
        SET embedding_specter2 = %s
        WHERE id = %s
        """
    cur.execute(statement, params)

def set_null_embedding(record_id):
    """Reset `research_item.embedding_specter2` to NULL for one row.

    The UPDATE is executed on the module-level cursor `cur`. No commit is
    issued here.

    Args:
        record_id: Value matched against `research_item.id`.
    """
    statement = """
        UPDATE research_item
        SET embedding_specter2 = NULL
        WHERE id = %s
        """
    params = (record_id,)
    cur.execute(statement, params)

## Testing

In [None]:
def clean_item(item):
    """Build a single text string from a research item's title and abstract.

    Args:
        item: Mapping with optional "title" and "abstract" entries, a JSON
            string encoding such a mapping, or None.

    Returns:
        "<title>. <abstract>" when both are present, the one that is present
        when the other is missing, and "" when neither is (or when the item
        itself is empty/None).

    Raises:
        json.JSONDecodeError: If `item` is a string that is not valid JSON.
    """
    if isinstance(item, str):
        item = json.loads(item)

    # A NULL column, a JSON `null` or an empty object carries no text.
    if not item:
        return ""

    title = (item.get("title") or "").strip()
    abstract = (item.get("abstract") or "").strip()

    # Join only the non-empty parts so a missing title does not produce a
    # result that starts with ". ".
    return ". ".join(part for part in (title, abstract) if part)


def search_specter2(query, top_k=5):
    """Print the `top_k` publications nearest to `query` by embedding distance.

    The query is embedded with `embed_input(doc_model, ...)`, serialized with
    `embedding_to_pgvector_str`, and compared against
    `research_item.embedding_specter2` using the `<=>` distance operator.
    Only rows of type 'publication' with a non-NULL embedding are considered.

    For every hit the function prints its rank, id, a score computed as
    `1 - distance`, and the first 200 characters of the text returned by
    `clean_item`. Nothing is returned.

    Args:
        query: Free-text search string.
        top_k: Maximum number of rows to fetch (SQL LIMIT).
    """
    nearest_sql = """
    WITH q AS (
        SELECT %s::vector AS emb
    )
    SELECT
        ri.id,
        ri.data,
        ri.embedding_specter2 <=> q.emb AS distance
    FROM research_item AS ri
    JOIN research_item_type AS rit
        ON ri.research_item_type_id = rit.id
    JOIN q
        ON TRUE
    WHERE rit.type = 'publication'
      AND ri.embedding_specter2 IS NOT NULL
    ORDER BY ri.embedding_specter2 <=> q.emb
    LIMIT %s;
    """

    # Embed the single query and turn it into a pgvector text literal.
    query_vector = embed_input(doc_model, [query])[0]
    params = (embedding_to_pgvector_str(query_vector), top_k)

    # Use a short-lived cursor of its own for the lookup.
    with conn.cursor() as cursor:
        # Optional tuning, currently disabled: SET hnsw.ef_search = 1000;
        cursor.execute(nearest_sql, params)
        hits = cursor.fetchall()

    print(f"\n=== Query: {query!r} ===")
    for rank, hit in enumerate(hits, 1):
        doc_id, data_json, distance = hit
        similarity = 1.0 - float(distance)

        text = clean_item(data_json)
        preview = text[:200]
        if len(text) > 200:
            preview += "..."

        print(f"\n[{rank}] id={doc_id}  score={similarity:.3f}")
        print(f"     text: {preview}")

In [None]:
# Free-text queries used to eyeball the retrieval quality.
test_queries = [
    "contrastive learning for cross-domain video recognition",
]

# Print the 50 nearest publications for each query.
for test_query in test_queries:
    search_specter2(test_query, top_k=50)


=== Query: 'contrastive learning for cross-domain video recognition' ===

[1] id=7797  score=0.802
     text: Dual-Head Contrastive Domain Adaptation for Video Action Recognition. Unsupervised domain adaptation (UDA) methods have become very popular in computer vision. However, while several techniques have b...

[2] id=22748  score=0.796
     text: Contrastive Learning for Cross-Domain Open World Recognition. The ability to evolve is fundamental for any valuable autonomous agent whose knowledge cannot remain limited to that injected by the manuf...

[3] id=9855  score=0.784
     text: Boosting binary masks for multi-domain learning through affine transformations. In this work, we present a new, algorithm for multi-domain learning. Given a pretrained architecture and a set of visual...

[4] id=28484  score=0.780
     text: MultiDIAL: Domain Alignment Layers for (Multisource) Unsupervised Domain Adaptation. One of the main challenges for developing visual recognition systems working in