## Connect DB

In [1]:
import json
import os

import pandas as pd
import psycopg2

# Connection settings are read from SCIENTILLA_DB_* environment variables so
# real credentials never have to be written into the notebook. The fallbacks
# are the local-development values this cell used before.
conn = psycopg2.connect(
    dbname=os.environ.get("SCIENTILLA_DB_NAME", "scientilla"),
    user=os.environ.get("SCIENTILLA_DB_USER", "postgres"),
    password=os.environ.get("SCIENTILLA_DB_PASSWORD", "pwd"),
    host=os.environ.get("SCIENTILLA_DB_HOST", "localhost"),
    # Environment values are strings; the port is converted back to an int.
    port=int(os.environ.get("SCIENTILLA_DB_PORT", "5444")),
)
# Module-level cursor kept for ad-hoc queries in other cells.
cur = conn.cursor()


In [2]:
def clean_item(item):
    """Build the plain-text representation of a research item.

    Args:
        item: Mapping with optional ``"title"`` and ``"abstract"`` entries.
            ``None`` is treated like an empty mapping.

    Returns:
        ``"<title>. <abstract>"`` when the abstract is non-empty, otherwise
        ``"<title>."``. Leading and trailing whitespace is stripped from the
        result. An item with neither field therefore yields ``"."``.
    """
    # A missing payload behaves like an empty item instead of raising.
    item = item or {}
    # `or ""` maps both missing keys and explicit None/empty values to "".
    title = item.get("title") or ""
    abstract = item.get("abstract") or ""
    if abstract:
        text = f"{title}. {abstract}"
    else:
        text = f"{title}."
    return text.strip()

## Testing

#### From DB

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

# Sentence-embedding model shared by the cells below; search() calls
# model.encode() on it to embed each query string.
# NOTE(review): the embeddings stored in research_item.embedding were
# presumably produced with this same model - confirm before comparing scores.
model = SentenceTransformer("sentence-transformers/allenai-specter")

def embedding_to_pgvector_str(vec):
    """Serialize a NumPy vector as text that can be cast with ``::vector``.

    Args:
        vec: Array-like object exposing ``.tolist()`` (e.g. a 1-D ``np.ndarray``).

    Returns:
        The components formatted as ``'[v1,v2,...]'``, comma-separated with
        no spaces, each rendered through ``str(float(...))``.
    """
    components = [str(float(value)) for value in vec.tolist()]
    return f"[{','.join(components)}]"

def search(query, top_k=5):
    """Print the publications whose stored embedding is closest to ``query``.

    The query is embedded with the module-level ``model`` and compared, using
    the pgvector ``<=>`` distance operator, against ``research_item.embedding``
    for rows whose type is ``'publication'`` and whose embedding is not NULL.

    Args:
        query: Free-text search string.
        top_k: Maximum number of rows to fetch and print.

    Returns:
        None. The ranked results are written to stdout.
    """
    # Query embedding, normalised by the encoder.
    q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
    q_emb_str = embedding_to_pgvector_str(q_emb)

    # SQL query with pgvector cosine distance
    sql = """
    WITH q AS (
        SELECT %s::vector AS emb
    )
    SELECT
        ri.id,
        ri.data,
        ri.embedding <=> q.emb AS distance
    FROM research_item AS ri
    JOIN research_item_type AS rit
        ON ri.research_item_type_id = rit.id
    JOIN q
        ON TRUE
    WHERE rit.type = 'publication'
      AND ri.embedding IS NOT NULL
    ORDER BY ri.embedding <=> q.emb
    LIMIT %s;
    """

    # `with conn` ends the transaction when the block exits (commit on success,
    # rollback on error) without closing the connection. Without it, every call
    # leaves the session idle in a transaction, and a failed query would make
    # all later calls fail until a manual rollback.
    with conn, conn.cursor() as cur:
        cur.execute(sql, (q_emb_str, top_k))
        rows = cur.fetchall()

    print(f"\n=== Query: {query!r} ===")

    for i, (doc_id, data_json, distance) in enumerate(rows, start=1):
        # Convert cosine distance to a similarity score.
        score = 1.0 - float(distance)

        # A NULL `data` column arrives as None; treat it as an empty item.
        text = clean_item(data_json or {})
        preview = text[:200] + ("..." if len(text) > 200 else "")

        print(f"\n[{i}] id={doc_id}  score={score:.3f}")
        print(f"     text: {preview}")


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Sample queries; each one is passed to search() and its top 10 hits are printed.
test_queries = [
    "altering training examples",
    "friendly training",
    "modify training data",
    "model allowed to simplify hard examples",
]

for q in test_queries:
    search(q, top_k=10)


=== Query: 'altering training examples' ===

[1] id=2286  score=0.825
     text: Friendly Training: Neural Networks Can Adapt Data to Make Learning Easier. In the last decade, motivated by the success of Deep Learning, the scientific community proposed several approaches to make t...

[2] id=2252  score=0.791
     text: Continual Learning with Pretrained Backbones by Tuning in the Input Space. The intrinsic difficulty in adapting deep learning models to non-stationary environments limits the applicability of neural n...

[3] id=1946  score=0.771
     text: Incorporating Rivalry in Reinforcement Learning for a Competitive Game.

[4] id=2509  score=0.767
     text: Learning sets with separating kernels. We consider the problem of learning a set from random samples. We show how relevant geometric and topological properties of a set can be studied analytically usi...

[5] id=2368  score=0.766
     text: Guest Editorial Special Section on Learning in Non-(geo)metric Spaces. Traditional mac