## Connect DB

In [9]:
import json
import os

import pandas as pd
import psycopg2

# Connection settings are read from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT) so real credentials do not
# have to be written into the notebook. The fallbacks are the local
# development values (localhost:5444).
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "scientilla"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "pwd"),
    host=os.environ.get("PGHOST", "localhost"),
    port=int(os.environ.get("PGPORT", "5444")),
)
# Notebook-wide cursor. Note that search() opens its own cursor per call.
cur = conn.cursor()


In [10]:
def clean_item(item):
    """Build the text representation of a research item from its title and abstract.

    Args:
        item: Mapping with optional "title" and "abstract" entries; either may
            be missing, None, or empty. None is treated as an empty mapping.

    Returns:
        "<title>. <abstract>" when both are present, "<title>." when only the
        title is present, the abstract alone when only the abstract is present,
        and an empty string when neither is present. Leading and trailing
        whitespace is stripped from the result.
    """
    item = item or {}
    title = item.get("title") or ""
    abstract = item.get("abstract") or ""

    if not title:
        # Nothing to put in front of the separator: return the abstract as-is
        # instead of a dangling ". abstract" (or a bare "." for an empty item).
        return f"{abstract}".strip()

    text = f"{title}. {abstract}" if abstract else f"{title}."
    return text.strip()

## Testing

#### From db

In [11]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

# Sentence-embedding model used by search() to encode the query text.
EMBEDDING_MODEL_NAME = "sentence-transformers/allenai-specter"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

def embedding_to_pgvector_str(vec):
    """Serialise a numpy vector as the text literal '[v1,v2,...]' for a ::vector cast."""
    # Go through Python floats so each component is rendered with float's repr.
    components = [str(float(value)) for value in vec.tolist()]
    return "[{}]".format(",".join(components))

def search(query, top_k=5):
    """Print the publications whose stored embeddings are closest to a text query.

    The query is embedded with the notebook-level `model`, compared against
    `research_item.embedding` with pgvector's `<=>` (cosine distance) operator,
    and the nearest rows of type 'publication' are printed with a similarity
    score and a preview of at most 200 characters.

    Args:
        query: Free-text search string.
        top_k: Maximum number of rows to fetch (bound to the SQL LIMIT).

    Returns:
        None. Results are written to stdout.

    Raises:
        psycopg2.Error: If the query fails. The transaction on the shared
            connection is rolled back before the error is re-raised.
    """
    # query embedding
    q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)[0]
    q_emb_str = embedding_to_pgvector_str(q_emb)

    # SQL query with pgvector cosine distance
    sql = """
    WITH q AS (
        SELECT %s::vector AS emb
    )
    SELECT
        ri.id,
        ri.data,
        ri.embedding <=> q.emb AS distance
    FROM research_item AS ri
    JOIN research_item_type AS rit
        ON ri.research_item_type_id = rit.id
    JOIN q
        ON TRUE
    WHERE rit.type = 'publication'
      AND ri.embedding IS NOT NULL
    ORDER BY ri.embedding <=> q.emb
    LIMIT %s;
    """

    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, (q_emb_str, top_k))
            rows = cursor.fetchall()
    except psycopg2.Error:
        # A failed statement leaves the shared connection in an aborted
        # transaction, and every later query on `conn` would fail until it is
        # rolled back. Skip the rollback if the connection itself is gone.
        if not conn.closed:
            conn.rollback()
        raise

    print(f"\n=== Query: {query!r} ===")

    for i, (doc_id, data_json, distance) in enumerate(rows, start=1):
        # Cosine distance -> cosine similarity.
        score = 1.0 - float(distance)

        # A NULL `data` column comes back as None; treat it as an empty item.
        text = clean_item(data_json or {})
        preview = text[:200] + ("..." if len(text) > 200 else "")

        print(f"\n[{i}] id={doc_id}  score={score:.3f}")
        print(f"     text: {preview}")


In [12]:
# Smoke-test the search with a handful of short topical queries.
test_queries = [
    "renewable energies",
    "heart",
    "cardiology",
    "nature",
    "biology",
]

for q in test_queries:
    search(q, top_k=5)


=== Query: 'renewable energies' ===

[1] id=2465  score=0.753
     text: Cheap and easily processable electrode/electrolytes for next-generation sodium-ion batteries. Electrochemical energy storage is of increasing importance to allow large-scale integration of intermitten...

[2] id=2596  score=0.745
     text: Charger-mediated energy transfer for quantum batteries: An open-system approach. The energy charging of a quantum battery is analyzed in an open quantum setting, where the interaction between the batt...

[3] id=2093  score=0.723
     text: A new solvothermal approach to obtain nanoparticles in the Cu3SnS4–Cu2FeSnS4 join. In the field of the renewables, a large effort has been devoted in the last years to obtain conventional and new mate...

[4] id=2454  score=0.719
     text: Bio-oil from pyrolysis of wood pellets using a microwave multimode oven and different microwave absorbers. Wood pellets were pyrolyzed using a microwave oven and different microwave power, apparatus s.