## Load Dataset

In [None]:
import json
import os

import pandas as pd
import psycopg2

# Connection settings are taken from the standard libpq environment variables
# when they are set, so real credentials never have to be written into the
# notebook. The fallbacks are the local development values used so far, which
# keeps the cell runnable unchanged on a dev machine.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "scientilla"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "pwd"),
    host=os.environ.get("PGHOST", "localhost"),
    port=int(os.environ.get("PGPORT", "5444")),
)

# rit.type is a Postgres enum, so the label must match exactly: a capitalised
# 'Publication' is rejected with "invalid input value for enum".
query = """
SELECT ri.data
FROM research_item AS ri
JOIN research_item_type AS rit
    ON ri.research_item_type_id = rit.id
WHERE rit.type = 'publication'
LIMIT 100;
"""

# Always release the connection, even when the query fails, so re-running the
# cell does not pile up open sessions on the server.
try:
    df = pd.read_sql(query, conn)
finally:
    conn.close()

# The query fetches up to 100 rows but only the first 50 are kept for the
# experiment. Later cells call .get() on each element, so the entries are
# presumably dicts decoded from a JSON column - verify against the schema.
data_array = df["data"].tolist()[:50]


  df = pd.read_sql(query, conn)


DatabaseError: Execution failed on sql '
SELECT ri.data
FROM research_item AS ri
JOIN research_item_type AS rit
    ON ri.research_item_type_id = rit.id
WHERE rit.type = 'Publication'
LIMIT 50;
': invalid input value for enum enum_research_item_type_type: "Publication"
LINE 6: WHERE rit.type = 'Publication'
                         ^


## Preprocessing

In [None]:
def clean_item(item):
    """Build one human-readable string from a publication record.

    The title, abstract and year are read with ``item.get`` and joined as
    ``"<title>. <abstract>. <year>"``. Fields that are missing, falsy or
    blank are skipped entirely, so a record without an abstract yields
    ``"Title. 2017"`` instead of ``"Title. . 2017"``.

    Args:
        item: Mapping-like record exposing ``get`` for the keys ``"title"``,
            ``"abstract"`` and ``"year"``.

    Returns:
        The joined text, or an empty string when no field has content.
    """
    parts = []
    for key in ("title", "abstract", "year"):
        value = item.get(key)
        if not value:
            # None, "" and 0 all count as "no value", as they always have.
            continue
        # str() lets a numeric year be joined like any other field.
        text = str(value).strip()
        if text:
            parts.append(text)
    return ". ".join(parts)

# One display string per loaded record, in the same order as data_array.
cleaned_strings = list(map(clean_item, data_array))

# Echo every string on its own line as a quick visual sanity check.
for cleaned_text in cleaned_strings:
    print(cleaned_text)

The “embreathment” illusion highlights the role of breathing in corporeal awareness. © 2020 American Physiological Society. All rights reserved.Recent theories posit that physiological signals contribute to corporeal awareness, the basic feeling that one has a body (body ownership) that acts according to one's will (body agency) and occupies a specific position (body location). Combining physiological recordings with immersive virtual reality, we found that an ecological mapping of real respiratory patterns onto a virtual body illusorily changes corporeal awareness. This new way of inducing a respiratory bodily illusion, called “embreathment,” revealed that breathing is almost as important as visual appearance for inducing body ownership and more important than any other cue for body agency. These effects were moderated by individual levels of interoception, as assessed through a standard heartbeat-counting task and a new “pneumoception” task. By showing that respiratory, visual, and s

## Dataset Embedding Generation

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

# Load the sentence-embedding model used to encode the documents.
model = SentenceTransformer("gsarti/scibert-nli")

# Each record is embedded as its full JSON serialisation (keys included).
# NOTE(review): the Preprocessing section builds cleaned_strings, but they are
# not used here - confirm that embedding raw JSON rather than the cleaned
# text is intended.
doc_texts = [json.dumps(d, ensure_ascii=False) for d in data_array]
# Use the DOI as identifier when present; otherwise fall back to a positional
# placeholder so every row still has an id.
doc_ids = [d.get("doi") or f"idx_{i}" for i, d in enumerate(data_array)]

# Embeddings are L2-normalised at encode time.
doc_embeddings = model.encode(
    doc_texts,
    convert_to_numpy=True,
    normalize_embeddings=True
)

# np.save does not create missing parent directories, so make sure the target
# folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs("embeddings", exist_ok=True)
np.save("embeddings/doc_embeddings.npy", doc_embeddings)

# The ids are stored next to the notebook, in the same row order as the
# embedding matrix.
with open("doc_ids.json", "w", encoding="utf-8") as f:
    json.dump(doc_ids, f, ensure_ascii=False, indent=2)

print("Saved doc_embeddings.npy and doc_ids.json")


  from .autonotebook import tqdm as notebook_tqdm
No sentence-transformers model found with name gsarti/scibert-nli. Creating a new one with mean pooling.


Saved doc_embeddings.npy and doc_ids.json


## Load Embeddings

In [None]:
import numpy as np
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Instantiate the encoder here as well, so queries can be embedded after a
# kernel restart without re-running the generation cell.
model = SentenceTransformer("gsarti/scibert-nli")

# Document embedding matrix previously written to disk.
doc_embeddings = np.load("embeddings/doc_embeddings.npy")

# Identifiers saved alongside the embeddings, one per row.
with open("doc_ids.json", "r", encoding="utf-8") as f:
    doc_ids = json.load(f)

# The two artifacts are stored separately, so a stale or partially rewritten
# file would silently attach the wrong id to a search hit. Fail loudly instead.
if len(doc_ids) != len(doc_embeddings):
    raise ValueError(
        f"doc_ids.json has {len(doc_ids)} entries but "
        f"embeddings/doc_embeddings.npy has {len(doc_embeddings)} rows; "
        "regenerate the embeddings."
    )


No sentence-transformers model found with name gsarti/scibert-nli. Creating a new one with mean pooling.


## Testing

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def search(query, top_k=5):
    """Print the documents most similar to ``query``.

    The query is encoded with the notebook-level ``model`` and scored against
    every row of ``doc_embeddings`` by cosine similarity. The best ``top_k``
    rows (fewer if there are not that many) are printed with their rank, the
    id from ``doc_ids``, the score, and up to 200 characters of the matching
    entry in ``cleaned_strings``. Nothing is returned.
    """
    query_vec = model.encode([query], convert_to_numpy=True, normalize_embeddings=True)
    scores = cosine_similarity(query_vec, doc_embeddings)[0]

    # Sorting the negated scores puts the highest similarity first.
    best_first = np.argsort(-scores)
    # Slicing clamps to the number of documents; max() makes a non-positive
    # top_k select nothing instead of slicing from the end.
    shown = best_first[:max(top_k, 0)]

    print(f"\n=== Query: {query!r} ===")
    for rank, idx in enumerate(shown, start=1):
        score, doc_id, text = scores[idx], doc_ids[idx], cleaned_strings[idx]
        # Mark the preview as truncated only when something was cut off.
        suffix = '...' if len(text) > 200 else ''

        print(f"\n[{rank}] id={doc_id}  score={score:.3f}")
        print(f"     text: {text[:200]}{suffix}")


In [None]:
# Smoke-test queries; add more strings here to probe other topics.
test_queries = ['heart']

# Run every query through the retriever and print its five best matches.
for query_text in test_queries:
    search(query_text, top_k=5)


=== Query: 'heart' ===

[1] id=10.1152/JN.00617.2019  score=0.215
     text: The “embreathment” illusion highlights the role of breathing in corporeal awareness. © 2020 American Physiological Society. All rights reserved.Recent theories posit that physiological signals contrib...

[2] id=idx_23  score=0.188
     text: Multi-task and transfer learning. . 2017

[3] id=idx_48  score=0.187
     text: Mathematical Methods for Data Analysis. . 2016

[4] id=idx_49  score=0.182
     text: Esseri Umani e Robot Umanoidi. . 2017

[5] id=idx_10  score=0.178
     text: NA. . 2017
