In [21]:
from openai import OpenAI
client = OpenAI()

# Pick a sample claim from HF
from datasets import load_dataset
sample = next(iter(load_dataset("mhurhangee/ep-b1-claims-with-cpc", split="train", streaming=True)))
claim_text = sample["first_claim"]

# Embed the claim
emb = client.embeddings.create(
    model="text-embedding-3-small",
    input=claim_text
).data[0].embedding 

len(emb)



1536

In [22]:

from utils.env_loader import load_project_env
import os
from neo4j import GraphDatabase

load_project_env()

URI = "neo4j://localhost:7687"
AUTH = ("neo4j", os.getenv("NEO4J_PASSWORD"))
SCHEME_ID = "CPC"

driver = GraphDatabase.driver(URI, auth=AUTH)

In [24]:
K = 10
with driver.session() as s:
    results = s.run("""
    CALL gds.knn.stream(
      'CPC',
      {
        nodeProperties: ['fullTitleEmbedding'],
        topK: $k,
        queryVector: $embedding,
        similarityFunction: 'COSINE'
      }
    )
    YIELD node1, score
    RETURN gds.util.asNode(node1).conceptId AS conceptId,
           gds.util.asNode(node1).title AS title,
           score
    ORDER BY score DESC
    """, embedding=emb, k=K)
    hits = results.data()

for h in hits:
    print(f"{h['conceptId']}: {h['title']} (score={h['score']:.4f})")


CypherSyntaxError: {code: Neo.ClientError.Statement.SyntaxError} {message: Unknown procedure output: `score` (line 11, column 18 (offset: 214))
"    YIELD node1, score"
                  ^}

In [26]:
from datasets import load_dataset
from openai import OpenAI

client = OpenAI()

# 1) Grab a sample claim
sample = next(iter(load_dataset("mhurhangee/ep-b1-claims-with-cpc", split="train", streaming=True)))
claim_text = sample["first_claim"]
print(claim_text)
truth = set(sample["cpc"] or [])

# 2) Embed the claim
qvec = client.embeddings.create(
    model="text-embedding-3-small",
    input=claim_text
).data[0].embedding

# 3) Cosine against ALL CPC nodes using GDS function (works in Community)
K = 15
with driver.session() as s:
    rows = s.run("""
    MATCH (c:Concept)
    WHERE c.fullTitleEmbedding IS NOT NULL
    WITH c, gds.similarity.cosine(c.fullTitleEmbedding, $qvec) AS score
    ORDER BY score DESC
    RETURN c.conceptId AS conceptId, c.title AS title, score
    LIMIT $k
    """, qvec=qvec, k=K).data()

print("Ground truth CPCs:", sorted(truth))
print("\nTop matches:")
for r in rows:
    hit = "✓" if r["conceptId"] in truth else "X"
    print(f"{hit} {r['conceptId']}: (score={r['score']:.4f}) {r['title']}  ")


An article, comprising: a substrate; and a functional polymeric phase change material (FP-PCM) chemically bound to the substrate, the article having at least one phase change temperature in the range between - 10° C and 100° C and a phase change enthalpy of at least 2.0 Joules per gram; and wherein the FP-PCM is blended with a binder containing a set of microcapsules that are dispersed in the binder, wherein the binder serves as a matrix within which the FP-PCM and the microcapsules are dispersed, the microcapsules comprising an additional PCM encapsulated within the microcapsules, wherein the FP-PCM comprises at least one crystallizable section, a backbone chain, and side chains, wherein the side chains form the crystallizable section, wherein the FP-PCM carries at least one reactive function, the reactive function being placed on any part of the FP-PCM, especially on a side chain, along the backbone chain or on at least one of the ends of the backbone chain or side chain, and wherein

In [44]:
from datasets import load_dataset
from openai import OpenAI
from neo4j import GraphDatabase
import random


from utils.env_loader import load_project_env
import os
from neo4j import GraphDatabase

load_project_env()

URI = "neo4j://localhost:7687"
AUTH = ("neo4j", os.getenv("NEO4J_PASSWORD"))
SCHEME_ID = "CPC"

driver = GraphDatabase.driver(URI, auth=AUTH)

client = OpenAI()

# 1) Grab a sample claim
# Load streaming dataset
ds = load_dataset("mhurhangee/ep-b1-claims-with-cpc", split="train", streaming=True)

# Pick a random offset (you can change max range for speed)
max_offset = 50000  # adjust depending on dataset size
skip_n = random.randint(0, max_offset)

# Advance the iterator skip_n steps
it = iter(ds)
for _ in range(skip_n):
    next(it)

# Grab the sample
sample = next(it)
claim_text = sample["first_claim"]
truth = set(sample["cpc"] or [])

# 2) Embed the claim
qvec = client.embeddings.create(
    model="text-embedding-3-small",
    input=claim_text
).data[0].embedding

# 3) Vector index KNN query
K = 10
with driver.session() as s:
    rows = s.run("""
    CALL db.index.vector.queryNodes('concept_fulltitle_embedding_idx', $k, $qvec)
    YIELD node, score
    RETURN node.conceptId AS conceptId, node.title AS title, score
    ORDER BY score DESC
    LIMIT $k
    """, k=K, qvec=qvec).data()


print("Claim:", claim_text)
print("Ground truth CPCs:", sorted(truth))
print("\nTop matches:")
for r in rows:
    replaced = r["conceptId"].replace("-", "/")
    hit = "✓" if replaced in truth else "X"
    print(f"{hit} {replaced}: (score={r['score']:.4f}) {r['title']}")


Claim: A computer-implemented method of processing image analysis data derived from an image of a biological sample stained for presence of at least one lymphocyte biomarker, the method comprising: (a) detecting lymphocytes at least in part by segmenting lymphocyte nuclei from other objects depicted in the image of the biological sample; (b) computing a foreground segmentation mask based on the lymphocytes detected within the image; (c) identifying outlines of the detected lymphocytes in the image by filtering the image with the computed foreground segmentation mask; (d) deriving a shape metric for each detected lymphocyte based on the identified lymphocyte outlines; characterized in that the method further comprises: (e) associating at least the derived shape metric with corresponding location information for each detected lymphocyte; and (f) determining, based on the derived shape metric and the corresponding location information, that a particular lymphocyte of the detected lymphocy