## Setup

In [1]:
!pip uninstall -qqy jupyterlab kfp  # Remove unused conflicting packages
!pip install -qU "google-genai==1.7.0" "chromadb==0.6.3"

import json

from google import genai
from google.genai import types

from IPython.display import Markdown





[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m4.1 MB/s[0

In [2]:
# API keys
from kaggle_secrets import UserSecretsClient

GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
client = genai.Client(api_key=GOOGLE_API_KEY)

for m in client.models.list():
    if "embedContent" in m.supported_actions:
        print(m.name)

models/embedding-001
models/text-embedding-004
models/gemini-embedding-exp-03-07
models/gemini-embedding-exp


We import the arxiv data and then perform vector embedding of all the documents

In [3]:

index=0
papers = []
with open('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json', 'r') as file:
    for line in file:
        index += 1
        if index < 1000:
            papers.append(json.loads(line))

# Now data is a list of dictionaries
print("Headers:", list(papers[0].keys()))

Headers: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed']


In [4]:
def remove_newlines(obj):
    if isinstance(obj, str):
        return obj.replace('\n', ' ')
        
preprocessed_papers = []
for paper in papers:
    preprocessed_papers.append("PAPER TITLE: " + remove_newlines(paper["title"]) + "\nPAPER CONTENT: "+ remove_newlines(paper["abstract"]))
print(preprocessed_papers[0])

PAPER TITLE: Calculation of prompt diphoton production cross sections at Tevatron and   LHC energies
PAPER CONTENT:   A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, s

In [5]:
test = preprocessed_papers[:99]
papers_embedded = client.models.embed_content(
    model='models/text-embedding-004',
    contents=test,
    config=types.EmbedContentConfig(task_type='SEMANTIC_SIMILARITY'))

In [6]:
def batch(iterable, n=100):
    for i in range(0, len(iterable), n):
        yield iterable[i:i + n]

# Example usage:
papers_embedded = []  # Your list of inputs to embed
papers_batches = list(batch(preprocessed_papers, 100))

for batch in papers_batches:
    batch_embedded = client.models.embed_content(
        model='models/text-embedding-004',
        contents=batch,
        config=types.EmbedContentConfig(task_type='SEMANTIC_SIMILARITY'))
    list_batch_embedded = [e.values for e in batch_embedded.embeddings]
    papers_embedded+=list_batch_embedded

In [7]:
print(len(papers_embedded))

999


In [8]:
import chromadb
from chromadb.utils import embedding_functions

# Start ChromaDB client
chromadb_client = chromadb.Client()

# Create or get a collection
collection = chromadb_client.get_or_create_collection(name="papers")

# Add the documents + embeddings to Chroma
collection.add(
    documents=preprocessed_papers,
    embeddings=papers_embedded,
    ids=[f"doc_{i}" for i in range(len(preprocessed_papers))],
)

In [9]:
query_input = "AI in medicine is advancing rapidly."

query_embedding = client.models.embed_content(
    model='models/text-embedding-004',
    contents=query_input,
    config=types.EmbedContentConfig(task_type='semantic_similarity')
)

In [10]:
print(query_embedding.embeddings[0].values)

[-0.0072084945, 0.032835763, -0.0076321694, 0.0441256, 0.014222593, 0.080331065, -0.009582624, 0.0478988, -0.015623617, -0.020880362, -0.018681057, 0.116436474, -0.020217746, 0.07474686, 0.059561577, 0.0021396584, 0.054528337, 0.025532307, -0.07986759, 0.010551534, -0.024767349, -0.029090183, 0.0069016227, -0.010615683, -0.037870336, -0.011856086, 0.013498534, -0.03406914, 0.043679215, -0.01100847, 0.037547767, 0.019760754, 0.027061056, -0.07858725, 0.042850535, 0.012470506, -0.005259861, -0.05559996, 0.013575055, 0.001238789, -0.012703826, -0.03596105, -0.04938766, 0.012840443, -0.024864934, -0.020305406, -0.04950192, -0.0068233674, -0.0030030597, 0.00021488687, 0.009077017, -0.061689775, 0.020063864, -0.0407702, -0.010600936, -0.105555914, -0.026397409, -0.039371137, 0.106510654, -0.033120945, -0.054003056, -0.01175733, 0.03745866, -0.028885433, 0.08967864, -0.04809001, -0.052880947, 0.012537232, -0.04276611, -0.0030630305, -0.06401076, -0.037834175, -0.067914024, 0.041173358, 0.0300

In [11]:
results = collection.query(
    query_embeddings=[query_embedding.embeddings[0].values],
    n_results=5  # Number of similar docs to return
)

for doc, doc_id in zip(results["documents"][0], results["ids"][0]):
    print(f"ID: {doc_id}")
    print(f"Document: {doc}\n")

ID: doc_984
Document: PAPER TITLE: Architecture for Pseudo Acausal Evolvable Embedded Systems
PAPER CONTENT:   Advances in semiconductor technology are contributing to the increasing complexity in the design of embedded systems. Architectures with novel techniques such as evolvable nature and autonomous behavior have engrossed lot of attention. This paper demonstrates conceptually evolvable embedded systems can be characterized basing on acausal nature. It is noted that in acausal systems, future input needs to be known, here we make a mechanism such that the system predicts the future inputs and exhibits pseudo acausal nature. An embedded system that uses theoretical framework of acausality is proposed. Our method aims at a novel architecture that features the hardware evolability and autonomous behavior alongside pseudo acausality. Various aspects of this architecture are discussed in detail along with the limitations. 

ID: doc_303
Document: PAPER TITLE: The World as Evolving Inform