## Setup
Import and install the necessary libraries.

In [1]:
# Dependency setup for the notebook (shell commands run through the `!` escape).
!pip uninstall -qqy jupyterlab kfp  # Remove unused conflicting packages
# Install pinned versions of the Gemini SDK and ChromaDB.
!pip install -qU "google-genai==1.7.0" "chromadb==0.6.3"
# NOTE(review): pymupdf is upgraded to the latest release rather than pinned like the packages above.
!pip install --upgrade pymupdf

from google import genai
from google.genai import types

import json




[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m611.1/611.1 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m5.8 MB/s[0

In [2]:
# API keys
# The Gemini API key is read from Kaggle's secret store instead of being hard-coded in the notebook.
from kaggle_secrets import UserSecretsClient

GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
# Shared client used by the embedding and generation calls in the cells below.
client = genai.Client(api_key=GOOGLE_API_KEY)

In [3]:
# Retry policy. The model might make multiple consecutive calls automatically
# for a complex query; retrying keeps the client going when it hits quota limits.
from google.api_core import retry


def is_retriable(e):
    """Tell the retry wrapper whether the exception `e` should be retried.

    Only `genai.errors.APIError` instances whose `code` is 429 or 503 qualify.
    """
    if not isinstance(e, genai.errors.APIError):
        return False
    return e.code in {429, 503}


# Skip the wrapping when generate_content already exposes `__wrapped__`, so
# re-running this cell does not stack a second retry layer on top of the first.
if not hasattr(genai.models.Models.generate_content, '__wrapped__'):
    genai.models.Models.generate_content = retry.Retry(predicate=is_retriable)(
        genai.models.Models.generate_content
    )

## Upload of Arxiv papers
First, the arXiv metadata dataset is loaded. The title and abstract of each paper are then converted into vector embeddings, and these embeddings are stored in a ChromaDB vector database. The dataset import is shown below.

In [4]:

# Number of records to read from the start of the metadata snapshot.
amount_papers = 10000
papers = []
with open('/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json', 'r') as file:
    # The file holds one JSON document per line; pairing the lines with a
    # bounded range stops the read after `amount_papers` records.
    papers.extend(json.loads(raw_line) for _, raw_line in zip(range(amount_papers), file))

# `papers` is now a list of dictionaries, one per paper
print("Headers:", list(papers[0].keys()))

Headers: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed']


Only the title and the abstract of each paper will be embedded. The code below implements this preprocessing of the papers.

In [5]:
def remove_newlines(obj):
    """Replace every newline character in a string with a single space.

    Args:
        obj: Value to clean. Only `str` values are modified.

    Returns:
        The string with each newline replaced by a space. Any non-string
        value is returned unchanged; previously such values fell through the
        `if` and the function implicitly returned None, discarding the input.
    """
    if isinstance(obj, str):
        return obj.replace('\n', ' ')
    return obj
        
# One text per paper: the title and the abstract, each with newlines flattened.
preprocessed_papers = [
    "PAPER TITLE: " + remove_newlines(entry["title"])
    + "\nPAPER CONTENT: " + remove_newlines(entry["abstract"])
    for entry in papers
]
print("SUCCESSFULLY PREPROCESSED " + str(len(preprocessed_papers)) + " PAPERS")
print("--- EXAMPLE OF PREPROCESSED PAPER ---")
print(preprocessed_papers[0])

SUCCESSFULLY PREPROCESSED 10000 PAPERS
--- EXAMPLE OF PREPROCESSED PAPER ---
PAPER TITLE: Calculation of prompt diphoton production cross sections at Tevatron and   LHC energies
PAPER CONTENT:   A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Hi

Now the preprocessed papers are transformed into vector embeddings.

In [6]:
def batch(iterable, n=100):
    """Yield consecutive slices of `iterable`, each holding at most `n` items.

    `iterable` must support `len()` and slicing; the last slice may be shorter.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = start + n
        yield iterable[start:stop]

# Embed every preprocessed paper, sending the texts to the API in groups.
papers_embedded = []
papers_batches = list(batch(preprocessed_papers, 100)) #limit of 100 embeddings per call

# The loop variable is deliberately NOT called `batch`: reusing that name
# rebinds the `batch` helper to a list, so any later call to it (for example
# when this cell is re-run) fails with "'list' object is not callable".
for papers_batch in papers_batches:
    batch_embedded = client.models.embed_content(
        model='models/text-embedding-004',
        contents=papers_batch,
        config=types.EmbedContentConfig(task_type='SEMANTIC_SIMILARITY'))
    # Keep only the raw vector of each returned embedding.
    papers_embedded.extend(e.values for e in batch_embedded.embeddings)

print("SUCCESSFULLY EMBEDDED "+ str(len(papers_embedded)) + " PAPERS")

SUCCESSFULLY EMBEDDED 10000 PAPERS


Once the vector embeddings of the papers are computed, these are stored into the chromadb database.

In [7]:
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
def batch(iterable, batch_size):
    """Split `iterable` into consecutive slices of at most `batch_size` items.

    Redefines the `batch` helper from the embedding cell, here without a
    default size. `iterable` must support `len()` and slicing.
    """
    offsets = range(0, len(iterable), batch_size)
    yield from (iterable[offset:offset + batch_size] for offset in offsets)


# Start ChromaDB client
chromadb_client = chromadb.Client()

# Create or get a collection
collection = chromadb_client.get_or_create_collection(name="papers")

# Number of records sent per `collection.add` call. Defined once so that the
# batching and the id offsets below cannot drift apart (the value used to be
# repeated in three places).
UPLOAD_BATCH_SIZE = 41000

# Documents and embeddings are paired by position, so both lists must match.
assert len(papers_embedded) == len(preprocessed_papers), \
    "every preprocessed paper needs exactly one embedding"

# Add the documents + embeddings to Chroma. Ids have the form "doc_<index>",
# where <index> is the paper's position in `preprocessed_papers`.
emb_batches = list(batch(papers_embedded, UPLOAD_BATCH_SIZE))
papers_batches = list(batch(preprocessed_papers, UPLOAD_BATCH_SIZE))
for batch_index, (docs_batch, vectors_batch) in enumerate(zip(papers_batches, emb_batches)):
    first_index = batch_index * UPLOAD_BATCH_SIZE
    ids_batch = [f"doc_{first_index + j}" for j in range(len(vectors_batch))]
    collection.add(
        documents=docs_batch,
        embeddings=vectors_batch,
        ids=ids_batch,
    )
print("SUCCESSFULLY UPLOADED "+ str(len(papers_embedded)) + " PAPERS")

SUCCESSFULLY UPLOADED 10000 PAPERS


## Vector database search example

Now an example paper is used to search for similar papers in the database. If the same paper is obtained, the queried paper was in the database.

In [8]:
# Query text: the opening of an abstract, cut off mid-sentence, used to check that the matching paper is found.
query_input = "Statistical modeling of experimental physical laws is based on the probability density function of measured variables. It is expressed by experimental data via a kernel estimator. The kernel is determined objectively by the scattering of data during calibration of experimental setup. A physical law, which relates measured variables, is optimally extracted from experimental data by the conditional average estimator. It is derived directly from the kernel estimator and corresponds to a general nonparametric regression. T"
# Alternative query: the text of a user PDF.
# NOTE(review): `pdf_text` is only assigned in a later cell, so it is not available here on a fresh run.
#query_input = pdf_text

# Embed the query with the same model and task type used for the papers.
query_embedding = client.models.embed_content(
        model='models/text-embedding-004',
        contents=query_input,
        config=types.EmbedContentConfig(task_type='SEMANTIC_SIMILARITY'))

In [9]:
# Nearest-neighbour lookup for the single query vector.
results = collection.query(
    query_embeddings=[query_embedding.embeddings[0].values],
    n_results=5  # Number of similar docs to return
)

# Results are nested per query; index 0 selects the hits of the only query sent.
for doc_id, doc in zip(results["ids"][0], results["documents"][0]):
    print(f"ID: {doc_id}", f"{doc}\n", sep="\n")

ID: doc_88
PAPER TITLE: A general approach to statistical modeling of physical laws:   nonparametric regression
PAPER CONTENT:   Statistical modeling of experimental physical laws is based on the probability density function of measured variables. It is expressed by experimental data via a kernel estimator. The kernel is determined objectively by the scattering of data during calibration of experimental setup. A physical law, which relates measured variables, is optimally extracted from experimental data by the conditional average estimator. It is derived directly from the kernel estimator and corresponds to a general nonparametric regression. The proposed method is demonstrated by the modeling of a return map of noisy chaotic data. In this example, the nonparametric regression is used to predict a future value of chaotic time series from the present one. The mean predictor error is used in the definition of predictor quality, while the redundancy is expressed by the mean square distan

## RETRIEVAL AUGMENTED GENERATION (RAG)
For retrieval augmented generation, the user's question and the input document are used together to search for relevant papers. An answer is then generated from the retrieved papers and the user input. The steps are as follows:
1) Use an LLM to condense the user's question and the input document into a search query, and embed that query for vector search.
2) Obtain the original documents from the vector search in the database.
3) Use the input question, the input document and the original documents from the database to generate a response with an LLM.
4) Show the answer to the user.

### Orchestration functions
The following functions orchestrate the RAG:

- create_embedding(text): For a given text, generates the corresponding vector embedding.
- search_embedded_documents(query_embedding, n): For a given vector, searches for the n nearest vectors in the vector embeddings database.
- retrieve_documents(doc_ids): For a given list of document ids, returns the extended information of each paper.

In [10]:
from google.genai import types

# === Tools ===
def create_embedding(text)-> list:
    """Embed `text` with the 'models/text-embedding-004' model.

    Prints a trace line containing the first 20 characters of `text`, then
    returns the values of the first embedding in the API response.
    """
    preview = text[:20]
    print(f' - CALL: create_embedding({preview})')
    embed_config = types.EmbedContentConfig(task_type='SEMANTIC_SIMILARITY')
    response = client.models.embed_content(
        model='models/text-embedding-004',
        contents=text,
        config=embed_config,
    )
    first_embedding = response.embeddings[0]
    return first_embedding.values

def search_embedded_documents(query_embedding:list[float], n:int)->dict:
    """Find the `n` stored papers closest to `query_embedding`.

    Args:
        query_embedding: Query vector, as returned by `create_embedding`.
        n: Number of nearest documents to request.

    Returns:
        The result of `collection.query`, unchanged. Callers in this notebook
        index it by key (`results["ids"][0]`, `results["documents"][0]`), so
        it is a mapping and not the `list[str]` the annotation used to claim.
    """
    print(f' - CALL: search_embedded_documents(n = {n})')
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n
    )
    return results

def retrieve_documents(doc_ids:list[int])-> list[dict]:
    """Look up the full metadata record of each requested paper.

    Each id is used as a position in the module-level `papers` list; the
    records are returned in the order the ids were given.
    """
    print(f' - CALL: retrieve_documents(IDS = {doc_ids})')
    return [papers[doc_id] for doc_id in doc_ids]

In [11]:
import pymupdf

def extract_text_from_pdf(path):
    """Return the text of every page of the PDF at `path`, concatenated in page order."""
    with pymupdf.open(path) as doc:
        page_texts = [page.get_text() for page in doc]
    return "".join(page_texts)


# Text of the example paper; the next cell uses it as the user's input document.
pdf_text = extract_text_from_pdf("/kaggle/input/unc-paper/2409.10655v2.pdf")

In [12]:

# Stand-alone trial of the query-building step: the model is asked to turn the
# user message and the document into a JSON search request.
# NOTE(review): `user_document` holds only the first 1000 characters, but the request below sends the full `pdf_text` — confirm which was intended.
user_document = pdf_text[:1000]
user_message = "Find me related papers."

# System prompt describing the JSON object the model must return.
instruction = """
You are a helpful chatbot that processes inputs from users and generates an output JSON for vector search. 

Given a user message and a document, return:
{
  "embedding_query": "<summarized embedding query based on user message and document>",
  "num_documents": <integer number of documents to retrieve>
}

Ensure the output is a valid JSON object. 'embedding_query' should be a concise string that captures the main topic or keywords for semantic search. 'num_documents' should be inferred from the user message, defaulting to 5 if unspecified.
"""

# A single user turn made of two parts: the question and the document text.
contents = [
    types.Content(
        role="user", parts=[types.Part(text=user_message),types.Part(text=pdf_text)]
    )
]

# NOTE(review): `response_init` is not read anywhere else in the visible notebook.
response_init = client.models.generate_content(
    model="gemini-2.0-flash", 
    config=types.GenerateContentConfig(
        system_instruction=instruction,
        #tools=[orchestration_tools]
    ),
    contents = contents
)


In [13]:
import pymupdf
import re

class RAG_Scientific_chatbot:
    """Retrieval-augmented chatbot over the papers stored in `collection`.

    The pipeline run by `chat` is:
      1. ask the LLM to turn the question and the PDF into a search request,
      2. embed the search query,
      3. fetch the nearest papers from the vector store,
      4. look up their full metadata in `papers`,
      5. ask the LLM to answer using the retrieved material.

    Relies on the module-level `client`, `collection` and `papers` objects.
    """

    # Number of papers retrieved when the model's JSON gives no usable
    # "num_documents" (the prompt itself names 5 as the default).
    DEFAULT_NUM_DOCUMENTS = 5

    def chat(self, question:str, document_path:str):
        """Answer `question` about the PDF at `document_path` using retrieved papers.

        Args:
            question: The user's request in natural language.
            document_path: Path of the PDF the question refers to.

        Returns:
            The text of the model's final answer.

        Raises:
            ValueError: If the model's search request cannot be parsed.
        """
        processed_input, user_document = self._process_input(question, document_path)

        embedding = self._create_embedding(processed_input["embedding_query"])

        num_documents = self._coerce_num_documents(processed_input.get("num_documents"))
        search_output = self._search_embedded_documents(embedding, num_documents)

        # Stored ids have the form "doc_<index>"; <index> is the position in `papers`.
        numeric_ids = [int(doc_id.split('_')[1]) for doc_id in search_output['ids'][0]]
        extended_info = self._retrieve_documents(numeric_ids)

        return self._generate_final_answer(question, user_document, search_output, extended_info)


    # === Tools ===
    def _process_input(self, user_message: str, document_path:str):
        """Ask the LLM for a search request built from the message and the PDF.

        Args:
            user_message: The user's request.
            document_path: Path of the PDF whose text accompanies the request.

        Returns:
            A `(search_request, pdf_text)` tuple. `search_request` is the dict
            parsed from the model's JSON reply (it contains "embedding_query"
            and normally "num_documents"); `pdf_text` is the extracted PDF text.

        Raises:
            ValueError: If the reply holds no JSON object, the JSON is invalid,
                or "embedding_query" is missing.
        """
        instruction = """
        You are a helpful chatbot that processes inputs from users and generates an output JSON for vector search. 
        
        Given a user message and a document, return this string output:
        
        {"embedding_query": "<summarized embedding query based on user message and document>",
         "num_documents": "<integer number of documents to retrieve>"}
        
        'embedding_query' should be a concise string that captures the main topic or keywords for semantic search. 'num_documents' should be inferred from the user message, defaulting to 5 if unspecified.
        """

        pdf_text = self._extract_text_from_pdf(document_path)

        contents = [
            types.Content(
                role="user", parts=[types.Part(text=user_message),types.Part(text=pdf_text)]
            )
        ]

        response = client.models.generate_content(
            model="gemini-2.0-flash",
            config=types.GenerateContentConfig(
                system_instruction=instruction,
            ),
            contents = contents
        )

        return self._parse_search_request(response.text), pdf_text


    @staticmethod
    def _parse_search_request(raw_text) -> dict:
        """Extract the JSON search request from the model's reply text.

        The reply may wrap the JSON in extra text (for example a Markdown code
        fence), so the outermost `{...}` span is located first and then parsed.

        Raises:
            ValueError: If no JSON object is present, the JSON is invalid
                (`json.JSONDecodeError` is a `ValueError`), or the object has
                no "embedding_query" key. Previously a reply without JSON was
                returned as-is and only failed later with an unrelated error.
        """
        match = re.search(r'\{.*\}', raw_text or "", re.DOTALL)
        if match is None:
            raise ValueError(f"The model reply contains no JSON object: {raw_text!r}")

        search_request = json.loads(match.group(0))
        if "embedding_query" not in search_request:
            raise ValueError(f"The model reply has no 'embedding_query': {search_request!r}")
        return search_request


    @classmethod
    def _coerce_num_documents(cls, raw_value) -> int:
        """Turn the model's "num_documents" value into a positive int.

        Falls back to `DEFAULT_NUM_DOCUMENTS` when the value is missing, is not
        a number, or is not positive.
        """
        try:
            num_documents = int(raw_value)
        except (TypeError, ValueError):
            return cls.DEFAULT_NUM_DOCUMENTS
        if num_documents < 1:
            return cls.DEFAULT_NUM_DOCUMENTS
        return num_documents


    def _extract_text_from_pdf(self, path):
        """Return the text of every page of the PDF at `path`, in page order."""
        with pymupdf.open(path) as doc:
            return "".join(page.get_text() for page in doc)


    def _create_embedding(self, text)-> list:
        """Embed `text` and return the values of the first embedding in the response."""
        print(f' - CALL: create_embedding({text[:20]}...)')
        vector_embedding = client.models.embed_content(
            model='models/text-embedding-004',
            contents=text,
            config=types.EmbedContentConfig(task_type='SEMANTIC_SIMILARITY')
        )
        return vector_embedding.embeddings[0].values

    def _search_embedded_documents(self, query_embedding:list[float], n:int)->dict:
        """Return the `collection.query` result for the `n` nearest stored papers.

        The result is a mapping; `chat` reads the matched ids from
        `result['ids'][0]`.
        """
        print(f' - CALL: search_embedded_documents(n = {n})')
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=n
        )
        return results

    def _retrieve_documents(self, doc_ids:list[int])-> list[dict]:
        """Return the `papers` records at the given positions, in the given order."""
        print(f' - CALL: retrieve_documents(IDS = {doc_ids})')
        return [papers[doc_id] for doc_id in doc_ids]

    def _generate_final_answer(self,question: str, user_document: str, search_output: dict, extended_info: list):
        """Ask the LLM to answer `question` from the retrieved material.

        Args:
            question: The user's request.
            user_document: Text of the user's input document.
            search_output: Result of the vector search.
            extended_info: Full metadata records of the retrieved papers.

        Returns:
            The text of the model's answer.
        """
        instruction = """You are a helpful chatbot that uses retrieval augmented generation to answer questions regarding scientific papers. The user provided a QUESTION and an INPUT_DOCUMENT. 
                The information retreived from the database is composed of: 
                - EMBED_DATA: Shows the documents obtained after a vector search from the embedding of QUESTION and INPUT_DOCUMENT and the database.
                - EXTENDED_PAPER_INFO: Shows more information of the papers from EMBED_DATA.

                Answer the user QUESTION using the EMBED_DATA and EXTENDED_PAPER_INFO. IF the EMBED_DATA has the paper from INPUT_DOCUMENT, skip the paper in the answer.
                Show always the author names and the publishing date of the papers.
                """

        # The prompt uses the `question` argument. It used to interpolate a
        # module-level `user_message`, so the answer depended on whatever that
        # global happened to hold instead of on the question passed to `chat`.
        prompt =f"""
                QUESTION:{question}
                INPUT_DOCUMENT:{user_document}
                EMBED_DATA: {search_output}
                EXTENDED_PAPER_INFO: {extended_info}
            
        
        """

        contents = [types.Content(role="user", parts=[types.Part(text=prompt)])]
        response_final = client.models.generate_content(
            model="gemini-2.0-flash",
            config=types.GenerateContentConfig(
                system_instruction=instruction
            ),
            contents = contents
        )
        return response_final.text

In [14]:
from IPython.display import display, Markdown, Latex


# Example PDF and request used for the end-to-end demo.
document = "/kaggle/input/unc-paper/2409.10655v2.pdf"
user_message = "Find me related papers."

# Run the chatbot on the request and render the returned answer as Markdown.
chatbot = RAG_Scientific_chatbot()
answer = chatbot.chat(user_message, document)
display(Markdown(answer))

 - CALL: create_embedding(Safe social navigati...)
 - CALL: search_embedded_documents(n = 5)
 - CALL: retrieve_documents(IDS = [4604, 3375, 4438, 1554, 1273])


Here are some related papers:

*   **Risk Assessment Algorithms Based On Recursive Neural Networks** by Alejandro Chinea Manrique De Lara and Michel Parent (2007). This paper introduces a novel approach to compute risk functions by using a combination of a highly non-linear processing model in conjunction with a powerful information encoding procedure.
*   **Flow of autonomous traffic on a single multi-lane street** by Federico Polito and Fergal Dalton (2007). This paper investigates the behavior of an original traffic model that considers a single multi-lane street populated by autonomous vehicles.
*   **Mixing navigation on networks** by Tao Zhou (2007). This paper proposes a mixing navigation mechanism that interpolates between random-walk and shortest-path protocol to enhance navigation efficiency.
*   **An information-based traffic control in a public conveyance system: reduced clustering and enhanced efficiency** by A. Tomoeda, K. Nishinari, D. Chowdhury and A. Schadschneider (2007). This paper proposes a new public conveyance model using stochastic cellular automaton to find the optimal density of vehicles and reduce clustering.
*   **Parametric Learning and Monte Carlo Optimization** by David H. Wolpert and Dev G. Rajnarayan (2007). This paper uncovers the close relationship between Monte Carlo Optimization, Parametric machine-Learning, and blackbox optimization.


## Appendix: Partial code for an AI Agent 

This code is provided as a starting point for future improvement; the agent-based version below did not work and is left unfinished.

In [15]:
# === Tool declarations ===
# Function declaration describing `create_embedding` to the model: a single
# required string parameter, `text`.
create_embedding_tool = dict(
    name="create_embedding",
    description="For a given text, generate the corresponding vector embedding.",
    parameters=dict(
        type="OBJECT",
        properties=dict(
            text=dict(
                type="STRING",
                description="The input text to embed.",
            ),
        ),
        required=["text"],
    ),
)

# Function declaration describing `search_embedded_documents` to the model:
# a query vector plus the number of neighbours to return, both required.
search_embedded_documents_tool = dict(
    name="search_embedded_documents",
    description="Search for similar documents using a query embedding.",
    parameters=dict(
        type="OBJECT",
        properties=dict(
            query_embedding=dict(
                type="ARRAY",
                # Or "INTEGER" if your vectors are ints (they are normally floats)
                items=dict(type="NUMBER"),
                description="The vector embedding of the input query.",
            ),
            n=dict(
                type="INTEGER",
                description="Number of top similar documents to retrieve.",
            ),
        ),
        required=["query_embedding", "n"],
    ),
)

# Function declaration describing `retrieve_documents` to the model.
# The parameter mirrors the Python function's signature (`doc_ids: list[int]`).
# The declaration used to expose a single integer `doc_id`, which the function
# does not accept, so forwarding the model's arguments to it would have failed.
retrieve_documents_tool = {
    "name" : "retrieve_documents",
    "description" : "Retrieve detailed information about documents using their IDs.",
    "parameters" : {
        "type": "OBJECT",
        "properties": {
            "doc_ids": {
                "type": "ARRAY",
                "items": {
                    "type": "INTEGER"
                },
                "description": "The IDs of the papers/documents to retrieve."
            }
        },
        "required": ["doc_ids"]
    }
}



In [16]:
# Tools offered to the model in the agent prototype.
# NOTE(review): `retrieve_documents_tool` is declared in the previous cell but is not registered here — confirm whether that is intended.

orchestration_tools  = types.Tool(function_declarations=[create_embedding_tool, search_embedded_documents_tool])


# System prompt for the agent prototype.
# NOTE(review): this rebinds the module-level `instruction` assigned in an earlier cell, and the second sentence of the prompt breaks off after "andgenerate".
instruction = """You are a helpful chatbot that can interact with a database of vector embeddings
of scientific papers and a database with the papers extended information. You will take the users questions and documents andgenerate

Use the following tools:
    - create_embedding(text) to convert text into vector embeddings 
    - search_embedded_documents(query_embedding, n) to obtain n papers that are similar to the embedded query 
from the database of vector embeddings.

"""

In [17]:
#tool_call = response.candidates[0].content.parts[0].function_call

#if tool_call.name == "create_embedding":
#    result = create_embedding(**tool_call.args)

#function_response_part = types.Part.from_function_response(
#    name=tool_call.name,
#    response={"result": result},
#)

#contents.append(types.Content(role="model", parts=[types.Part(function_call=tool_call)])) # Append the model's function call message
#contents.append(types.Content(role="user", parts=[function_response_part])) # Append the function response
#response = client.models.generate_content(
#    model="gemini-2.0-flash", 
#    config=types.GenerateContentConfig(
#        system_instruction=instruction,
#        tools=[orchestration_tools]
#    ),
#    contents = contents
#)
#print(response)