In [28]:
%%capture

%pip install -U arxiv dotenv anthropic google

In [29]:
import arxiv
import json
import os
from typing import List
from dotenv import load_dotenv

In [30]:
PAPER_DIR = "papers"

In [31]:
def search_papers(topic: str, max_results: int = 5) -> List[str]:
    """
    Search for papers on arXiv based on a topic and store their information.
    
    Args:
        topic: The topic to search for
        max_results: Maximum number of results to retrieve (default: 5)
        
    Returns:
        List of paper IDs found in the search
    """
    
    # Use arxiv to find the papers 
    client = arxiv.Client()

    # Search for the most relevant articles matching the queried topic
    search = arxiv.Search(
        query = topic,
        max_results = max_results,
        sort_by = arxiv.SortCriterion.Relevance
    )

    papers = client.results(search)
    
    # Create directory for this topic
    path = os.path.join(PAPER_DIR, topic.lower().replace(" ", "_"))
    os.makedirs(path, exist_ok=True)
    
    file_path = os.path.join(path, "papers_info.json")

    # Try to load existing papers info
    try:
        with open(file_path, "r") as json_file:
            papers_info = json.load(json_file)
    except (FileNotFoundError, json.JSONDecodeError):
        papers_info = {}

    # Process each paper and add to papers_info  
    paper_ids = []
    for paper in papers:
        paper_ids.append(paper.get_short_id())
        paper_info = {
            'title': paper.title,
            'authors': [author.name for author in paper.authors],
            'summary': paper.summary,
            'pdf_url': paper.pdf_url,
            'published': str(paper.published.date())
        }
        papers_info[paper.get_short_id()] = paper_info
    
    # Save updated papers_info to json file
    with open(file_path, "w") as json_file:
        json.dump(papers_info, json_file, indent=2)
    
    print(f"Results are saved in: {file_path}")
    
    return paper_ids

In [33]:
search_papers("llm")

Results are saved in: papers/llm/papers_info.json


['2412.18022v1',
 '2406.10300v1',
 '2405.19888v1',
 '2311.10372v2',
 '2411.15764v1']

In [34]:
def extract_info(paper_id: str) -> str:
    """
    Search for information about a specific paper across all topic directories.
    
    Args:
        paper_id: The ID of the paper to look for
        
    Returns:
        JSON string with paper information if found, error message if not found
    """
 
    for item in os.listdir(PAPER_DIR):
        item_path = os.path.join(PAPER_DIR, item)
        if os.path.isdir(item_path):
            file_path = os.path.join(item_path, "papers_info.json")
            if os.path.isfile(file_path):
                try:
                    with open(file_path, "r") as json_file:
                        papers_info = json.load(json_file)
                        if paper_id in papers_info:
                            return json.dumps(papers_info[paper_id], indent=2)
                except (FileNotFoundError, json.JSONDecodeError) as e:
                    print(f"Error reading {file_path}: {str(e)}")
                    continue
    
    return f"There's no saved information related to paper {paper_id}."

In [35]:
extract_info('2412.18022v1')

'{\n  "title": "Trustworthy and Efficient LLMs Meet Databases",\n  "authors": [\n    "Kyoungmin Kim",\n    "Anastasia Ailamaki"\n  ],\n  "summary": "In the rapidly evolving AI era with large language models (LLMs) at the core,\\nmaking LLMs more trustworthy and efficient, especially in output generation\\n(inference), has gained significant attention. This is to reduce plausible but\\nfaulty LLM outputs (a.k.a hallucinations) and meet the highly increased\\ninference demands. This tutorial explores such efforts and makes them\\ntransparent to the database community. Understanding these efforts is essential\\nin harnessing LLMs in database tasks and adapting database techniques to LLMs.\\nFurthermore, we delve into the synergy between LLMs and databases, highlighting\\nnew opportunities and challenges in their intersection. This tutorial aims to\\nshare with database researchers and practitioners essential concepts and\\nstrategies around LLMs, reduce the unfamiliarity of LLMs, and insp

In [None]:
import google.generativeai as genai
from google.generativeai.types import FunctionDeclaration, Tool # These should be stable imports

# Configure your API key
genai.configure(api_key="<>") # Replace with your actual API key

# Your tools definition (using dictionary format for parameters, as discussed in the last successful step):
tools = [
    Tool(
        function_declarations=[
            FunctionDeclaration(
                name="search_papers",
                description="Search for papers on arXiv based on a topic and store their information.",
                parameters={
                    "type": "OBJECT",
                    "properties": {
                        "topic": {
                            "type": "STRING",
                            "description": "The topic to search for"
                        },
                        "max_results": {
                            "type": "INTEGER",
                            "description": "Maximum number of results to retrieve"
                        }
                    },
                    "required": ["topic"]
                }
            )
        ]
    ),
    Tool(
        function_declarations=[
            FunctionDeclaration(
                name="extract_info",
                description="Search for information about a specific paper across all topic directories.",
                parameters={
                    "type": "OBJECT",
                    "properties": {
                        "paper_id": {
                            "type": "STRING",
                            "description": "The ID of the paper to look for"
                        }
                    },
                    "required": ["paper_id"]
                }
            )
        ]
    )
]


def process_query_gemini(query, tools, execute_tool):
    # Initialize the Gemini model with your tools
    model = genai.GenerativeModel('gemini-1.5-flash', tools=tools) # Or 'gemini-1.5-pro'

    # Initialize message history for Gemini
    messages = [{'role': 'user', 'parts': [{'text': query}]}] # Ensure initial message is in correct format

    process_query_loop = True
    while process_query_loop:
        try:
            response = model.generate_content(messages)

            if response.candidates and response.candidates[0].content:
                assistant_parts = []

                for part in response.candidates[0].content.parts:
                    if hasattr(part, 'text') and part.text: # Check for text attribute
                        print(part.text)
                        assistant_parts.append({'text': part.text}) # Append as dictionary if pure text

                        # If it's just text, and no other parts, we might be done
                        if len(response.candidates[0].content.parts) == 1:
                            process_query_loop = False

                    elif hasattr(part, 'function_call') and part.function_call: # Check for function_call attribute
                        # Append the function call as a dictionary for the history
                        assistant_parts.append({
                            "function_call": {
                                "name": part.function_call.name,
                                "args": {k: v for k, v in part.function_call.args.items()}
                            }
                        })

                        tool_name = part.function_call.name
                        tool_args = {k: v for k, v in part.function_call.args.items()}

                        print(f"Calling tool {tool_name} with args {tool_args}")

                        result = execute_tool(tool_name, tool_args)

                        # Append the assistant's response (tool call) and the user's tool result to messages
                        messages.append({'role': 'model', 'parts': assistant_parts})
                        messages.append({
                            'role': 'user',
                            'parts': [{ # Use dictionary for function_response
                                "function_response": {
                                    "name": tool_name,
                                    "response": {'result': result} # Wrap result in a dict as expected by API
                                }
                            }]
                        })

                        # Continue the loop to get the model's next response
                        break # Break from the current part processing to get a new model response
                else:
                    # If we processed all parts and didn't break for a tool call, we might be done
                    if not any(hasattr(p, 'function_call') for p in response.candidates[0].content.parts):
                        process_query_loop = False
            else:
                print("No response content from Gemini.")
                process_query_loop = False

        except Exception as e:
            print(f"An error occurred during API call: {e}")
            process_query_loop = False

# Example execute_tool (you MUST have this defined in your actual code)
def execute_tool(tool_name, tool_args):
    if tool_name == "search_papers":
        topic = tool_args.get("topic")
        max_results = tool_args.get("max_results", 5) # Handle default here
        if topic:
            return f"Successfully searched for {max_results} papers on {topic}."
        else:
            return "Error: Topic is required for search_papers."
    elif tool_name == "extract_info":
        paper_id = tool_args.get("paper_id")
        if paper_id:
            return f"Successfully extracted information for paper ID {paper_id}."
        else:
            return "Error: Paper ID is required for extract_info."
    return f"Unknown tool: {tool_name}"

# Example usage:
# process_query_gemini("Find me 3 recent papers on large language models.", tools, execute_tool)
# process_query_gemini("Extract information for paper ID 2305.01234.", tools, execute_tool)
# process_query_gemini("Tell me a simple joke.", tools, execute_tool)

In [37]:
def chat_loop():
    print("Type your queries or 'quit' to exit.")
    while True:
        try:
            query = input("\nQuery: ").strip()
            if query.lower() == 'quit':
                break
    
            process_query_gemini(query, tools, execute_tool)
            print("\n")
        except Exception as e:
            print(f"\nError: {str(e)}")

In [38]:
chat_loop()

Type your queries or 'quit' to exit.
Calling tool extract_info with args {'paper_id': '2412.18022v1'}
OK. I've extracted the information for paper 2412.18022v1.  Do you need me to display it, or is there anything else I can help you with?



Display what?  I need more information about what you want me to display.  Please provide a more specific request.



Calling tool extract_info with args {'paper_id': '2412.18022v1'}
OK. I've extracted the information for paper 2412.18022v1.  Do you want me to display it?  The response only indicates success; the actual details aren't included in the output.  I need a more sophisticated API to get the full paper details.



