# arXiv Quantum ML Paper Search & Curation

This notebook searches arXiv for Quantum Machine Learning papers and uses an LLM to score and curate them.

## 1. Imports and Configuration

In [1]:
import json
import os
import time
from datetime import datetime, timezone

import arxiv
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI
from pymongo import MongoClient

# Called before the os.getenv() lookups below (presumably so values from a
# local .env file are visible to them — confirm against python-dotenv docs).
load_dotenv()

# --- Configuration ---
# MONGO_URI falls back to a local MongoDB instance when the variable is unset.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017/")
# Database and collection that hold the curated papers.
DB_NAME = "arxiv_research"
COLLECTION_NAME = "qml_papers"

print("✓ Imports loaded")

✓ Imports loaded


## 2. Initialize LLM and Database

In [2]:
# --- LLM Configuration ---
# Shared chat model used by curate_papers() to score each paper.
# The API key is read from the GEMINI_API_KEY environment variable; os.getenv
# returns None when it is unset.
gemini_llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    verbose=True,
    google_api_key=os.getenv("GEMINI_API_KEY")
)

# --- MongoDB Connection ---
# `collection` is the module-level handle used by the search, insert and
# query cells below.
# NOTE(review): nothing here issues a command against the server before the
# "Connected" message is printed — confirm whether MongoClient connects
# eagerly or only on first operation.
client = MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

print(f"✓ Connected to MongoDB: {DB_NAME}.{COLLECTION_NAME}")
print(f"✓ LLM initialized: {gemini_llm.model}")

✓ Connected to MongoDB: arxiv_research.qml_papers
✓ LLM initialized: models/gemini-2.0-flash


E0000 00:00:1759287983.545175 4111515 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


## 3. arXiv Search Function

In [None]:
def search_arxiv(category: str, query: str, max_results: int = 25) -> list:
    """Search arXiv within one category and return papers not yet stored.

    Args:
        category: arXiv category (e.g., 'quant-ph', 'cs.LG')
        query: Search query string; it is AND-ed with the category filter
        max_results: Upper bound on the number of results requested per
            search (kept small by default so each search stays focused)

    Returns:
        List of paper dictionaries, one per result whose ``entry_id`` is not
        already present in the module-level MongoDB ``collection``
    """
    print(f"Executing arXiv search in '{category}' for query: '{query}'...")
    search = arxiv.Search(
        query=f'cat:{category} AND ({query})',
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    # Search.results() is deprecated in arxiv 2.x; Client.results(search) is
    # the supported way to iterate over a search.
    fetched = list(arxiv.Client().results(search))

    # Avoid duplicates: one $in query instead of one find_one() per paper.
    fetched_ids = [result.entry_id for result in fetched]
    known_ids = {
        doc["entry_id"]
        for doc in collection.find(
            {"entry_id": {"$in": fetched_ids}},
            {"entry_id": 1},
        )
    }

    results = [
        {
            "entry_id": result.entry_id,
            "title": result.title,
            "authors": [author.name for author in result.authors],
            "summary": result.summary,
            "pdf_url": result.pdf_url,
            "published": result.published,
            "updated": result.updated,
            "primary_category": result.primary_category,
            "categories": result.categories
        }
        for result in fetched
        if result.entry_id not in known_ids
    ]

    print(f"Found {len(results)} new papers.")
    return results

print("✓ search_arxiv() defined")

## 4. Paper Curation Function (LLM-based)

In [None]:
def _parse_curation_response(response_text: str) -> dict:
    """Extract the three scoring fields from the raw LLM reply.

    Args:
        response_text: Text returned by the LLM, optionally wrapped in a
            markdown code fence or surrounded by extra prose

    Returns:
        Dict with exactly the keys ``relevance_score``,
        ``score_justification`` and ``keywords``

    Raises:
        ValueError: If no JSON object is found, the JSON is invalid, or the
            object has no ``relevance_score``
    """
    # Slicing from the first "{" to the last "}" handles ```json fences,
    # bare ``` fences and any text around the object in one step.
    start = response_text.find("{")
    end = response_text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in LLM response")

    data = json.loads(response_text[start:end + 1])
    if "relevance_score" not in data:
        raise ValueError("LLM response has no 'relevance_score'")

    keywords = data.get("keywords") or []
    if isinstance(keywords, str):
        keywords = [keywords]

    # Only the documented fields are returned so a stray key in the LLM
    # output (e.g. "title" or "entry_id") can never overwrite paper data.
    return {
        "relevance_score": data["relevance_score"],
        "score_justification": data.get("score_justification", ""),
        "keywords": [str(keyword) for keyword in keywords],
    }


def curate_papers(papers: list, guidance_context: str) -> list:
    """Curate and score papers using LLM.

    Each paper is sent to the module-level ``gemini_llm`` in its own request.
    Curation is best-effort: if the request or the parsing of its reply fails,
    the error is printed and the paper is returned unchanged (without the
    scoring fields). Every input paper appears exactly once in the result.

    Args:
        papers: List of paper dictionaries from arXiv search
        guidance_context: Context string to guide scoring

    Returns:
        List of papers with added relevance_score, score_justification, and keywords
    """
    if not papers:
        return []

    curated_papers = []

    for paper in papers:
        prompt = f"""Analyze this research paper and provide a relevance score.

GUIDANCE CONTEXT:
{guidance_context}

PAPER DETAILS:
Title: {paper['title']}
Authors: {', '.join(paper['authors'])}
Abstract: {paper['summary']}
Categories: {', '.join(paper['categories'])}

TASK:
1. Assign a relevance score from 1 (low) to 10 (high) based on the guidance context
2. Write a brief (1-2 sentence) justification for your score
3. Extract 3-5 keywords or key concepts from the abstract

Respond ONLY with valid JSON in this exact format:
{{
    "relevance_score": <number 1-10>,
    "score_justification": "<your justification>",
    "keywords": ["keyword1", "keyword2", "keyword3"]
}}"""

        try:
            response = gemini_llm.invoke(prompt)
            response_text = response.content if hasattr(response, 'content') else str(response)
            curation_data = _parse_curation_response(response_text)
        except Exception as e:
            # Deliberately broad: one bad LLM reply must not abort the batch.
            print(f"Error curating paper '{paper['title'][:60]}...': {e}")
            # Add paper without curation data
            curated_papers.append(paper)
            continue

        # Nothing below can raise, so the paper is appended exactly once.
        curated_papers.append({**paper, **curation_data})
        print(f"Scored '{paper['title'][:60]}...' -> {curation_data['relevance_score']}/10")

    return curated_papers

print("✓ curate_papers() defined")

## 5. Define Search Parameters

In [None]:
# --- Dynamic Context for the Curator ---
# Scoring rubric handed to curate_papers(), which interpolates it verbatim
# into the "GUIDANCE CONTEXT" section of every LLM prompt. Edit this text to
# change what the curator rewards or penalises.
guidance_context = (
    "My primary interest is in practical and near-term Quantum Machine Learning. "
    "Score papers higher if they mention: "
    "1. Specific algorithms like VQAs, QAOA, Quantum Kernels, or QNNs. "
    "2. Benchmarking against classical methods or other quantum algorithms. "
    "3. Implementations on actual quantum hardware or widely used simulators (like PennyLane). "
    "4. Association with major quantum computing companies (IBM, Google Quantum AI, Xanadu, D-Wave, etc.). "
    "Score lower if the paper is purely theoretical, highly abstract (e.g., quantum algebra), or lacks a clear connection to machine learning."
)

# --- Search Parameters ---
search_terms = [
    '"Quantum Machine Learning"', '"QML"', '"Quantum AI"',
    '"Variational Quantum Algorithm"', '"VQA"', '"Quantum Neural Network"',
    '"Quantum Kernel Method"', '"Quantum Support Vector Machine"',
    '"Quantum Annealing" AND "machine learning"',
    '"parameterized quantum circuit"'
]
query_string = " OR ".join([f'ti:"{term}" OR abs:"{term}"' for term in search_terms])

# arXiv categories to search
# run_arxiv_search_job() issues one search per entry, combining the category
# with query_string.
categories = [
    "quant-ph",  # Quantum Physics (Core)
    "cs.LG",     # Machine Learning (CS)
    "cs.AI",     # Artificial Intelligence (CS)
    "cond-mat.dis-nn",  # Disordered Systems and Neural Networks (Physics)
    "math-ph"    # Mathematical Physics
]

# Summary of what the following cells will search for.
print(f"✓ Search parameters defined")
print(f"  Categories: {', '.join(categories)}")
print(f"  Search terms: {len(search_terms)} terms")

## 6. Test Search on Single Category

Let's test with just one category first to verify everything works.

In [None]:
# Smoke-test the search against one category before running the full job.
test_category = "quant-ph"  # Change this to test different categories

print(f"\n--- Testing search for category: {test_category} ---")
papers = search_arxiv(test_category, query_string)

if not papers:
    # An empty list also happens when every hit is already stored.
    print("No new papers found (they may already be in the database).")
else:
    print(f"\nFound {len(papers)} papers. Here are the first 3:")
    # Preview only: title, up to three authors, and the publication date.
    for i, paper in enumerate(papers[:3], 1):
        leading_authors = ', '.join(paper['authors'][:3])
        print(f"\n{i}. {paper['title']}")
        print(f"   Authors: {leading_authors}...")
        print(f"   Published: {paper['published']}")

## 7. Test Curation on Sample Papers

Test the LLM curation on a small batch first.

In [None]:
# Curate only the first 2 papers so the trial run makes at most two LLM calls.
if not papers:
    print("No papers to curate.")
else:
    print(f"\n--- Testing curation on first 2 papers ---")
    test_papers = papers[:2]

    curated = curate_papers(test_papers, guidance_context)

    print(f"\n--- Curation Results ---")
    for paper in curated:
        print(f"\nTitle: {paper['title']}")
        # Papers whose curation failed come back without the scoring fields.
        if 'relevance_score' not in paper:
            print("[Curation failed for this paper]")
            continue
        print(f"Score: {paper['relevance_score']}/10")
        print(f"Justification: {paper['score_justification']}")
        print(f"Keywords: {', '.join(paper['keywords'])}")

## 8. Insert Test Papers to Database

If curation worked, insert the test papers into MongoDB.

In [None]:
# Insert curated papers to database
# The `in locals()` guard keeps this cell from raising NameError when the
# curation cell above was skipped or found no papers.
if 'curated' in locals() and curated:
    print(f"\n--- Inserting {len(curated)} papers to database ---")

    for paper in curated:
        # Check for duplicates
        if not collection.find_one({"entry_id": paper.get("entry_id")}):
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # and returns a naive value.
            paper['timestamp_added'] = datetime.now(timezone.utc)
            collection.insert_one(paper)
            print(f"✓ Inserted: {paper['title'][:60]}...")
        else:
            print(f"⊘ Already exists: {paper['title'][:60]}...")

    print("\nDone!")
else:
    print("No curated papers to insert.")

## 9. Run Full Search Job (All Categories)

Once testing is successful, run the full job across all categories.

In [None]:
def run_arxiv_search_job():
    """Runs the arXiv search and curation job.

    For every entry in the module-level ``categories`` list this searches
    arXiv with ``query_string``, scores the new papers with ``curate_papers``
    and stores them in the MongoDB ``collection``. A database error while
    storing one category is printed and the job moves on to the next
    category.
    """
    print(f"\n--- Starting new arXiv search job at {datetime.now()} ---")

    for category in categories:
        print(f"\n--- Processing category: {category} ---")

        # Step 1: Search arXiv
        papers = search_arxiv(category, query_string)

        if not papers:
            print(f"No new papers found for category '{category}'.")
            continue

        # Step 2: Curate and score papers with LLM
        curated_papers = curate_papers(papers, guidance_context)
        if not curated_papers:
            continue

        # --- Database Insertion ---
        # Count real inserts so the summary below does not include papers
        # that were skipped as duplicates.
        inserted_count = 0
        try:
            for paper in curated_papers:
                # Ensure we don't insert duplicates from a failed run
                if collection.find_one({"entry_id": paper.get("entry_id")}):
                    continue
                # Timezone-aware UTC timestamp; datetime.utcnow() is
                # deprecated and returns a naive value.
                paper['timestamp_added'] = datetime.now(timezone.utc)
                collection.insert_one(paper)
                inserted_count += 1
            print(f"Successfully inserted {inserted_count} new papers into MongoDB for category '{category}'.")

        except Exception as e:
            print(f"An error occurred during database insertion for category '{category}': {e}")

# Uncomment to run:
# run_arxiv_search_job()

## 10. Query Database

View papers that have been inserted.

In [None]:
# Summarise what is stored: overall count, then the best-scored papers.
print(f"\n--- Papers in Database ---")
total_count = collection.count_documents({})
print(f"Total papers: {total_count}")

if total_count > 0:
    print(f"\nTop 5 papers by relevance score:")
    # Only documents that carry a score are ranked; highest score first.
    top_papers = (
        collection
        .find({"relevance_score": {"$exists": True}})
        .sort("relevance_score", -1)
        .limit(5)
    )

    for i, paper in enumerate(top_papers, 1):
        keyword_text = ', '.join(paper.get('keywords', []))
        print(f"\n{i}. {paper['title']}")
        print(f"   Score: {paper.get('relevance_score', 'N/A')}/10")
        print(f"   Keywords: {keyword_text}")
        print(f"   PDF: {paper['pdf_url']}")