In [1]:
from src import create_agent, ToolParameter

# Build the agent used by the first cells: no embedding model is passed and
# the "rag_enabled" config override is switched off.
agent = create_agent(
    embedding_model=None,
    config_overrides={"rag_enabled": False},
)
print("Agent created.")
def ping_tool(text: str) -> str:
    """Echo ``text`` back to the caller, prefixed with ``PONG: ``."""
    reply = "PONG: {}".format(text)
    return reply

# Register ping_tool with the agent under the name "ping"; it declares a
# single parameter, "text", of type "string".
agent.add_tool(
    name="ping",
    description="Simple echo ping.",
    function=ping_tool,
    parameters=[ToolParameter("text", "string", "Text to echo")],
)
print("Tool added.")

# First chat turn; the prompt itself asks the agent not to use tools.
# NOTE(review): the recorded output of this cell shows a BertModel
# (BAAI/bge-base-en-v1.5) being loaded after "Tool added." even though
# embedding_model=None and rag_enabled=False were passed to create_agent —
# confirm that this load is expected.
print(agent.chat("Say hello (no tools)."))


Agent created.
Tool added.


  from .autonotebook import tqdm as notebook_tqdm
Loading weights: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 199/199 [00:00<00:00, 1242.23it/s, Materializing param=pooler.dense.weight]                              
[1mBertModel LOAD REPORT[0m from: BAAI/bge-base-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Hello! I'm an AI agent specialized in time series analysis and forecasting. I can help you with forecasting, anomaly detection, data exploration, and more. How can I assist you today?


In [2]:
# Second chat turn on the same `agent` object created in the first cell.
print(agent.chat("Say bye (no tools)."))


Goodbye! Feel free to return whenever you need help with time series analysis. Have a great day! 🚀


In [3]:
# academic_agent.py
# -*- coding: utf-8 -*-
"""
AcademicAgent — Literature search + RAG over papers (with debug output)
"""
from __future__ import annotations
import io
import json
import re
import time
import uuid
import inspect
from dataclasses import dataclass, asdict, field
from typing import Any, Dict, List, Optional, Tuple, Iterable

import httpx
import numpy as np

# API SDKs
import arxiv
from semanticscholar import SemanticScholar  # noqa: F401  (kept for future use)
from habanero import Crossref

try:
    from pypdf import PdfReader
    HAS_PYPDF = True
except Exception:
    HAS_PYPDF = False

# Framework imports
from core import Agent, ToolParameter


from academic_agent import create_academic_agent
# ──────────────────────────────────────────────────────────────────────────────
# Example Usage
# ──────────────────────────────────────────────────────────────────────────────
# Example entry point: build the academic agent and run one literature search.
# NOTE(review): this rebinds the name `agent`, which the earlier cells bound to
# the object returned by create_agent(); re-running those chat cells afterwards
# would talk to this agent instead — consider a distinct name.
# NOTE(review): the recorded run of this cell ended with
# "ModuleNotFoundError: No module named 'arxiv'", presumably raised by the
# imports above, so this block has no recorded output.
if __name__ == "__main__":
    agent = create_academic_agent()

    # PHASE 1 — SEARCH (no LLM)
    # `order` is a comma-separated list of source names; presumably the order
    # in which the sources are queried — confirm against create_academic_agent.
    agent.search(
        "What are the latest transformer architectures for NLP?",
        per_source_limit=8,
        order="s2,arxiv,crossref",
        open_access_only=False,
    )

    # Ingestion step is currently disabled:
    # agent.ingest_all(use_pdf=True, chunk_size=512, overlap=128)



ModuleNotFoundError: No module named 'arxiv'

In [None]:
import re

# Very small English stopword set, focused on question boilerplate
_S2_STOPWORDS = {
    "what", "which", "who", "whom", "whose", "where", "when", "why", "how",
    "are", "is", "am", "was", "were", "be", "been", "being",
    "the", "a", "an", "of", "for", "to", "in", "on", "and", "or", "with",
    "latest", "recent", "new", "newest", "current", "state", "art",
    "paper", "papers", "article", "articles",
    "about", "regarding", "related", "using", "use", "based",
    "do", "does", "did", "can", "could", "should", "would",
}


def _simplify_query(q: str) -> str:
    """
    Heuristically simplify a natural-language question into a keyword-style
    query that works better with Semantic Scholar's /paper/search.

    Example:
        "What are the latest transformer architectures for NLP?"
        -> "transformer architectures nlp"
    """
    if not q:
        return q

    # Lowercase and strip surrounding whitespace
    q_clean = q.strip().lower()

    # Remove trailing ?! and other punctuation at ends
    q_clean = re.sub(r"[?!\.\,;:\s]+$", "", q_clean)

    # Tokenize: keep alphanumerics, drop everything else
    tokens = re.findall(r"[a-z0-9]+", q_clean)
    if not tokens:
        return q.strip()

    # Remove stopwords if we have enough tokens
    filtered = [t for t in tokens if t not in _S2_STOPWORDS]
    # If everything was stripped, fall back to original tokens
    if not filtered:
        filtered = tokens

    # Make a compact keyword query
    simplified = " ".join(filtered)

    # Avoid returning something *longer* than the original by mistake
    if len(simplified) > len(q.strip()):
        return q.strip()

    return simplified or q.strip()

# Quick check of the helper defined in this cell (it is named _simplify_query;
# the previous call used a name that does not exist and raised NameError).
# Expected output: transformer architectures nlp
print(_simplify_query("What are the latest transformer architectures for NLP?"))

In [None]:
import os
import json
from typing import Any, Dict, List, Optional
import httpx

# Default fields from Semantic Scholar Graph API
S2_DEFAULT_FIELDS = (
    "paperId,title,authors,venue,year,publicationDate,publicationTypes,"
    "externalIds,url,openAccessPdf,abstract"
)


def _s2_failure(
    raw: Any,
    rate_limited: bool = False,
    retry_after: Optional[str] = None,
) -> Dict[str, Any]:
    """Build the result dict that s2_search returns on every failure path."""
    return {
        "total": None,
        "offset": None,
        "items": [],
        "rate_limited": rate_limited,
        "retry_after": retry_after,
        "raw": raw,
    }


def s2_search(
    query: str,
    limit: int = 10,
    offset: int = 0,
    open_access_only: bool = False,
    fields: str = S2_DEFAULT_FIELDS,
    base_url: str = "https://api.semanticscholar.org/graph/v1",
) -> Dict[str, Any]:
    """
    Standalone Semantic Scholar Graph API search.
    Works directly inside Jupyter, no CLI, no dependencies on your agent.

    Never raises for network/HTTP/JSON problems: they are printed and
    reported through the returned dict instead.

    Args:
        query: Search string sent as the ``query`` parameter.
        limit: Page size; clamped to the range 1..100.
        offset: Result offset; negative values are clamped to 0.
        open_access_only: When True, adds the ``openAccessPdf`` parameter
            (with an empty value) to the request.
        fields: Comma-separated field names; blank entries are ignored and an
            empty/None value falls back to ``S2_DEFAULT_FIELDS``.
        base_url: API root; ``/paper/search`` is appended to it.

    Returns:
        {
            "total": int,
            "offset": int,
            "items": [ list of paper dicts ],
            "rate_limited": bool,
            "retry_after": Optional[str],
            "raw": dict
        }
        On any failure ``total`` and ``offset`` are None, ``items`` is empty
        and ``raw`` carries the error details; ``rate_limited`` is True only
        for an HTTP 429 response.
    """

    # Prepare API key if available
    api_key = os.getenv("S2_API_KEY") or os.getenv("SEMANTIC_SCHOLAR_API_KEY")
    headers = {
        "User-Agent": "StandaloneS2Search/1.0",
        "Accept": "application/json",
    }
    if api_key:
        headers["x-api-key"] = api_key
    else:
        print("[WARN] No S2_API_KEY found — you may hit rate limits.")

    # Validate fields (tolerate None / empty string / stray commas)
    fields_list = [f.strip() for f in (fields or "").split(",") if f.strip()]
    fields_param = ",".join(fields_list) if fields_list else S2_DEFAULT_FIELDS

    # Build request
    limit = max(1, min(int(limit), 100))
    offset = max(0, int(offset))

    params: Dict[str, Any] = {
        "query": query,
        "limit": limit,
        "offset": offset,
        "fields": fields_param,
    }

    # Server-side open access filter (parameter is sent with an empty value)
    if open_access_only:
        params["openAccessPdf"] = ""

    url = base_url.rstrip("/") + "/paper/search"

    # Execute request. The client is used as a context manager so its
    # connection pool is always released; the response body has already been
    # read by .get(), so `resp` stays usable after the client is closed.
    try:
        with httpx.Client(headers=headers, timeout=20.0) as client:
            resp = client.get(url, params=params)
        print(f"S2 HTTP {resp.status_code}")
    except Exception as e:
        # Deliberately broad: this helper reports failures instead of raising.
        print("Request failed:", e)
        return _s2_failure({"error": str(e)})

    # Handle 429 explicitly
    if resp.status_code == 429:
        retry_after = resp.headers.get("Retry-After")
        print(f"[RATE LIMITED] Retry-After={retry_after}")
        return _s2_failure({"status": 429}, rate_limited=True, retry_after=retry_after)

    # Handle non-200-ish
    try:
        resp.raise_for_status()
    except Exception as e:
        print("Non-200 error from S2:", e)
        # Prefer the JSON error body; fall back to the raw text.
        try:
            raw = resp.json()
        except Exception:
            raw = {"text": resp.text}
        return _s2_failure(raw)

    # Parse JSON
    try:
        raw = resp.json()
    except Exception as e:
        print("JSON parse error:", e)
        return _s2_failure({"text": resp.text})

    # Interpret payload
    if not isinstance(raw, dict):
        print("Unexpected response format:", type(raw))
        return _s2_failure(raw)

    total = raw.get("total")
    items = raw.get("data")

    # A missing "data" field is treated as "no items"; it is only considered
    # normal when total == 0, otherwise the payload is printed as a warning.
    if items is None:
        if isinstance(total, int) and total == 0:
            print("S2 returned total=0 with no data field.")
        else:
            print("[WARN] Missing 'data' field:", raw)
        items = []

    return {
        "total": total,
        "offset": raw.get("offset"),
        "items": items,
        "rate_limited": False,
        "retry_after": None,
        "raw": raw,
    }


In [None]:
# Smoke-test s2_search with a keyword query (at most 5 results requested);
# the bare `res` on the last line makes the notebook display the returned dict.
res = s2_search("transformer architecture nlp", limit=5)
res


In [None]:
!pip install PyMuPDF

In [None]:
# nltk.download is a Python function, not a shell command, so it must be
# called from Python rather than sent to the shell with a leading "!".
import nltk

nltk.download('punkt_tab')