In [1]:
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import Document
import json
import os

In [2]:
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser

In [3]:
json_path = 'datasets.json'


def load_safe_desc(path: str) -> str:
    """
    Read a JSON document and return it as brace-escaped, pretty-printed text.

    The parsed content is re-serialised with a two-space indent (non-ASCII
    characters are kept as-is), and every ``{`` / ``}`` is doubled so the
    text can be embedded in a LangChain prompt template without being read
    as a template variable.
    """
    with open(path, "r", encoding="utf-8") as handle:
        pretty = json.dumps(json.load(handle), ensure_ascii=False, indent=2)

    # Double each brace in a single pass over the text.
    return pretty.translate({ord("{"): "{{", ord("}"): "}}"})

# Build the brace-escaped dataset description once, when this cell runs.
# The SYSTEM_PROMPT f-string defined in a later cell interpolates this value.
safe_desc = load_safe_desc(json_path)

In [10]:
def read_examples(filename: str):
    """
    Parse a JSON file of code description examples.

    Returns whatever the file's top-level JSON value decodes to
    (expected to be a list of dictionaries).
    """
    with open(filename, "r", encoding="utf-8") as handle:
        return json.load(handle)

In [11]:
# Load the example records from disk (the same file is read again in a later cell).
examples = read_examples("code_descriptions.json")

In [None]:
def create_db_examples(examples):
    """
    Build a persisted Chroma vector store from example records.

    Every record is read through its 'page_content' and 'metadata' keys and
    wrapped in a LangChain ``Document``. The documents are embedded with the
    notebook-level ``doc_embedder`` and stored under ./chroma_db_examples.

    Returns the populated Chroma vector store.
    """
    # The example text becomes the searchable content; the record's metadata
    # is carried along unchanged.
    documents = [
        Document(page_content=entry['page_content'], metadata=entry['metadata'])
        for entry in examples
    ]
    store = Chroma.from_documents(
        documents,
        doc_embedder,
        persist_directory="./chroma_db_examples",
    )
    store.persist()
    return store

In [None]:
# Read the examples and index them into the persisted Chroma store.
# NOTE(review): create_db_examples reads the global `doc_embedder`, which is
# only assigned in a later cell of this notebook, so that cell has to be run
# before this one.
examples = read_examples("code_descriptions.json")
vector_store_hf = create_db_examples(examples)

In [4]:
# System prompt for the dataset-selection LLM call made in description_reader.
# This is an f-string: `safe_desc` is interpolated once, when this cell runs,
# so the prompt does not change until the cell is executed again.
# `safe_desc` comes from load_safe_desc, which doubles every brace so the
# prompt template does not read the JSON braces as template variables.
SYSTEM_PROMPT = f"""
 Pick the best single dataset and variable (might be several) using only the task description.
[TASK DESCRIPTION]
{safe_desc}

[OUTPUT SCHEMA — return ONLY these fields in this order]
dataset: <dataset name or "none">
variable: <variable name or "none">
lat,lon boundaries : <[lat_min, lat_max], [lon_min, lon_max] or "global">
time range: <YYYY-MM-DD to YYYY-MM-DD or "full available">
suggestions (from description only): <where this variable is available (region/coverage) if stated; else "none">

[DECISION RULES]
- Choose the most specific dataset & variables (can be more than one) explicitly supported by the description.
- If region/time are missing, use "global" and "full available".
- If no suitable match exists, set dataset and variable to "none".
- Suggestions must reflect ONLY what the description states (no external inference).
"""

In [5]:
# NOTE(review): this cell redefines load_safe_desc, which an earlier cell
# already defines with the same behaviour; consider keeping a single copy.
def load_safe_desc(path: str) -> str:
    """
    Read a JSON document and return it as brace-escaped, pretty-printed text.

    The parsed content is re-serialised with a two-space indent (non-ASCII
    characters are kept as-is), and every ``{`` / ``}`` is doubled so the
    text can be embedded in a LangChain prompt template without being read
    as a template variable.
    """
    with open(path, "r", encoding="utf-8") as handle:
        pretty = json.dumps(json.load(handle), ensure_ascii=False, indent=2)

    # Double each brace in a single pass over the text.
    return pretty.translate({ord("{"): "{{", ord("}"): "}}"})

In [6]:
def get_example_of_visualizations(query: str) -> str:
    """
    Retrieve the example file that best matches a plotting query.

    Opens the Chroma store persisted in ./chroma_db_examples (embedding with
    the notebook-level ``doc_embedder``), takes the single nearest document,
    and reads the text file named by that document's 'source' metadata.

    Parameters:
    - query (str): The user's query about plotting.

    Returns:
    - str: The content of the most relevant example file, or "" when the
      search returns no match, the match carries no 'source' metadata, or
      the file cannot be read.
    """
    vector_store_hf = Chroma(
        persist_directory="./chroma_db_examples",
        embedding_function=doc_embedder
    )
    # Perform a similarity search for the single closest example
    results = vector_store_hf.similarity_search_with_score(query, k=1)
    if not results:
        # An empty index yields no hits; indexing [0] would raise IndexError.
        return ""

    # The similarity score is not needed, only the matched document.
    doc, _score = results[0]

    source = doc.metadata.get('source')
    if not source:
        return ""

    # Remove only a literal leading "./". str.lstrip('./') would strip ANY
    # run of '.' and '/' characters, turning "../x.txt" or ".hidden/x.txt"
    # into a different path.
    file_name = source[2:] if source.startswith('./') else source
    full_path = os.path.join('./', file_name)

    # Read and return the content of the txt file (best effort)
    try:
        with open(full_path, 'r', encoding='utf-8') as file:
            return file.read()
    except Exception as e:
        print(f"An error occurred while reading the file: {str(e)}")
        return ""  # Return empty string if error occurs

In [7]:
def description_reader(query):
    """
    Recommend a dataset/variable for a query and attach a matching code example.

    Sends ``query`` to the chat model together with SYSTEM_PROMPT, then looks
    up the closest stored example via get_example_of_visualizations and
    appends it to the model's answer.

    Parameters:
    - query (str): The user's question.

    Returns:
    - str: The model's answer, followed by the example code when one was found.
    """
    # HF_TOKEN and SYSTEM_PROMPT are notebook-level globals looked up at call
    # time. SYSTEM_PROMPT already embeds the dataset description, so the JSON
    # file is not re-read here (the previous reload result was never used).
    llm = ChatOpenAI(
        base_url="https://router.huggingface.co/v1",
        api_key=HF_TOKEN,
        model="openai/gpt-oss-20b:fireworks-ai"
    )

    # Create the prompt template
    prompt = ChatPromptTemplate.from_messages([
        ("system", SYSTEM_PROMPT),
        ("human", "{question}")
    ])

    # Create the chain with output parser
    chain_no_context = prompt | llm | StrOutputParser()
    response = chain_no_context.invoke({"question": query})

    example = get_example_of_visualizations(query)
    if not example:
        # No example could be retrieved: return the answer without a
        # dangling "you can use this code" trailer.
        return response

    # Real newlines separate the answer from the code (was the literal "/n/n").
    return response + ' you can use this code to analyse the data:\n\n' + example

In [8]:
from typing import Optional, Literal
from pydantic import BaseModel, Field, conint, confloat
from langchain_core.tools import StructuredTool
# Argument schema for the adviser tool (passed as args_schema to
# StructuredTool.from_function in the next cell).
class adviserPARAM(BaseModel):
    # Free-text user question; `...` as the default marks the field as required.
    query: str = Field(
        ..., description="User query"
    ) 

In [9]:
# Wrap description_reader as a structured tool whose single argument is
# described by adviserPARAM.
adviser_tool = StructuredTool.from_function(
    func=description_reader,
    name="adviser_tool",
    description="Use this tool to find a suitable dataset and code example",
    args_schema=adviserPARAM,
)

In [None]:
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain.prompts import MessagesPlaceholder
from langchain_huggingface import HuggingFaceEndpointEmbeddings

# Use your existing tools
tools = [adviser_tool]

# Hugging Face access token shared by the chat models and the embedder.
# It is read from the environment so the secret is never saved inside the
# notebook: set HF_TOKEN before starting the kernel. Falls back to an empty
# string when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Embedding model used both to build and to query the Chroma example store.
doc_embedder = HuggingFaceEndpointEmbeddings(
    model="Qwen/Qwen3-Embedding-8B",
    task="feature-extraction",
    model_kwargs={"normalize": True},
    huggingfacehub_api_token=HF_TOKEN,
)

# System & user prompt for the agent
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert in climate data analysis, you have adviser tool, which can help you to asnwer user's questions about variables/datasets. If the question about data, use only information from adviser_tool"),
        ("user", "{input}"),
        # Slot where the agent's intermediate tool calls/results are inserted.
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

# Chat model that drives the agent
llm = ChatOpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=HF_TOKEN,
    model="openai/gpt-oss-120b:fireworks-ai"
)

# Define the agent
agent = create_tool_calling_agent(
    llm=llm,
    tools=tools,
    prompt=prompt,
)

# Create the executor
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)