In [5]:
from langchain_openai import ChatOpenAI
import os 
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [7]:
import xarray as xr

In [8]:
from arraylake import Client

In [9]:
from langchain.agents import initialize_agent, AgentType
from langchain.tools import Tool, StructuredTool
from langchain_experimental.utilities import PythonREPL
from langchain.prompts import MessagesPlaceholder
from langchain.agents import AgentExecutor, create_tool_calling_agent

In [10]:
import uuid


In [11]:
HF_TOKEN = os.environ["HF_TOKEN"]

In [12]:
llm = ChatOpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=HF_TOKEN,
    model="openai/gpt-oss-20b:fireworks-ai" 
)

In [13]:
python_repl = PythonREPL()

In [14]:
python_repl_tool = Tool(
    name="python_repl",
    func=python_repl.run,
    description="""
    You are a Python REPL specialized for scientific data analysis.

    You receive:
    - Dataset information (including how to access it) from Tool 1.
    - A file path to the downloaded dataset from Tool 2.
    - Optionally, an example analysis function from Tool 1 that may be relevant to the query.
    
    Your job:
    1. If an example function is provided and fits the user’s query, use it directly with the dataset.
    2. If no suitable example function is provided, write your own analysis code using standard scientific Python packages such as:
       - xarray (for handling datasets)
       - matplotlib (for plotting, with clean labels and colorbars)
       - numpy (for computations)
       - cartopy or geopandas (if maps are needed)
       - cmocean or matplotlib colormaps (for nice scientific colormaps)
    3. Save all plots to files (e.g., `output.png`) and print the filename.
    4. Keep your code clean, minimal, and runnable in a single Python cell.
    5. If user’s request cannot be completed, return a helpful error message explaining why.
    6. After you are done showing your result, please give an explanation of what the user is seeing.
    
    Guidelines:
    - Always open the dataset using xarray from the provided file path.
    - Assume the dataset may be large: use efficient operations (`.sel`, `.isel`, `.mean`, `.plot`, etc.).
    - Include titles, axis labels, and legends where appropriate.
    - Prefer clarity and readability of code over cleverness.
    - Do not invent dataset fields—only use those provided in the dataset info.
    - If the user asks for a time range that you might think is too large, warn them and suggest re-execution.

    """,
)

In [15]:
import xarray as xr
from pathlib import Path
import uuid
import pandas as pd

def read_io(start: str | None = None, end: str | None = None, save: bool = True) -> xr.Dataset:
    """
    Load the Indian Ocean dataset from GCS, optionally filter by time,
    and optionally save to llm_downloads/<uuid>.nc.
    
    Parameters
    ----------
    start : str, optional
        Start time (inclusive, UTC timezone-aware), e.g., "2000-01-01".
    end : str, optional
        End time (inclusive, UTC timezone-aware), e.g., "2005-12-31".
    save : bool, default=True
        If True, saves the dataset to llm_downloads/<uuid>.nc.
    
    Returns
    -------
    xr.Dataset
        The loaded (and optionally filtered) dataset.
    """
    dataset = xr.open_dataset(
        "gcs://nmfs_odp_nwfsc/CB/mind_the_chl_gap/IO.zarr",
        engine="zarr",
        backend_kwargs={"storage_options": {"token": "anon"}},
        consolidated=True
    )

    if start or end:
        start_dt = pd.to_datetime(start, utc=True) if start else None
        end_dt = pd.to_datetime(end, utc=True) if end else None
        dataset = dataset.sel(time=slice(start_dt, end_dt))

    if save:
        path = Path("llm_downloads") / f"{uuid.uuid4()}.nc"
        path.parent.mkdir(parents=True, exist_ok=True)
        dataset.to_netcdf(path)
        print(f"Saved dataset to {path}")

    return path


In [16]:
io_tool = StructuredTool.from_function(
    func=read_io,
    name="read_indian_ocean",
    description="loads the indian ocean dataset and returns the path it's saved to."
)

In [17]:
tools = [python_repl_tool, io_tool]

In [18]:
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a data analyst that can answer general questions about data, as well as make plots and do some basic analysis using a python REPL tool. Use the path returned from the tools to read files for analysis. If it fails, keep trying. You read the file from "),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

In [19]:
agent = create_tool_calling_agent(
    llm=llm,
    tools=tools,
    prompt=prompt,
)

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

In [20]:
r = agent_executor.invoke({"input": "can you please plot a variable of your choice from the indian ocean dataset. just do 1 hour worth sometime in 2020, dont use timezone aware dates"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `read_indian_ocean` with `{'start': '2020-02-12', 'end': '2020-02-12', 'save': True}`


[0m

TypeError: Cannot compare tz-naive and tz-aware datetime-like objects.