In [2]:
from langchain_openai import ChatOpenAI
import os 
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [4]:
import xarray as xr

In [5]:
from arraylake import Client

In [6]:
from langchain.agents import initialize_agent, AgentType
from langchain.tools import Tool, StructuredTool
from langchain_experimental.utilities import PythonREPL
from langchain.prompts import MessagesPlaceholder
from langchain.agents import AgentExecutor, create_tool_calling_agent

In [7]:
import uuid


In [8]:
# Hugging Face access token, read from the environment (load_dotenv() above may
# have populated it from a .env file). Raises KeyError if HF_TOKEN is unset.
HF_TOKEN = os.environ["HF_TOKEN"]

In [9]:
# Chat model reached through the Hugging Face router, using the OpenAI client
# class pointed at the router's base URL and authenticated with HF_TOKEN.
llm = ChatOpenAI(
    model="openai/gpt-oss-20b:fireworks-ai",
    base_url="https://router.huggingface.co/v1",
    api_key=HF_TOKEN,
)

In [10]:
# REPL helper whose .run method backs the `python_repl` tool defined below.
# NOTE(review): it runs whatever code the agent sends it — presumably inside this
# kernel's process; confirm and sandbox before exposing it to untrusted prompts.
python_repl = PythonREPL()

In [11]:
# Tool that lets the agent execute Python. The description is what the model
# sees, so it must only refer to tools that actually exist in `tools`
# (`read_indian_ocean`), and it tells the model to inspect the file before
# picking a variable instead of guessing names.
python_repl_tool = Tool(
    name="python_repl",
    func=python_repl.run,
    description="""
    You are a Python REPL specialized for scientific data analysis.

    You receive:
    - A file path to the downloaded dataset, as returned by the `read_indian_ocean` tool.
    - Optionally, dataset information or an example analysis function supplied in the conversation.

    Your job:
    1. If an example function is provided and fits the user’s query, use it directly with the dataset.
    2. If no suitable example function is provided, write your own analysis code using standard scientific Python packages such as:
       - xarray (for handling datasets)
       - matplotlib (for plotting, with clean labels and colorbars)
       - numpy (for computations)
       - cartopy or geopandas (if maps are needed)
       - cmocean or matplotlib colormaps (for nice scientific colormaps)
    3. Save all plots to files (e.g., `output.png`) and print the filename.
    4. Keep your code clean, minimal, and runnable in a single Python cell.
    5. If user’s request cannot be completed, return a helpful error message explaining why.

    Guidelines:
    - Always open the dataset using xarray from the provided file path.
    - Before choosing a variable, inspect the dataset (e.g. `print(ds)` or `print(list(ds.data_vars))`) and use only names that actually exist.
    - Assume the dataset may be large: use efficient operations (`.sel`, `.isel`, `.mean`, `.plot`, etc.).
    - Include titles, axis labels, and legends where appropriate.
    - Prefer clarity and readability of code over cleverness.
    - Do not invent dataset fields—only use those found in the dataset itself or in the provided dataset info.

    """,
)

In [18]:
import xarray as xr
from pathlib import Path
import uuid

def read_io(start: str | None = None, end: str | None = None, save: bool = True) -> Path | xr.Dataset:
    """
    Load the Indian Ocean dataset from GCS, optionally filter by time,
    and optionally save it to llm_downloads/<uuid>.nc.

    If neither ``start`` nor ``end`` is given, no time selection is applied,
    so with ``save=True`` the whole dataset is written to disk.

    Parameters
    ----------
    start : str, optional
        Start time (inclusive), e.g., "2000-01-01".
    end : str, optional
        End time (inclusive), e.g., "2005-12-31".
    save : bool, default=True
        If True, saves the dataset to llm_downloads/<uuid>.nc.

    Returns
    -------
    pathlib.Path or xr.Dataset
        When ``save`` is True, the path of the NetCDF file that was written
        (this is what the agent tool relies on). When ``save`` is False, the
        opened (and optionally filtered) dataset itself.
    """
    opened = xr.open_dataset(
        "gcs://nmfs_odp_nwfsc/CB/mind_the_chl_gap/IO.zarr",
        engine="zarr",
        backend_kwargs={"storage_options": {"token": "anon"}},
        consolidated=True
    )

    dataset = opened.sel(time=slice(start, end)) if (start or end) else opened

    if not save:
        # Nothing is written, so there is no path to return; hand back the
        # still-open dataset instead of failing on an unbound `path`.
        return dataset

    try:
        path = Path("llm_downloads") / f"{uuid.uuid4()}.nc"
        path.parent.mkdir(parents=True, exist_ok=True)
        dataset.to_netcdf(path)
    finally:
        # Close the handle returned by open_dataset (not the derived selection)
        # once the data is on disk or the write has failed.
        opened.close()

    print(f"Saved dataset to {path}")
    return path


In [19]:
# Expose read_io to the agent under the name `read_indian_ocean`; the agent
# calls it with read_io's own keyword arguments (start, end, save).
io_tool = StructuredTool.from_function(
    name="read_indian_ocean",
    description="loads the indian ocean dataset and returns the path it's saved to.",
    func=read_io,
)

In [20]:
# Everything the agent is allowed to call: code execution plus the dataset loader.
tools = [
    python_repl_tool,
    io_tool,
]

In [21]:
# System + user messages, followed by the `agent_scratchpad` placeholder that the
# agent executor fills in while it runs. The system text previously ended with
# an unfinished sentence ("You read the file from "); that fragment is removed
# so the model no longer receives a dangling instruction.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a data analyst that can answer general questions about data, "
            "as well as make plots and do some basic analysis using a python REPL tool. "
            "Use the path returned from the tools to read files for analysis. "
            "If it fails, keep trying.",
        ),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

In [22]:
# Combine model, tools and prompt into a tool-calling agent, then wrap it in an
# executor; verbose=True prints each tool invocation and its result.
agent = create_tool_calling_agent(llm, tools, prompt)

agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
)

In [None]:
# Ask for a deliberately small, time-bounded request so the loader tool is
# called with explicit start/end values.
question = (
    "can you please plot a variable of your choice from the indian ocean dataset. "
    "just do 1 hour worth, i.e. provide a start and end to the function."
)
r = agent_executor.invoke({"input": question})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `read_indian_ocean` with `{'start': '2020-01-01T00:00:00', 'end': '2020-01-01T01:00:00', 'save': True}`


[0mSaved dataset to llm_downloads/736b530c-b121-410a-b1d6-e59573b59bbd.nc
[33;1m[1;3mllm_downloads/736b530c-b121-410a-b1d6-e59573b59bbd.nc[0m[32;1m[1;3m
Invoking: `python_repl` with `import xarray as xr
import matplotlib.pyplot as plt

# Open dataset
path = 'llm_downloads/736b530c-b121-410a-b1d6-e59573b59bbd.nc'
# open as chunked
ds = xr.open_dataset(path, chunks={})
# choose variable, e.g., 'sst'
var = 'sst'
if var not in ds:
    # find similar
    var = 'sea_surface_temperature'

# subset time
subset = ds[var].sel(time=slice('2020-01-01T00:00:00','2020-01-01T01:00:00'))
# plot first timestamp
plt.figure(figsize=(8,4))
subset.isel(time=0).plot(cmap='viridis')
plt.title(f'{var} on {subset.time.values[0]}')
plt.savefig('output.png')
print('Saved plot to output.png')`


[0m