In [4]:
import json
import os

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
load_dotenv()

True

In [2]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [3]:
# Hugging Face access token, read from the process environment.
# Raises KeyError if HF_TOKEN is not set.
HF_TOKEN = os.environ["HF_TOKEN"]

In [9]:
# Chat model client: ChatOpenAI pointed at the Hugging Face router endpoint
# instead of the default OpenAI base URL, authenticated with HF_TOKEN.
llm = ChatOpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=HF_TOKEN,
    model="openai/gpt-oss-20b:fireworks-ai"  # NOTE(review): the ":fireworks-ai" suffix presumably selects the provider — confirm
)

In [10]:
# Two-message chat prompt: a fixed system instruction followed by a human
# turn that is filled in from the `question` input variable.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You shall answer my questions"),
    ("human", "{question}")
])

In [11]:
# Pipeline: render the prompt, send it to the model, then parse the reply
# into a plain string.
chain_no_context = prompt | llm | StrOutputParser()

In [12]:
# Smoke-test the chain with a trivial greeting as the `question`.
resp = chain_no_context.invoke({"question": "hey buddy"})

In [13]:
# Display the string returned by the chain.
resp

'Hey there! How can I help you today?'

In [14]:
from arraylake import Client

In [15]:
import xarray as xr

In [16]:
def read_arraylake_dataset(
    repo_name: str,
    group_name: str,
    variable: str = "all",
    branch: str = "main",
) -> "xr.Dataset | xr.DataArray":
    """Open one Zarr group of an Arraylake repository with xarray.

    Parameters
    ----------
    repo_name:
        Repository identifier passed to ``Client.get_repo``
        (e.g. ``"earthmover-public/era5-surface-aws"``).
    group_name:
        Zarr group inside the repository to open.
    variable:
        Name of a single variable to select, or ``"all"`` (the default) to
        keep the whole dataset.
    branch:
        Branch to open a read-only session on. Defaults to ``"main"``.

    Returns
    -------
    xr.Dataset | xr.DataArray
        The full dataset when ``variable == "all"``; otherwise the result of
        ``ds[variable]``, which is a single array rather than a dataset.

    Raises
    ------
    KeyError
        If ``variable`` is not present in the opened group.
    """
    # A new client is built with default arguments on every call.
    client = Client()
    repo = client.get_repo(repo_name)

    # Read-only session: nothing done here can modify the repository.
    session = repo.readonly_session(branch=branch)

    # consolidated=False: do not look for consolidated Zarr metadata.
    ds = xr.open_zarr(session.store, group=group_name, consolidated=False)

    if variable == "all":
        return ds
    return ds[variable]


In [None]:
# Open only the "sst" variable from the "spatial" group of the public
# ERA5 surface repository (default branch "main").
read_arraylake_dataset(
    "earthmover-public/era5-surface-aws",
    "spatial",
    "sst"
)

In [20]:
def read_dataset(
    source: str,
    **kwargs
):
    """Dispatch a dataset read to the backend named by ``source``.

    Parameters
    ----------
    source:
        Name of the storage backend. Only ``"arraylake"`` is implemented.
    **kwargs:
        Forwarded unchanged to the backend reader
        (``read_arraylake_dataset`` for ``"arraylake"``).

    Returns
    -------
    Whatever the selected backend reader returns.

    Raises
    ------
    NotImplementedError
        If ``source`` names a backend that has no reader yet.
    """
    if source == "arraylake":
        return read_arraylake_dataset(**kwargs)

    # this is where we will have functions for
    # aws, gcp, etc.
    raise NotImplementedError(
        f"Unsupported dataset source {source!r}; only 'arraylake' is implemented."
    )

In [23]:
# Go through the generic dispatcher; no `variable` is passed, so the
# Arraylake reader's default ("all") applies and the whole group is opened.
ds = read_dataset(
    "arraylake",
    repo_name="earthmover-public/era5-surface-aws",
    group_name="spatial"
)

In [24]:
# List the variables of the opened dataset (output is long).
ds.variables

Frozen({'blh': <xarray.Variable (time: 438312, latitude: 721, longitude: 1440)> Size: 2TB
dask.array<open_dataset-blh, shape=(438312, 721, 1440), dtype=float32, chunksize=(1, 721, 1440), chunktype=numpy.ndarray>
Attributes: (12/14)
    long_name:                                          Boundary layer height
    short_name:                                         blh
    units:                                              m
    original_format:                                    WMO GRIB 1 with ECMWF...
    ecmwf_local_table:                                  128
    ecmwf_parameter:                                    159
    ...                                                 ...
    grid_specification:                                 0.25 degree x 0.25 de...
    rda_dataset:                                        ds633.0
    rda_dataset_url:                                    https:/rda.ucar.edu/d...
    rda_dataset_doi:                                    DOI: 10.5065/BH6N-5N20
    rd

In [3]:
from pydantic import BaseModel, Field, confloat
from typing import Literal

In [4]:
from langchain_core.tools import StructuredTool

In [5]:
 = StructuredTool.from_function(
    extract_arraylake_data,
    name="extract_arraylake_data",
    description=(),
    args_schema=ERA5Params,
)

SyntaxError: invalid syntax (2646928006.py, line 1)

In [2]:
from dataset import Dataset, SpatialBounds, TemporalBounds, Variables, Access, Variable, DatasetCollection

In [3]:
# Echo the imported class to check which module it resolves to.
Dataset

dataset.Dataset

In [4]:
# Sample Dataset record describing the MUR SST product.
# NOTE(review): the bounds, times and access path look like placeholders
# (every lat/lon is 1.0, times are "1234"/"4567") — replace with real values.
d = Dataset(
    name="Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)",
    description="A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.",
    # Geographic extent; all four values are currently identical.
    spatial_bounds=SpatialBounds(
        min_lat=1.0,
        min_lon=1.0,
        max_lat=1.0,
        max_lon=1.0
    ),
    # Time extent, given as strings.
    temporal_bounds=TemporalBounds(
        start_time="1234",
        end_time="4567"
    ),
    # Single example variable entry.
    variables=Variables(
        variables=[Variable(standard_name="water temp", description="how hot da water")]
    ),
    # Where the data lives: platform name plus a path.
    access=Access(
        platform="aws",
        path="s3://path_to_file.zarr"
    )
)

In [9]:
# Wrap the record in a one-element DatasetCollection, serialise it to JSON,
# and parse it back so it displays as plain dicts and lists.
json.loads(DatasetCollection(datasets=[d]).json())

{'datasets': [{'name': 'Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)',
   'description': 'A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.',
   'temporal_bounds': {'start_time': '1234', 'end_time': '4567'},
   'spatial_bounds': {'min_lat': 1.0,
    'min_lon': 1.0,
    'max_lat': 1.0,
    'max_lon': 1.0},
   '

In [8]:
import json

In [33]:
# Same JSON round-trip for the single record, to inspect its serialised form.
json.loads(d.json())

{'name': 'Multi-Scale Ultra High Resolution (MUR) Sea Surface Temperature (SST)',
 'description': 'A global, gap-free, gridded, daily 1 km Sea Surface Temperature (SST) dataset created by merging multiple Level-2 satellite SST datasets. Those input datasets include the NASA Advanced Microwave Scanning Radiometer-EOS (AMSR-E), the JAXA Advanced Microwave Scanning Radiometer 2 (AMSR-2) on GCOM-W1, the Moderate Resolution Imaging Spectroradiometers (MODIS) on the NASA Aqua and Terra platforms, the US Navy microwave WindSat radiometer, the Advanced Very High Resolution Radiometer (AVHRR) on several NOAA satellites, and in situ SST observations from the NOAA iQuam project. Data are available from 2002 to present in Zarr format. The original source of the MUR data is the NASA JPL Physical Oceanography DAAC.',
 'temporal_bounds': {'start_time': '1234', 'end_time': '4567'},
 'spatial_bounds': {'min_lat': 1.0,
  'min_lon': 1.0,
  'max_lat': 1.0,
  'max_lon': 1.0},
 'variables': {'variables': [{