In [1]:
# config: notebook-wide settings read by the cells below


# Which retrieval pipeline to build: 'auto' wraps a LangChain pandas dataframe agent,
# 'manual' chains a prompt, a tool-calling LLM and a Python REPL tool.
pdagent_type = 'auto' #one of: auto, manual
# OpenAI chat model name handed to ChatOpenAI.
llm_type = 'gpt-4o' #one of: gpt-3.5-turbo-0613, gpt-3.5-turbo-0125, gpt-3.5-turbo, gpt-4o
# NOTE(review): not referenced by any cell visible in this notebook section — confirm it is still needed.
initialization_params = {} #eg json version

### Imports & data setup

In [2]:
import pandas as pd
import os
import json
import openai
import chromadb
import gradio as gr

from langchain.chains.query_constructor.base import AttributeInfo
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain.agents.agent_types import AgentType
from langchain_experimental.tools import PythonAstREPLTool
from langchain_core.output_parsers.openai_tools import JsonOutputKeyToolsParser

# When the OpenAI key is missing (or empty) in the environment, let the local
# `key` helper module set it, then verify that it is really there.
if not os.getenv("OPENAI_API_KEY"):
    import key
    key.init()
    assert os.getenv('OPENAI_API_KEY')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
path = './model_experiment_fields_ScenarioMIP_CMIP_filename_dates.csv'

# CSV column name -> column name used in the rest of the notebook.
# The dict order also fixes the column order of `df`.
column_names = {
    'collection': 'collection',
    'MIP': 'MIP',
    'model': 'model',
    'experiment': 'experiment',
    'variant': 'variant',
    'tableID': 'temporal resolution',
    'variable': 'variable',
    'grid': 'grid',
    'version': 'version',
    'start_YM': 'start year',
    'end_YM': 'end year',
    'filename': 'filename',
}

# Read the listing, tag every row with its collection, keep/rename the columns
# of interest and store everything as strings.
df = (
    pd.read_csv(path)
    .assign(collection='giss_cmip6')
    .loc[:, list(column_names)]
    .rename(columns=column_names)
    .astype(str)
)

def url(x):
    """Build a datashare URL by joining the string items of `x` with '/'.

    `x` is any iterable of strings (here: one dataframe row); the joined
    path is appended to the fixed portal prefix.
    """
    base = 'portal.nccs.nasa.gov/datashare/'
    return base + '/'.join(x)
# One URL per dataset, built from every field of the row.
df['URL'] = df.apply(url, axis=1)

# Keep only the first four characters of the start/end stamps
# (presumably YYYYMM -> YYYY; confirm against the CSV).
df['start year'] = df['start year'].str[:4]
df['end year'] = df['end year'].str[:4]

# Rows that differ only in version/filename/URL describe the same dataset.
# After sorting on all columns, the last row of each group carries the
# highest version string, so keep that one.
oldn = df.shape[0]
dedup_cols = [c for c in df.columns if c not in ('version', 'filename', 'URL')]
df = df.sort_values(df.columns.to_list(), ascending=True).drop_duplicates(
    subset=dedup_cols, ignore_index=True, keep='last')
print('removed', oldn-df.shape[0], 'rows corresponding to old-version datasets')

# Load the per-table variable descriptions from the CMOR JSON tables.
path = '../../cmip6-cmor-tables/Tables'
jdfs = []
for fname in sorted(os.listdir(path)):  # sorted: listdir order is arbitrary
    if not fname.endswith('.json'):
        # The table name is sliced out of 'CMIP6_<table>.json' below, and
        # anything else in the directory would not parse as JSON anyway.
        continue
    with open(os.path.join(path, fname), encoding='utf-8') as j:
        d = json.load(j)

    try:
        jdf = pd.DataFrame.from_dict(d['variable_entry'], orient='index')
    except Exception:
        # Files without a usable 'variable_entry' section are not variable
        # tables; skip them but let KeyboardInterrupt/SystemExit propagate.
        print('skipping', fname, 'due to formatting issue')
        continue

    # 'CMIP6_<table>.json' -> '<table>', matching df['temporal resolution'].
    jdf['temporal resolution'] = fname[len('CMIP6_'):-len('.json')]
    jdfs.append(jdf)

varsdf = pd.concat(jdfs)
varsdf = varsdf.rename(columns={'out_name': 'variable'})
varsdf = varsdf.drop_duplicates(subset=['variable', 'temporal resolution'])

# Left merge on the columns the two frames share (expected to be 'variable'
# and 'temporal resolution'), so every dataset row is kept even when no
# table entry describes it.
dfm = df.merge(varsdf, how='left')
dfmsub = dfm.drop(['temporal resolution', 'grid', 'version', 'type', 'positive', 'valid_min', 'valid_max', 
        'ok_min_mean_abs', 'ok_max_mean_abs', 'flag_values', 'flag_meanings'], axis=1)

removed 6985 rows corresponding to old-version datasets
skipping CMIP6_coordinate.json due to formatting issue
skipping CMIP6_input_example.json due to formatting issue
skipping CMIP6_CV.json due to formatting issue
skipping CMIP6_formula_terms.json due to formatting issue


In [4]:
# System instruction for the "standalone question" step: the LLM condenses the
# conversation into a few key words and must not answer the request itself.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might reference "
    "context in the chat history, summarize the user's request in a few key "
    "words without punctuation. Do NOT attempt to answer the user's request. "
)

# Message layout: system instruction, then the prior turns, then the new input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

model = ChatOpenAI(temperature=0, model=llm_type)

# prompt -> chat model -> plain string
chain_standalone = contextualize_q_prompt | model | StrOutputParser()

In [11]:
if pdagent_type == 'auto':
    class create_pd_agent_that_returns_just_dataframe:
        """Wrap a LangChain pandas dataframe agent and return only what it retrieved.

        `invoke` returns the observation of the agent's first intermediate
        step (expected to be the matching rows of the dataframe) instead of
        the agent's final text answer.
        """

        def __init__(self, model, df):
            """Build the agent.

            model: chat model passed to `create_pandas_dataframe_agent`.
            df: dataframe the agent queries; its head is shown in the prompt.
            """
            system = f"""
                You are a climate scientist with a pandas dataframe `df` that lists and describes all datasets 
                within CMIP6. Here is the output of `df.head().to_markdown:` \
                {df.head().to_markdown()} \
                Given a colleague's query, EXECUTE Python code to find ROWs of `df` corresponding 
                to CMIP6 datasets *similar* to the query (not necessarily an exact match!). \
                    Return ONLY the entire rows of `df` and nothing else. EXECUTE the code yourself!!
                    Do it yourself!!"""
            self.pdagent = create_pandas_dataframe_agent(
                model,
                df,  # the dataframe given to this wrapper, not a notebook global
                prefix=system,
                verbose=True,
                agent_type=AgentType.OPENAI_FUNCTIONS,
                allow_dangerous_code=True,
                handle_parsing_errors="If you believe that the data doesn't contain the information you are looking for, please try rephrasing your question.",
                return_intermediate_steps=True,
                # NOTE(review): forwarded through **kwargs; confirm the agent
                # executor actually honours `response_format`.
                response_format={ "type": "json_object" },
            )

        def invoke(self, query):
            """Run the agent on `query` and return its first tool observation.

            Returns an empty DataFrame when the agent raises or never ran any
            code, so callers can keep iterating; the reason is printed rather
            than swallowed silently.
            """
            try:
                result = self.pdagent.invoke(query)
            except Exception as exc:
                print('pandas agent failed:', repr(exc))
                return pd.DataFrame([])

            steps = result.get('intermediate_steps') or []
            if not steps:
                # The agent answered in prose without executing any code.
                print('pandas agent returned no intermediate steps')
                return pd.DataFrame([])
            # Each step is an (action, observation) pair; keep the observation.
            return steps[0][1]

    model = ChatOpenAI(temperature=0, model=llm_type)
    pdagent = create_pd_agent_that_returns_just_dataframe(model, dfmsub)

elif pdagent_type == 'manual':
    # Same model as the rest of the notebook, taken from the config cell.
    llm = ChatOpenAI(temperature=0, model=llm_type)
    tool = PythonAstREPLTool(locals={"df": dfmsub})
    # Force the model to always answer with a call to the Python tool.
    llm_with_tools = llm.bind_tools([tool], tool_choice=tool.name)
    parser = JsonOutputKeyToolsParser(key_name=tool.name, first_tool_only=True)

    # NOTE(review): the rendered table becomes part of a prompt template, so a
    # literal '{' or '}' in the data would be read as a template variable.
    system = f"""
    You are a climate scientist with a pandas dataframe `df` that lists and describes all datasets 
    within CMIP6. Here is the output of `df.head().to_markdown:` \
    {dfmsub.head().to_markdown()} \
    Given a colleague's query, find ROWs of `df` corresponding 
    to CMIP6 datasets *similar* to the query (not necessarily an exact match!). \
        Return ONLY the entire rows of `df` and nothing else. EXECUTE the code yourself!!
    """
    prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{question}")])
    # prompt -> tool-calling LLM -> extract the tool arguments -> run them
    pdagent = prompt | llm_with_tools | parser | tool



### Evaluate

In [12]:
# Load the annotated evaluation queries from the spreadsheet (one row per query;
# the remaining columns hold the expected dataset fields) and preview the first rows.
queries = pd.read_excel('evals.xlsx')
queries.head()

Unnamed: 0,annotator,src,query,variable,frequency,start year,end year,MIP,experiment
0,Sonia,Us,What are some datasets relevant to extreme wea...,"pr, ua, etc",,,,,
1,Sonia,Us,"Plot the 10, 20, 50, and 100 yr return period ...",pr,day,,,historical,
2,Sonia,Us,Show me all the times where Ames Iowa has more...,tas,day,1980.0,2014.0,CMIP,historical
3,Sonia,Us,Show me the average annual number of days per ...,tasmax,day,2050.0,2070.0,ScenarioMIP,ssp<NNN>
4,Sonia,Us,You just showed me average annual number of da...,tasmax,day,2050.0,2070.0,ScenarioMIP,ssp<NNN>


In [13]:
def to_dictsets(r):
    """Return a copy of row `r` with comma-separated string cells turned into sets.

    Every string cell that contains a comma (except the free-text 'query'
    cell) is split on ',' and each item is stripped, so both "a, b" and
    "a,b" become {'a', 'b'}. All other cells are left untouched. The row
    passed in is not modified.
    """
    r = r.copy()
    # Iterate over a dict snapshot so assigning into `r` is safe.
    for k, v in r.to_dict().items():
        if k != 'query' and isinstance(v, str) and ',' in v:
            r[k] = {part.strip() for part in v.split(',') if part.strip()}
    return r

In [14]:
# %%capture
outs = []
for i, r in queries.iterrows():
    r = to_dictsets(r)
    conversation = {'input': r['query'], 'chat_history': []}
    standalone = chain_standalone.invoke(conversation)
    print(standalone)
    # model = ChatOpenAI(temperature=0, model=llm_type)
    # pdagent = create_pd_agent_that_returns_just_dataframe(model, dfmsub)
    # out = pdagent.invoke(standalone+'?')
    outs.append(pdagent.invoke(standalone+'?'))

datasets for extreme weather


[1m> Entering new AgentExecutor chain...[0m
Return period plot for maximum daily rainfall


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo create a return period plot for maximum daily rainfall, you typically need a time series of daily rainfall data. The return period plot (or return level plot) is used in extreme value analysis to estimate the probability of extreme events, such as heavy rainfall.

Here is a step-by-step guide to create a return period plot for maximum daily rainfall using Python:

1. **Load the data**: Ensure you have a time series of daily rainfall data.
2. **Fit an extreme value distribution**: Use the Generalized Extreme Value (GEV) distribution or another appropriate distribution.
3. **Calculate return levels**: Estimate the return levels for different return periods.
4. **Plot the return period plot**.

Below is an example code snippet to achieve this using Python and libraries like `pandas`, `numpy`, `scipy`, and

In [15]:
non_score_cols = set(['annotator', 'src', 'query'])


def _expected_values(truth):
    """Return the set of values accepted for one ground-truth cell.

    A comma-separated string such as "tas, pr" lists several acceptable
    answers; anything else is a single acceptable answer.
    """
    if isinstance(truth, str) and ',' in truth:
        return {part.strip() for part in truth.split(',') if part.strip()}
    return {truth}


def _score_output(out, r):
    """Score one retrieved dataframe `out` against one annotated query row `r`.

    Each scored column contributes between 0 and 1: the fraction of rows in
    `out` that agree with the annotation, or a full point when the annotation
    is missing or open-ended ('etc'). `out` must be non-empty.
    """
    n_rows = len(out)
    score = 0
    for c in r.index:
        # Metadata columns and the spreadsheet's unnamed index column are
        # not part of the ground truth.
        if c in non_score_cols or str(c).startswith('Unnamed'):
            continue

        truth = r[c]
        # Check "no constraint" first: a missing year must not fall into the
        # numeric comparisons below, where NaN compares False for every row.
        if pd.isna(truth) or (isinstance(truth, str) and 'etc' in truth):
            score += 1
        elif c not in out.columns:
            # The result lacks this field, so nothing can be credited.
            continue
        elif c == 'start year':
            # A dataset qualifies when it starts no later than requested.
            score += sum(out[c].astype(float) <= truth) / n_rows
        elif c == 'end year':
            # A dataset qualifies when it ends no earlier than requested.
            score += sum(truth <= out[c].astype(float)) / n_rows
        else:
            score += sum(out[c].isin(_expected_values(truth))) / n_rows
    return score


pts = 0
valids = 0
for i, out in enumerate(outs):
    # Only non-empty dataframes can be scored; anything else is a failed query.
    if not isinstance(out, pd.DataFrame) or len(out) == 0:
        continue

    valids += 1
    pts += _score_output(out, queries.iloc[i])

print(valids, pts)

0 0


In [10]:
# Display the full evaluation table for reference.
# NOTE(review): this cell's execution count is out of sequence with the cells above;
# re-run the notebook top to bottom before sharing.
queries

Unnamed: 0,annotator,src,query,variable,frequency,start year,end year,MIP,experiment
0,Sonia,Us,What are some datasets relevant to extreme wea...,"pr, ua, etc",,,,,
1,Sonia,Us,"Plot the 10, 20, 50, and 100 yr return period ...",pr,day,,,historical,
2,Sonia,Us,Show me all the times where Ames Iowa has more...,tas,day,1980.0,2014.0,CMIP,historical
3,Sonia,Us,Show me the average annual number of days per ...,tasmax,day,2050.0,2070.0,ScenarioMIP,ssp<NNN>
4,Sonia,Us,You just showed me average annual number of da...,tasmax,day,2050.0,2070.0,ScenarioMIP,ssp<NNN>
5,Sonia,Us,Get the average annual number of days rainfall...,pr,day,1980.0,2014.0,CMIP,amip
6,Sonia,Us,"Show me in the future, all the suitable places...","clt, pr, etc",,2015.0,,ScenarioMIP,ssp<NNN>
7,Sonia,Us,Show me a future simulation where there is a d...,"pr, tas, etc",mon,2015.0,,ScenarioMIP,ssp<NNN>
8,Sonia,Us,Tell me the average SPEI under SSP585 scenario...,"tas, pr",mon,2080.0,2100.0,ScenarioMIP,ssp585
