In [1]:
from prompts import *
import os
import key
from model import llm_init
from langchain_core.output_parsers import StrOutputParser
import re
from pprint import pprint
import utils
import pandas as pd
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI, OpenAI
from langchain.agents import AgentExecutor

from langchain.agents import AgentExecutor, create_tool_calling_agent, tool
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

In [2]:
# Make sure OpenAI credentials are present before any model object is built.
if not os.environ.get("OPENAI_API_KEY"):
    key.init()
assert os.environ.get('OPENAI_API_KEY'), "OPENAI_API_KEY is still unset after key.init()"

# Catalogue of dataset entries: one row per key of the mapping read from paths.txt.
dicts = utils.get_urls('./paths.txt')
df = pd.DataFrame.from_dict(dicts, orient='index')

# Lookup tables mapping variable / temporal-resolution codes to their descriptions.
docsVAR = utils.read_dictionary_from_file('./docsVAR.txt')
docsTR = utils.read_dictionary_from_file('./docsTR.txt')

# Attach the descriptions; codes missing from the lookup tables are tagged 'UNK'.
df['def_variable']            = df['variable'].apply(lambda x: docsVAR.get(x, 'UNK'))
df['def_temporal_resolution'] = df['temporal_resolution'].apply(lambda x: docsTR.get(x, 'UNK'))

# Keep only the rows for which both descriptions are known.
dfsub = df[(df['def_temporal_resolution']!='UNK') & (df['def_variable']!='UNK')]
print(f"Subset dataframe ratio: {len(dfsub)}/{len(df)}")

# Dataframe agent that answers questions about `dfsub`.
# `handle_parsing_errors` is an AgentExecutor option, so it is forwarded through
# `agent_executor_kwargs`; passed as a bare keyword it lands in the factory's
# catch-all **kwargs and is not applied to the executor.
agent = create_pandas_dataframe_agent(
    ChatOpenAI(temperature=0, model="gpt-4-turbo"),
    dfsub,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    allow_dangerous_code=True,  # the agent executes model-written Python; trusted environments only
    return_intermediate_steps=True,
    agent_executor_kwargs={
        "handle_parsing_errors": "If you believe that the data doesn't contain the information you are looking for, please try rephrasing your question.",
    },
)

# Smoke test: one question through the freshly built agent.
print("Testing agent...")
pprint(agent.invoke(" air temperature monthly resolution?"))

Length of dictionary:  223
Subset dataframe ratio: 8/223
Testing agent...


[1m> Entering new AgentExecutor chain...[0m




[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "import pandas as pd\ndata = {'activity_id': ['ScenarioMIP', 'ScenarioMIP', 'ScenarioMIP', 'ScenarioMIP', 'ScenarioMIP'],\n        'experiment_id': ['ssp119', 'ssp119', 'ssp119', 'ssp119', 'ssp119'],\n        'temporal_resolution': ['3hr', '3hr', 'Amon', 'Amon', 'Amon'],\n        'variable': ['pr', 'tas', 'pr', 'ta', 'tas'],\n        'def_variable': ['precipitation flux. precipitation flux is the flux of water equivalent (rain or snow) reaching the land surface. This includes the liquid and solid phases of water',\n                         'air temperature near surface. air temperature near surface is the temperature of air at 2 meters above the surface of land, sea or inland waters',\n                         'precipitation flux. precipitation flux is the flux of water equivalent (rain or snow) reaching the land surface. This includes the liquid and solid phases of water',\n                         'air temperature. air tempera

In [3]:
def _parse_yes_no(answer):
    """Classify a free-text confirmation: True for yes, False for no, None if unclear."""
    # Match whole words so that e.g. "I don't know" is not read as "no".
    words = re.findall(r"[a-z]+", answer.lower())
    if "yes" in words or "y" in words:
        return True
    if "no" in words or "n" in words:
        return False
    return None


@tool
def split_and_retrieve(user_input: str):
    """Split a question about retrieving data from the CMIP dataset (by variable name and
    temporal resolution) into stand-alone queries, then retrieve the matching rows for each
    query from a pandas dataframe.

    Always pass the user's complete, unmodified request as `user_input`, even when it
    mentions several variables or temporal resolutions: the splitting is done inside this tool.

    Args:
        user_input: The full text of the user's data request.

    Returns:
        A list with one dataframe-agent result per extracted query, in extraction order.
    """
    print("We have decided to use the tool. Let's proceed.")

    llm = llm_init()

    # First pass: let the LLM break the request into stand-alone queries.
    split_chain = get_split_prompt() | llm | StrOutputParser()
    result = split_chain.invoke(user_input)
    print(result)
    pprint(utils.queries_to_list(result))

    # Interactive confirmation loop: repeat until the user accepts the queries.
    while True:
        print("Here are the individual queries that were extracted from your input:\n")
        print(result)
        decision = _parse_yes_no(input("Would you like to proceed with these queries? (yes/no)"))

        if decision is True:
            print("Great! Let's proceed.")
            break

        if decision is None:
            # Unrecognised answer: say so instead of silently asking again.
            print("Please answer 'yes' or 'no'.")
            continue

        adjustments = input("What adjustments would you like to make to the queries?")
        adjustment_chain = get_adjustment_prompt() | llm | StrOutputParser()
        # The revised queries are shown at the top of the next loop iteration.
        result = adjustment_chain.invoke({'initial_queries': result, 'adjustment': adjustments})

    print("We are now ready to proceed with the queries.")
    extracted_queries = utils.queries_to_list(result)
    pprint(extracted_queries)

    # Run each confirmed query through the module-level dataframe agent.
    final_output = []
    for query_label, query_text in extracted_queries.items():
        # The key already carries its label (e.g. 'Query 1'), so no extra prefix is added.
        print(f"{query_label}: {query_text}")
        final_output.append(agent.invoke(query_text))
        print("\n")

    print("Here are the results of your queries:")
    pprint(final_output)
    return final_output
    

In [5]:
# System message for the outer, tool-calling agent.
system_prompt = "".join(
    (
        "You are a NASA climate scientist. Answer your colleague's questions about the CMIP6 dataset, asking for clarification if needed. ",
        "You are also capable of retrieving data from the dataset. ",
    )
)

# Chat template: system message, the user's input, then the agent's scratchpad messages.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

# The only tool exposed to this agent is the split-and-retrieve tool.
tools = [split_and_retrieve]
model = ChatOpenAI(model="gpt-4-turbo", temperature=0)

splitting_agent = create_tool_calling_agent(llm=model, tools=tools, prompt=prompt)
agent_executor = AgentExecutor(
    agent=splitting_agent,
    tools=tools,
    verbose=True,
)

In [8]:
# Ask the agent a general question and print only its final answer.
result = agent_executor.invoke({"input": "what do you do?"})

print(result["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAs a NASA climate scientist, I work on analyzing climate data to understand past, present, and future climate conditions. My focus includes studying atmospheric and oceanic processes using various datasets, such as the CMIP6 dataset, to model and predict climate changes. This involves interpreting climate model outputs, comparing them with observational data, and contributing to scientific research that helps inform policy makers and the public about climate change impacts and mitigation strategies. If you have specific questions about climate data or need assistance with climate models, feel free to ask![0m

[1m> Finished chain.[0m
As a NASA climate scientist, I work on analyzing climate data to understand past, present, and future climate conditions. My focus includes studying atmospheric and oceanic processes using various datasets, such as the CMIP6 dataset, to model and predict climate changes. This involves interpret

In [9]:
# Send a two-part data request (two variables, two resolutions) and print the final answer.
result = agent_executor.invoke(
    {"input": "Im interested in air temperature near surface at a monthly resolution and precipitation flux at a resolution of a day"}
)

print(result["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `split_and_retrieve` with `{'user_input': 'air temperature near surface at a monthly resolution'}`


[0mWe have decided to use the tool. Let's proceed.
Original query: "air temperature near surface at a monthly resolution"

Split query 1: "air temperature near surface at a monthly resolution"
{'Query 1': 'air temperature near surface at a monthly resolution'}
Here are the individual queries that were extracted from your input:

Original query: "air temperature near surface at a monthly resolution"

Split query 1: "air temperature near surface at a monthly resolution"
Great! Let's proceed.
We are now ready to proceed with the queries.
{'Query 1': 'air temperature near surface at a monthly resolution'}
Query Query 1: air temperature near surface at a monthly resolution


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "import pandas as pd\ndata = {'activity_id'