# Splitting Multi-Queries

### Importing Packages

In [1]:
from prompts import *
import os
import key
from model import llm_init
from langchain_core.output_parsers import StrOutputParser
import re
from pprint import pprint
import utils
import pandas as pd
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI, OpenAI
from langchain.agents import AgentExecutor

### Utility Functions

In [2]:
def queries_to_list(result):
    '''Parse the LLM's split-query text into a dict of individual queries.

    Despite the historical name, the return value is a dict, not a list.

    Each recognised item has the form ``<anything>query <N>: <text>`` where the
    word "query" is matched case-insensitively (so "Query 1", "Split query 1",
    "Start query 1" and "QUERY 1" all work). ``<text>`` may be:

    * wrapped in straight double quotes (may span several lines),
    * wrapped in curly double quotes, or
    * unquoted, in which case it runs to the end of the line.

    Args:
        result: Raw string produced by the LLM. ``None`` or an empty string is
            treated as "no queries".

    Returns:
        dict mapping ``'Query <N>'`` to the query text, in order of appearance.
        If the same number appears more than once, the last occurrence wins.
        Returns an empty dict when nothing matches.
    '''
    # Empty or missing LLM output simply yields no queries.
    if not result:
        return {}

    pattern = re.compile(
        r'query\s+(\d+)\s*:\s*'
        r'(?:"([^"]+)"'                    # straight double quotes
        r'|\u201c([^\u201d]+)\u201d'       # curly double quotes
        r'|([^\s"\u201c][^\n]*))',         # unquoted: rest of the line
        re.IGNORECASE,
    )

    queries = {}
    for match in pattern.finditer(result):
        number, straight, curly, bare = match.groups()
        if straight is not None:
            # Quoted text is kept verbatim.
            query_text = straight
        elif curly is not None:
            query_text = curly
        else:
            # Unquoted text has no closing delimiter, so drop trailing spaces.
            query_text = bare.rstrip()
        queries[f'Query {number}'] = query_text

    return queries

### Setting up LLM

In [8]:
# Make sure an OpenAI API key is available before building the LLM.
# When the environment variable is missing or empty, fall back to the local
# `key` helper (presumably it sets OPENAI_API_KEY -- TODO confirm) and then
# verify that the variable is now populated.
if not os.environ.get("OPENAI_API_KEY"):
    key.init()
    assert os.environ.get('OPENAI_API_KEY')
    
# LLM instance used by the chains built later in this notebook.
llm = llm_init()

# Hard-coded sample request, used in place of interactive user input.
test_input = "Im interested in air temperature near surface at a monthly resolution and precipitation flux at a resolution of a day"


### Greet the user and ask for an input

In [9]:
# initial_query = input("Hello! please make any inquiries you have about the CMIP dataset. You may make multiple inquiries at once.")
# print('You asked: ', initial_query)

### Have LLM split queries

In [10]:
# TODO: queries_to_list() only recognises items that contain the word "query"
# followed by a number; if the LLM output omits it, the extracted dictionary
# will be empty.
# Build the chain: split prompt -> LLM -> plain-string output.
prompt = get_split_prompt()
chain = prompt | llm | StrOutputParser()
# `result` holds the raw text returned by the LLM; later cells parse it.
result = chain.invoke(test_input)
print(result)

Split query 1: "I'm interested in air temperature near surface at a monthly resolution."
Split query 2: "I'm interested in precipitation flux at a resolution of a day."


### Extract queries

In [11]:
# Parse the raw LLM text into a dict keyed by 'Query <N>'.
queries = queries_to_list(result)

pprint(queries)

{'Query 1': "I'm interested in air temperature near surface at a monthly "
            'resolution.',
 'Query 2': "I'm interested in precipitation flux at a resolution of a day."}


### Confirm split queries with user

In [12]:
# Ask the user to confirm the split queries, letting the LLM revise them
# until the user accepts. `result` always holds the latest query text.
satisfied = False

while not satisfied:
    print("Here are the individual queries that were extracted from your input:\n")
    print(result)
    yes_no = input("Would you like to proceed with these queries? (yes/no)")

    # Decide on the first letter of the normalised reply. Substring checks
    # are unreliable here: "No, query 2 should say yesterday" contains "yes",
    # and "I don't know" contains "no".
    answer = yes_no.strip().lower()

    if answer.startswith('y'):
        print("Great! Let's proceed.")
        satisfied = True

    elif answer.startswith('n'):
        adjustments = input("What adjustments would you like to make to the queries?")
        adjustments_prompt = get_adjustment_prompt()
        chain = adjustments_prompt | llm | StrOutputParser()
        # The revised queries are shown at the top of the next iteration,
        # so they are not printed a second time here.
        result = chain.invoke({'initial_queries': result, 'adjustment': adjustments})

    else:
        # Give feedback instead of silently repeating the question.
        print("Sorry, I didn't understand that. Please answer 'yes' or 'no'.")

print("We are now ready to proceed with the queries.")

Here are the individual queries that were extracted from your input:

Split query 1: "I'm interested in air temperature near surface at a monthly resolution."
Split query 2: "I'm interested in precipitation flux at a resolution of a day."
Great! Let's proceed.
We are now ready to proceed with the queries.


In [13]:
# Re-parse `result`, which may have been revised during the confirmation step,
# into the final dict of queries to run.
extracted_queries = queries_to_list(result)
pprint(extracted_queries)

{'Query 1': "I'm interested in air temperature near surface at a monthly "
            'resolution.',
 'Query 2': "I'm interested in precipitation flux at a resolution of a day."}


In [14]:
# Load the dataset catalogue from ./paths.txt.
# NOTE(review): presumably utils.get_urls returns a dict keyed by dataset URL
# whose values are dicts of metadata fields -- confirm against utils.
dicts = utils.get_urls('./paths.txt')
# orient='index': each top-level dict key becomes a row label.
df = pd.DataFrame.from_dict(dicts, orient='index')

Length of dictionary:  223


In [15]:
# Lookup tables read from disk; used below as dicts mapping a short code to a
# text definition (variable codes and temporal-resolution codes respectively).
docsVAR = utils.read_dictionary_from_file('./docsVAR.txt')
docsTR = utils.read_dictionary_from_file('./docsTR.txt')

# Attach the definition for each row's code; codes missing from the lookup
# table are marked with the sentinel 'UNK'.
df['def_variable']            = df['variable'].apply(lambda x: docsVAR.get(x, 'UNK'))
df['def_temporal_resolution'] = df['temporal_resolution'].apply(lambda x: docsTR.get(x, 'UNK'))

In [16]:
# Keep only the rows for which both the temporal-resolution code and the
# variable code were given a definition (i.e. neither is the 'UNK' sentinel).
known_resolution = df['def_temporal_resolution'].ne('UNK')
known_variable = df['def_variable'].ne('UNK')
dfsub = df[known_resolution & known_variable]
print(f"Subset dataframe ratio: {len(dfsub)}/{len(df)}")
dfsub

Subset dataframe ratio: 8/223


Unnamed: 0,activity_id,experiment_id,temporal_resolution,variable,def_variable,def_temporal_resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/3hr/pr/gn/v20200115,ScenarioMIP,ssp119,3hr,pr,precipitation flux. precipitation flux is the ...,three (3) hour resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/3hr/tas/gn/v20200115,ScenarioMIP,ssp119,3hr,tas,air temperature near surface. air temperature ...,three (3) hour resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/pr/gn/v20200115,ScenarioMIP,ssp119,Amon,pr,precipitation flux. precipitation flux is the ...,"monthly resolution, for atmospheric variables"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/ta/gn/v20200115,ScenarioMIP,ssp119,Amon,ta,air temperature. air temperature is the temper...,"monthly resolution, for atmospheric variables"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/tas/gn/v20200115,ScenarioMIP,ssp119,Amon,tas,air temperature near surface. air temperature ...,"monthly resolution, for atmospheric variables"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/pr/gn/v20200115,ScenarioMIP,ssp119,day,pr,precipitation flux. precipitation flux is the ...,"daily resolution, measured every day"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/ta/gn/v20200115,ScenarioMIP,ssp119,day,ta,air temperature. air temperature is the temper...,"daily resolution, measured every day"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/tas/gn/v20200115,ScenarioMIP,ssp119,day,tas,air temperature near surface. air temperature ...,"daily resolution, measured every day"


In [21]:
# Build a pandas-dataframe agent over the documented subset of the catalogue.
agent = create_pandas_dataframe_agent(
    ChatOpenAI(temperature=0, model="gpt-4-turbo"),
    dfsub,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    # SECURITY: this lets the agent execute model-generated Python (via the
    # python_repl_ast tool) in this kernel. Only use with trusted input.
    allow_dangerous_code=True,
    return_intermediate_steps=True,
    # Executor options must go through agent_executor_kwargs; unknown
    # top-level keyword arguments to create_pandas_dataframe_agent are
    # accepted but ignored, so handle_parsing_errors had no effect there.
    agent_executor_kwargs={
        "handle_parsing_errors": "If you believe that the data doesn't contain the information you are looking for, please try rephrasing your question.",
        # Alternative message:
        # "If your output has an error, make sure strings are enclosed in double quotes, and line breaks within strings are properly escaped."
    },
)

# Smoke test: run one deliberately terse question through the agent.
print("Testing agent...")
pprint(agent.invoke(" air temperature monthly resolution?"))



Testing agent...


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "import pandas as pd\ndata = {'': ['portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/3hr/pr/gn/v20200115', 'portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/3hr/tas/gn/v20200115', 'portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/pr/gn/v20200115', 'portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/ta/gn/v20200115', 'portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/tas/gn/v20200115'], 'activity_id': ['ScenarioMIP', 'ScenarioMIP', 'ScenarioMIP', 'ScenarioMIP', 'ScenarioMIP'], 'experiment_id': ['ssp119', 'ssp119', 'ssp119', 'ssp119', 'ssp119'], 'temporal_resolution': ['3hr', '3hr', 'Amon', 'Amon', 'Amon'], 'variable': ['pr', 'tas', 'pr', 'ta',

In [22]:
# Run every confirmed query through the agent and collect the raw responses.
final_output = []
for query_label, query_text in extracted_queries.items():
    # Keys already look like "Query 1", so don't prepend another "Query".
    print(f"{query_label}: {query_text}")
    final_output.append(agent.invoke(query_text))
    print("\n")

print("Here are the results of your queries:")
pprint(final_output)

Query Query 1: I'm interested in air temperature near surface at a monthly resolution.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df[(df['variable'] == 'tas') & (df['temporal_resolution'] == 'Amon')]"}`
responded: To find the data related to air temperature near the surface at a monthly resolution, we can filter the dataframe `df` based on the `variable` column for "tas" (which stands for air temperature near surface) and the `temporal_resolution` column for "Amon" (which indicates monthly resolution). Let's perform this filtering to retrieve the relevant rows.

[0m[36;1m[1;3m                                                      activity_id  \
4  portal.nccs.nasa.gov/datashare/giss_cmip6/Scen...  ScenarioMIP   

  experiment_id temporal_resolution variable  \
4        ssp119                Amon      tas   

                                        def_variable  \
4  air temperature near surface. air temperature ...   

 

In [23]:
import pandas as pd

# Manual check against the full catalogue: select the rows whose variable
# code is 'pr' and whose temporal-resolution code is exactly 'day'.
is_pr_variable = df['variable'].eq('pr')
is_day_resolution = df['temporal_resolution'].eq('day')
filtered_df = df[is_pr_variable & is_day_resolution]

# Show the matching rows.
filtered_df

Unnamed: 0,activity_id,experiment_id,temporal_resolution,variable,def_variable,def_temporal_resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/pr/gn/v20200115,ScenarioMIP,ssp119,day,pr,precipitation flux. precipitation flux is the ...,"daily resolution, measured every day"


In [25]:
# Same check on the documented subset, this time matching any
# temporal-resolution code that contains 'day'.
pr_rows = dfsub['variable'].eq('pr')
day_like_rows = dfsub['temporal_resolution'].str.contains('day')
dfsub[pr_rows & day_like_rows]

Unnamed: 0,activity_id,experiment_id,temporal_resolution,variable,def_variable,def_temporal_resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/pr/gn/v20200115,ScenarioMIP,ssp119,day,pr,precipitation flux. precipitation flux is the ...,"daily resolution, measured every day"
