# Splitting Multi-Queries

### Importing Packages

In [2]:
from prompts import *
import os
import key
from model import llm_init
from langchain_core.output_parsers import StrOutputParser
import re
from pprint import pprint
import utils
import pandas as pd
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI, OpenAI
from langchain.agents import AgentExecutor

### Utility Functions

In [3]:
def queries_to_list(result):
    '''Parse the LLM's split-query output into a mapping of labelled queries.

    Despite the name, this returns a dict (not a list), keyed by a normalised
    ``'Query <n>'`` label.

    Each query is expected in the form ``Query <n>: "<text>"``. An optional
    ``Start`` prefix, any letter case of the word "query", and extra
    whitespace around the colon are accepted. Text that does not follow this
    shape is ignored, so output that never mentions "query" yields an empty
    dict.

    Args:
        result: Raw text produced by the LLM. ``None`` or an empty string is
            treated as "no queries".

    Returns:
        dict: ``{'Query 1': '<text>', ...}`` in order of first appearance.
        If the same number occurs more than once, the last occurrence wins.
    '''
    # Nothing to parse; avoids a TypeError from re when result is None.
    if not result:
        return {}

    # Group 1 is the query number, group 2 the text between double quotes.
    pattern = re.compile(
        r'(?:start\s+)?query\s+(\d+)\s*:\s*"([^"]+)"',
        re.IGNORECASE,
    )

    return {
        f'Query {number}': text
        for number, text in pattern.findall(result)
    }

### Setting up LLM

In [4]:
# Only run the project's key loader when the environment does not already
# provide a non-empty OPENAI_API_KEY.
if not os.environ.get("OPENAI_API_KEY"):
    key.init()  # presumably sets OPENAI_API_KEY from the local `key` module -- TODO confirm
    # Fail fast if the key is still missing after initialisation.
    assert os.environ.get('OPENAI_API_KEY')
    
llm = llm_init()  # project helper from `model`; the result is piped into the chains built below

# This is for reference
test_input = "Im interested in air temperature near surface at a monthly resolution and precipitation flux at daily resolution"


### Greet the user and ask for an input

In [5]:
# initial_query = input("Hello! please make any inquiries you have about the CMIP dataset. You may make multiple inquiries at once.")
# print('You asked: ', initial_query)

### Have LLM split queries

In [6]:
#TODO: The word "query" needs to be present in output otherwise the dictionary will be empty
# Build the splitting chain: prompt -> LLM -> plain-string output.
prompt = get_split_prompt()  # presumably provided by the `from prompts import *` wildcard import
chain = prompt | llm | StrOutputParser()
# Runs on the reference input; the interactive input() cell above is commented out.
result = chain.invoke(test_input)
print(result)

Split queries:
Query 1: "I'm interested in air temperature near surface at a monthly resolution."
Query 2: "I'm interested in precipitation flux at daily resolution."


### Extract queries

In [7]:
# Parse the raw LLM text into a {'Query <n>': text} dict and show it.
queries = queries_to_list(result)

pprint(queries)

{'Query 1': "I'm interested in air temperature near surface at a monthly "
            'resolution.',
 'Query 2': "I'm interested in precipitation flux at daily resolution."}


### Confirm split queries with user

In [8]:
# Confirmation loop: show the split queries and keep asking until the user
# accepts them. A "no" answer sends the requested adjustments back to the LLM
# and the revised queries are shown on the next pass.
YES_WORDS = {'y', 'yes', 'yeah', 'yep'}
NO_WORDS = {'n', 'no', 'nope'}

satisfied = False

while not satisfied:
    print("Here are the individual queries that were extracted from your input:\n")
    print(result)
    yes_no = input("Would you like to proceed with these queries? (yes/no)")

    # Compare whole words rather than substrings: a substring test treats
    # answers such as "I don't know" or "not sure" as "no".
    answer_words = set(re.findall(r"[a-z]+", yes_no.lower()))

    if answer_words & YES_WORDS:
        print("Great! Let's proceed.")
        satisfied = True

    elif answer_words & NO_WORDS:
        adjustments = input("What adjustments would you like to make to the queries?")
        adjustments_prompt = get_adjustment_prompt()
        chain = adjustments_prompt | llm | StrOutputParser()
        result = chain.invoke({'initial_queries':result, 'adjustment':adjustments})
        # No print here: the revised queries are displayed at the top of the
        # next iteration, so printing them now would show them twice.

    else:
        # Tell the user why the same question is being asked again instead
        # of silently re-printing the queries.
        print("Sorry, I didn't understand that. Please answer 'yes' or 'no'.\n")

print("We are now ready to proceed with the queries.")

Here are the individual queries that were extracted from your input:

Split queries:
Query 1: "I'm interested in air temperature near surface at a monthly resolution."
Query 2: "I'm interested in precipitation flux at daily resolution."
Here are the individual queries that were extracted from your input:

Split queries:
Query 1: "I'm interested in air temperature near surface at a monthly resolution."
Query 2: "I'm interested in precipitation flux at daily resolution."
Here are the individual queries that were extracted from your input:

Split queries:
Query 1: "I'm interested in air temperature near surface at a monthly resolution."
Query 2: "I'm interested in precipitation flux at daily resolution."
Here are the individual queries that were extracted from your input:

Split queries:
Query 1: "I'm interested in air temperature near surface at a monthly resolution."
Query 2: "I'm interested in precipitation flux at daily resolution."
Here are the individual queries that were extracted 

KeyboardInterrupt: Interrupted by user

In [8]:
# Re-parse `result`: the confirmation loop may have replaced it with adjusted queries.
extracted_queries = queries_to_list(result)
pprint(extracted_queries)

{'Query 1': "I'm interested in air temperature near surface at a monthly "
            'resolution.',
 'Query 2': "I'm interested in precipitation flux at daily resolution."}


In [9]:
# Load the dataset metadata listed in paths.txt.
# Presumably a dict of dicts keyed by dataset URL -- TODO confirm against utils.get_urls.
dicts = utils.get_urls('./paths.txt')
# orient='index' turns each top-level key into a row label and each inner key into a column.
df = pd.DataFrame.from_dict(dicts, orient='index')

Length of dictionary:  223


In [10]:
# Lookup tables used to describe the raw `variable` and
# `temporal_resolution` codes.
docsVAR = utils.read_dictionary_from_file('./docsVAR.txt')
docsTR = utils.read_dictionary_from_file('./docsTR.txt')

# Attach one description column per coded column. Codes that are missing
# from the lookup table are labelled 'UNK'.
df['def_variable'] = df['variable'].map(
    lambda code: docsVAR.get(code, 'UNK')
)
df['def_temporal_resolution'] = df['temporal_resolution'].map(
    lambda code: docsTR.get(code, 'UNK')
)

In [11]:
# Keep only the rows where both the variable and the temporal resolution
# have a known description, i.e. neither column holds the 'UNK' placeholder.
dfsub = df.loc[
    ~((df['def_temporal_resolution'] == 'UNK') | (df['def_variable'] == 'UNK'))
]
print(f"Subset dataframe ratio: {len(dfsub)}/{len(df)}")
dfsub

Subset dataframe ratio: 8/223


Unnamed: 0,activity_id,experiment_id,temporal_resolution,variable,def_variable,def_temporal_resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/3hr/pr/gn/v20200115,ScenarioMIP,ssp119,3hr,pr,precipitation flux. precipitation flux is the ...,three (3) hour resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/3hr/tas/gn/v20200115,ScenarioMIP,ssp119,3hr,tas,air temperature near surface. air temperature ...,three (3) hour resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/pr/gn/v20200115,ScenarioMIP,ssp119,Amon,pr,precipitation flux. precipitation flux is the ...,"monthly resolution, for atmospheric variables"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/ta/gn/v20200115,ScenarioMIP,ssp119,Amon,ta,air temperature. air temperature is the temper...,"monthly resolution, for atmospheric variables"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/tas/gn/v20200115,ScenarioMIP,ssp119,Amon,tas,air temperature near surface. air temperature ...,"monthly resolution, for atmospheric variables"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/pr/gn/v20200115,ScenarioMIP,ssp119,day,pr,precipitation flux. precipitation flux is the ...,"daily resolution, measured every day"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/ta/gn/v20200115,ScenarioMIP,ssp119,day,ta,air temperature. air temperature is the temper...,"daily resolution, measured every day"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/tas/gn/v20200115,ScenarioMIP,ssp119,day,tas,air temperature near surface. air temperature ...,"daily resolution, measured every day"


In [24]:
# Build an OpenAI-functions agent that answers questions about `dfsub` by
# running pandas code. allow_dangerous_code=True is passed because the agent
# executes model-written Python (see `python_repl_ast` in the verbose log).
agent = create_pandas_dataframe_agent(
    ChatOpenAI(temperature=0, model="gpt-3.5-turbo"),
    dfsub,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    allow_dangerous_code=True,
    return_intermediate_steps=True,
    # handle_parsing_errors is an AgentExecutor option, so it has to be
    # forwarded through agent_executor_kwargs. Passed as a bare keyword it
    # ends up in the factory's catch-all **kwargs instead.
    # NOTE(review): recent langchain_experimental documents those **kwargs
    # as unused -- confirm against the installed version.
    agent_executor_kwargs={
        "handle_parsing_errors": "If you believe that the data doesn't contain the information you are looking for, please try rephrasing your question.",
        #"handle_parsing_errors": "If your output has an error, make sure strings are enclosed in double quotes, and line breaks within strings are properly escaped."
    },
    # NOTE(review): response_format is not a named parameter of this factory
    # and is probably ignored here; if JSON mode is wanted it likely belongs
    # on the ChatOpenAI model instead -- confirm before relying on it.
    response_format={ "type": "json_object" }
)

# Smoke test: one free-form question to check the agent can query the dataframe.
print("Testing agent...")
pprint(agent.invoke(" air temperature monthly resolution?"))



Testing agent...


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df[df['variable'].str.contains('ta') & df['temporal_resolution'].str.contains('Amon')].head()"}`


[0m[36;1m[1;3m                                                    activity_id experiment_id  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...  ScenarioMIP        ssp119   
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...  ScenarioMIP        ssp119   

                                                   temporal_resolution  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...                Amon   
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...                Amon   

                                                   variable  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...       ta   
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...      tas   

                                                                                         def_variable  

In [25]:
# Run each extracted query through the dataframe agent and collect the raw
# agent responses (including intermediate steps) in `final_output`.
final_output = []

# Skip the first query because it is the initial query.
# NOTE(review): with the split output shown in this notebook the first entry
# ('Query 1') is itself a real sub-query, so this skip drops it -- confirm the
# skip is still wanted.
pending_queries = list(extracted_queries.items())[1:]

for query_number, query_text in pending_queries:
    # query_number already reads 'Query <n>'; prefixing it with "Query " again
    # printed "Query Query 2: ...".
    print(f"{query_number}: {query_text}")
    final_output.append(agent.invoke(query_text))
    print("\n")

print("Here are the results of your queries:")
pprint(final_output)

Query Query 2: I'm interested in precipitation flux at daily resolution.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df[df['temporal_resolution']=='day'][df['variable']=='pr']"}`


[0m[36;1m[1;3m                                                    activity_id experiment_id  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...  ScenarioMIP        ssp119   

                                                   temporal_resolution  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...                 day   

                                                   variable  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...       pr   

                                                                                         def_variable  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...  precipitation flux. precipitation flux is the ...   

                                                                 def_temporal_resolution 



[32;1m[1;3mThe precipitation flux data at daily resolution is available with the following details:
- **Activity ID:** ScenarioMIP
- **Experiment ID:** ssp119
- **Temporal Resolution:** Daily
- **Variable:** precipitation flux
- **Definition of Variable:** Precipitation flux is the flux of water equivalent (rain or snow) reaching the land surface. This includes the liquid and solid phases of water
- **Definition of Temporal Resolution:** Daily resolution, measured every day[0m

[1m> Finished chain.[0m


Here are the results of your queries:
[{'input': "I'm interested in precipitation flux at daily resolution.",
  'intermediate_steps': [(AgentActionMessageLog(tool='python_repl_ast', tool_input={'query': "df[df['temporal_resolution']=='day'][df['variable']=='pr']"}, log='\nInvoking: `python_repl_ast` with `{\'query\': "df[df[\'temporal_resolution\']==\'day\'][df[\'variable\']==\'pr\']"}`\n\n\n', message_log=[AIMessageChunk(content='', additional_kwargs={'function_call': {'arguments'

In [22]:
import pandas as pd  # redundant: pandas is already imported as pd in the imports cell

# Filter the dataframe based on the desired variable and temporal resolution
# (here `pr` at `3hr`, applied to the full `df` rather than the `dfsub` subset)
filtered_df = df[(df['variable'] == 'pr') & (df['temporal_resolution'] == '3hr')]

# Display the filtered dataframe

filtered_df

Unnamed: 0,activity_id,experiment_id,temporal_resolution,variable,def_variable,def_temporal_resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/3hr/pr/gn/v20200115,ScenarioMIP,ssp119,3hr,pr,precipitation flux. precipitation flux is the ...,three (3) hour resolution


In [23]:
# Select the `pr` rows of dfsub whose temporal_resolution contains 'day'.
dfsub[(dfsub['variable'] == 'pr') & (dfsub['temporal_resolution'].str.contains('day'))]

Unnamed: 0,activity_id,experiment_id,temporal_resolution,variable,def_variable,def_temporal_resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/pr/gn/v20200115,ScenarioMIP,ssp119,day,pr,precipitation flux. precipitation flux is the ...,"daily resolution, measured every day"
