# Setup

In [1]:
import pandas as pd
import os

# Text file that the next cell reads line by line to collect dataset URLs.
path = "./paths.txt"

In [2]:
# Index every dataset listed in `path` by its version-directory URL and record
# the metadata found at fixed positions of the '/'-separated URL.
dicts = {}
with open(path, 'r') as file:
    # Iterate the file object directly so the listing is streamed rather than
    # loaded into memory in one go by readlines().
    for line in file:
        # Only lines that contain a '/v20…' segment and reference an .html page
        # are treated as dataset entries.
        if '/v20' not in line or '.html' not in line:
            continue
        # Strip just the line terminator. Slicing off the last character would
        # eat a real character on a final line that has no trailing newline.
        url = line.rstrip('\r\n')
        lsplit = url.split('/')
        url_clean = '/'.join(lsplit[:-1])  # so they go up to and including the version directory
        # Keyed by URL, so repeated URLs collapse into a single entry.
        dicts[url_clean] = {
            'activity_id': lsplit[3],
            'experiment_id': lsplit[6],
            'temporal_resolution': lsplit[8],
            'variable': lsplit[9],
        }

len(dicts)

223

In [3]:
# One row per dataset URL: the outer keys of `dicts` become the index and the
# inner metadata keys become the columns.
df = pd.DataFrame.from_dict(data=dicts, orient="index")
df.head(n=5)

Unnamed: 0,activity_id,experiment_id,temporal_resolution,variable
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Omon/htovovrt/gn/v20200115,ScenarioMIP,ssp119,Omon,htovovrt
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Omon/msftbarot/gn/v20200115,ScenarioMIP,ssp119,Omon,msftbarot
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Omon/vo/gn/v20200115,ScenarioMIP,ssp119,Omon,vo
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Omon/sfdsi/gn/v20220812,ScenarioMIP,ssp119,Omon,sfdsi
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Omon/sltovovrt/gn/v20200115,ScenarioMIP,ssp119,Omon,sltovovrt


In [4]:
# Show every dataset whose variable is exactly 'ta'.
df.loc[df['variable'].eq('ta')]

Unnamed: 0,activity_id,experiment_id,temporal_resolution,variable
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/6hrPlevPt/ta/gn/v20200115,ScenarioMIP,ssp119,6hrPlevPt,ta
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/ta/gn/v20200115,ScenarioMIP,ssp119,Amon,ta
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/6hrLev/ta/gn/v20200115,ScenarioMIP,ssp119,6hrLev,ta
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/ta/gn/v20200115,ScenarioMIP,ssp119,day,ta


In [5]:
# Plain-language definitions for the variable codes we want to expose.
# Any code missing from this glossary is labelled 'UNK' below.
docsVAR = {
    'ta': 'air temperature. air temperature is the temperature in the atmosphere. It has units of Kelvin (K). Temperature measured in kelvin can be converted to degrees Celsius (°C) by subtracting 273.15. This parameter is available on multiple levels through the atmosphere',
    'tas': 'air temperature near surface. air temperature near surface is the temperature of air at 2 meters above the surface of land, sea or inland waters',
    'pr': 'precipitation flux. precipitation flux is the flux of water equivalent (rain or snow) reaching the land surface. This includes the liquid and solid phases of water',
}
# Plain-language definitions for the temporal-resolution codes.
docsTR = {
    'Amon': 'monthly resolution, for atmospheric variables',
    '3hr': 'three (3) hour resolution',
    'day': 'daily resolution, measured every day',
    'AERmon': 'monthly resolution, for aerosol variables',
}
# Attach the definitions as new columns on `df`; codes without a glossary
# entry map to NaN and are then replaced by the 'UNK' sentinel.
df['def_variable'] = df['variable'].map(docsVAR).fillna('UNK')
df['def_temporal_resolution'] = df['temporal_resolution'].map(docsTR).fillna('UNK')

In [6]:
# Keep only the rows for which both glossary lookups produced a definition
# (i.e. neither definition column holds the 'UNK' sentinel).
dfsub = df.loc[
    df['def_temporal_resolution'].ne('UNK') & df['def_variable'].ne('UNK')
]
dfsub

Unnamed: 0,activity_id,experiment_id,temporal_resolution,variable,def_variable,def_temporal_resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/3hr/pr/gn/v20200115,ScenarioMIP,ssp119,3hr,pr,precipitation flux. precipitation flux is the ...,three (3) hour resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/3hr/tas/gn/v20200115,ScenarioMIP,ssp119,3hr,tas,air temperature near surface. air temperature ...,three (3) hour resolution
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/pr/gn/v20200115,ScenarioMIP,ssp119,Amon,pr,precipitation flux. precipitation flux is the ...,"monthly resolution, for atmospheric variables"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/ta/gn/v20200115,ScenarioMIP,ssp119,Amon,ta,air temperature. air temperature is the temper...,"monthly resolution, for atmospheric variables"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/tas/gn/v20200115,ScenarioMIP,ssp119,Amon,tas,air temperature near surface. air temperature ...,"monthly resolution, for atmospheric variables"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/pr/gn/v20200115,ScenarioMIP,ssp119,day,pr,precipitation flux. precipitation flux is the ...,"daily resolution, measured every day"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/ta/gn/v20200115,ScenarioMIP,ssp119,day,ta,air temperature. air temperature is the temper...,"daily resolution, measured every day"
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/day/tas/gn/v20200115,ScenarioMIP,ssp119,day,tas,air temperature near surface. air temperature ...,"daily resolution, measured every day"


# LangChain one-off

In [7]:
# Only fall back to the local `key` module when the environment does not
# already carry a non-empty OPENAI_API_KEY.
# NOTE(review): `key.init()` is presumably what exports the variable; the
# assert below fails fast if it did not. Confirm against the `key` module.
if not os.getenv("OPENAI_API_KEY"):
    import key
    key.init()
    assert os.getenv('OPENAI_API_KEY')

In [8]:
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI, OpenAI

# DataFrame agent over the documented subset `dfsub`.
# allow_dangerous_code=True is required because the agent answers by running
# model-generated Python against the frame (see the `python_repl_ast` calls in
# the verbose trace), so only use it with trusted prompts and data.
agent = create_pandas_dataframe_agent(
    ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0),
    dfsub,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    verbose=True,
    allow_dangerous_code=True,
)

In [9]:
# Single-shot question answered directly by the DataFrame agent (no chat history).
one_off_question = " air temperature monthly resolution?"
agent.invoke(one_off_question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df[df['def_variable'].str.contains('air temperature') & df['def_temporal_resolution'].str.contains('monthly')]"}`


[0m[36;1m[1;3m                                                    activity_id experiment_id  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...  ScenarioMIP        ssp119   
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...  ScenarioMIP        ssp119   

                                                   temporal_resolution  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...                Amon   
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...                Amon   

                                                   variable  \
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...       ta   
portal.nccs.nasa.gov/datashare/giss_cmip6/Scena...      tas   

                                                                                         def_variable  

{'input': ' air temperature monthly resolution?',
 'output': 'The dataframe `df` contains the following information about air temperature at monthly resolution:\n\n|                                                                                                                   | activity_id   | experiment_id   | temporal_resolution   | variable   | def_variable                                                                                                                                                                                                                                                            | def_temporal_resolution                       |\n|:------------------------------------------------------------------------------------------------------------------|:--------------|:----------------|:----------------------|:-----------|:-----------------------------------------------------------------------------------------------------------------------------------------------

# LangChain conversational

In [7]:
import openai
import chromadb
import gradio as gr
import os
import json

from langchain.chains.query_constructor.base import AttributeInfo
# from langchain.retrievers.self_query.base import SelfQueryRetriever
# from langchain.chains import RetrievalQA
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, HumanMessage
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_retrieval_chain, create_history_aware_retriever
from langchain.chains.combine_documents import create_stuff_documents_chain

# Same credential bootstrap as in the one-off section: use the local `key`
# module only when OPENAI_API_KEY is missing or empty in the environment.
# NOTE(review): assumes `key.init()` sets the variable; verify in `key`.
if not os.getenv("OPENAI_API_KEY"):
    import key
    key.init()
    assert os.getenv('OPENAI_API_KEY')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Sample conversation used to exercise the pipeline: one earlier exchange plus
# a follow-up question ("that") which only makes sense given the history.
conversation = {
    'input': 'show me URLs for that at monthly resolution',
    'chat_history': [
        HumanMessage(content='what is temperature'),
        # The degree sign was previously stored as mis-decoded text ("째C").
        AIMessage(content='Temperature is a measure of the average kinetic energy of the particles in a substance. In the context of climate science, temperature refers to the degree of hotness or coldness of the atmosphere, land, or water. It is typically measured in Kelvin (K) and can be converted to degrees Celsius (°C) by subtracting 273.15.'),
    ],
}

## 1. Distill chat history into one standalone query
Reference: LangChain how-to guide on chaining runnables — https://python.langchain.com/v0.2/docs/how_to/sequence/

In [9]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# System instruction for the "distill" step: compress the chat history plus the
# latest question into a short keyword query, without answering it.
# NOTE(review): for the sample conversation the recorded result is
# 'show URLs monthly resolution', which drops the topic ("temperature")
# introduced earlier in the history. Consider tightening this prompt.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might reference "
    "context in the chat history, summarize the user's request in a few key "
    "words without punctuation. Do NOT attempt to answer the user's request. "
)

# Prompt layout: system instruction, then the prior messages, then the new question.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# temperature=0 to keep the summarisation as repeatable as the API allows.
model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0125")

# prompt -> chat model -> plain string
chain_standalone = contextualize_q_prompt | model | StrOutputParser()

standalone = chain_standalone.invoke(conversation)
standalone

'show URLs monthly resolution'

## 2. Give standalone query to Pandas DataFrame agent

In [10]:
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent

# Rebuild the DataFrame agent, this time keeping the intermediate tool calls so
# that the raw pandas result can be recovered from `out` afterwards.
# allow_dangerous_code=True: the agent executes model-generated Python.
agent = create_pandas_dataframe_agent(
    ChatOpenAI(model="gpt-3.5-turbo-0613", temperature=0),
    dfsub,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    return_intermediate_steps=True,
    verbose=True,
    allow_dangerous_code=True,
)

# Feed the distilled standalone query (not the raw conversation) to the agent.
out = agent.invoke(standalone)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "df[df['def_temporal_resolution'].str.contains('monthly')]['activity_id']"}`


[0m[36;1m[1;3mportal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/pr/gn/v20200115     ScenarioMIP
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/ta/gn/v20200115     ScenarioMIP
portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/tas/gn/v20200115    ScenarioMIP
Name: activity_id, dtype: object[0m[32;1m[1;3mThe URLs with monthly resolution are:

1. portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/pr/gn/v20200115
2. portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/ta/gn/v20200115
3. portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p

## 3. Pass message history and DataFrame result to conversational response LLM

In [11]:
def get_rag_system_prompt(df_ans=None):
    """Build the system prompt for the conversational response model.

    Parameters
    ----------
    df_ans : object, optional
        Result of the DataFrame query to ground the answer in. Objects that
        expose ``to_markdown()`` (e.g. a pandas DataFrame or Series) are
        rendered with it; any other value (a scalar, an error string, ...) is
        embedded via ``str()`` instead of raising ``AttributeError``.
        ``None`` (the default) means there is no query result.

    Returns
    -------
    str
        A generic "answer from the chat history" instruction when ``df_ans``
        is ``None``, otherwise an instruction that embeds the rendered result.
    """
    if df_ans is None:
        return """
            Given a chat history and the latest user query, which may reference context
            in the chat history, answer the user.
            """

    # Agent observations are not guaranteed to be DataFrames, so only call
    # to_markdown() when the object actually provides it.
    to_markdown = getattr(df_ans, "to_markdown", None)
    rendered = to_markdown() if callable(to_markdown) else str(df_ans)

    return f"""
            You have access to a dataframe `df`. Here is the output of `df.to_markdown()`: 
            {rendered}

            Given a chat history and the latest user query, which may reference context
            in the chat history, use information in DataFrame `df` to best answer the user.
        """


# Response prompt: the system text is injected at call time through the
# `prompt` variable, followed by the prior messages and the latest user input.
rag_prompt = ChatPromptTemplate.from_messages([
    ("system", "{prompt}"),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# prompt -> chat model -> plain string
chain = rag_prompt | model | StrOutputParser()

In [12]:
# Each intermediate step is indexed as (action, observation); take the
# observation of the first tool call as the DataFrame result. If the agent
# answered without calling a tool the list is empty, so fall back to None,
# which makes get_rag_system_prompt() return its history-only prompt.
steps = out['intermediate_steps']
df_ans = steps[0][1] if steps else None

rag_system_prompt = get_rag_system_prompt(df_ans)
# Extend the conversation payload with the system prompt expected by `rag_prompt`.
chain.invoke({**conversation, 'prompt': rag_system_prompt})


'Based on the information in the DataFrame `df`, here are URLs for temperature data at monthly resolution:\n\n1. [portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/ta/gn/v20200115](portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/ta/gn/v20200115) - This URL provides data for temperature (ta) at monthly resolution.\n\n2. [portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/tas/gn/v20200115](portal.nccs.nasa.gov/datashare/giss_cmip6/ScenarioMIP/NASA-GISS/GISS-E2-1-G/ssp119/r1i1p1f2/Amon/tas/gn/v20200115) - This URL provides data for surface air temperature (tas) at monthly resolution.\n\nThese URLs lead to data sources where you can access temperature data at monthly resolution.'