## Demo: Building a DataFrame Q&A Pipeline
This notebook defines the following modules:
- Pandas prompt: infers pandas instructions from the user query
- Pandas output parser: executes the pandas instructions on the dataframe and returns the result
- Response synthesis prompt: synthesizes a final response from the query and the pandas output
 


In [5]:
!pip install llama-index llama-index-experimental


Collecting llama-index-experimental
  Downloading llama_index_experimental-0.1.3-py3-none-any.whl.metadata (814 bytes)
Downloading llama_index_experimental-0.1.3-py3-none-any.whl (11 kB)
Installing collected packages: llama-index-experimental
Successfully installed llama-index-experimental-0.1.3


In [6]:
import os

import pandas as pd
from dotenv import load_dotenv
from llama_index.core import PromptTemplate
from llama_index.core.query_pipeline import (
    QueryPipeline as QP,
    Link,
    InputComponent,
)
from llama_index.experimental.query_engine.pandas import PandasInstructionParser
from llama_index.llms.openai import OpenAI

In [8]:
# Load the obesity dataset from the working directory. The file is read as
# latin-1 rather than pandas' default utf-8.
df = pd.read_csv(
    "ObesityDataSet.csv",
    encoding="latin-1",
)
# Bare last expression: rich display of the loaded frame (pandas truncates long frames).
df

Unnamed: 0,Age,Gender,Height,Weight,Alcohol_Consulption,High_Calorie_Food,eat_vegetables,Number_of_Meals,monitor_calorie_intake,Smoke,Litres_daily_water_intake,family_history_with_overweight,Physical_activity,eat_food_between_meals,transportation_mode,Obesity_Level
0,21.000000,Female,1.620000,64.000000,no,no,2.0,3.0,no,no,2.000000,yes,1.000000,Sometimes,Public_Transportation,Normal_Weight
1,21.000000,Female,1.520000,56.000000,Sometimes,no,3.0,3.0,yes,yes,3.000000,yes,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,23.000000,Male,1.800000,77.000000,Frequently,no,2.0,3.0,no,no,2.000000,yes,1.000000,Sometimes,Public_Transportation,Normal_Weight
3,27.000000,Male,1.800000,87.000000,Frequently,no,3.0,3.0,no,no,2.000000,no,0.000000,Sometimes,Walking,Overweight_Level_I
4,22.000000,Male,1.780000,89.800000,Sometimes,no,2.0,1.0,no,no,2.000000,no,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,20.976842,Female,1.710730,131.408528,Sometimes,yes,3.0,3.0,no,no,1.728139,yes,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,21.982942,Female,1.748584,133.742943,Sometimes,yes,3.0,3.0,no,no,2.005130,yes,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,22.524036,Female,1.752206,133.689352,Sometimes,yes,3.0,3.0,no,no,2.054193,yes,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,24.361936,Female,1.739450,133.346641,Sometimes,yes,3.0,3.0,no,no,2.852339,yes,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [9]:
# Numbered rules handed to the LLM so that it answers with a single pandas
# expression. Every rule, including the last one, is terminated by a newline.
instruction_str = "".join(
    rule + "\n"
    for rule in (
        "1. Convert the query to executable Python code using Pandas.",
        "2. The final line of code should be a Python expression that can be called with the `eval()` function.",
        "3. The code should represent a solution to the query.",
        "4. PRINT ONLY THE EXPRESSION.",
        "5. Do not use quotes in the expression.",
    )
)

# Template for the first LLM call. Placeholders:
#   {df_str}          - preview of the dataframe
#   {instruction_str} - the numbered rules for producing the expression
#   {query_str}       - the user's question
# Empty entries below produce the blank lines that separate the sections.
pandas_prompt_str = "\n".join(
    [
        "You are working with a pandas dataframe in Python.",
        "The name of the dataframe is `df`.",
        "This is the result of `print(df.head())`:",
        "{df_str}",
        "",
        "Follow these instructions:",
        "{instruction_str}",
        "Query: {query_str}",
        "",
        "Expression:",
    ]
)
# Template for the second LLM call. Placeholders:
#   {query_str}           - the user's question
#   {pandas_instructions} - the pandas expression produced by the first LLM call
#   {pandas_output}       - the result of executing that expression
# Sections are separated by one blank line; the template ends with "Response: ".
response_synthesis_prompt_str = "\n\n".join(
    [
        "Given an input question, synthesize a response from the query results.\n"
        "Query: {query_str}",
        "Pandas Instructions (optional):\n{pandas_instructions}",
        "Pandas Output: {pandas_output}",
        "Response: ",
    ]
)

In [31]:
# Pandas prompt: the template with the instructions and a five-row preview of
# the dataframe already filled in, so only `query_str` remains to be supplied
# when the pipeline runs.
pandas_prompt = PromptTemplate(pandas_prompt_str).partial_format(
    instruction_str=instruction_str, df_str=df.head(5)
)

# Pandas output parser: executes the pandas instructions on `df` and returns the result.
pandas_output_parser = PandasInstructionParser(df)

# Response synthesis prompt: combines the query, the pandas instructions and
# the pandas output into the prompt for the final answer.
response_synthesis_prompt = PromptTemplate(response_synthesis_prompt_str)

# LLM configuration: load variables from a local .env file (if any), then read the key.
load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    # Fail here with an actionable message. Without this check a missing key
    # surfaces as an opaque TypeError from the os.environ assignment below.
    raise ValueError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "in the environment before running this cell."
    )
os.environ["OPENAI_API_KEY"] = api_key

llm = OpenAI(model="gpt-3.5-turbo")

In [33]:
# Inspect the partially formatted prompt: `instruction_str` and `df_str` are
# already bound in its kwargs, while `query_str` is still an open template variable.
pandas_prompt

PromptTemplate(metadata={'prompt_type': <PromptType.CUSTOM: 'custom'>}, template_vars=['df_str', 'instruction_str', 'query_str'], kwargs={'instruction_str': '1. Convert the query to executable Python code using Pandas.\n2. The final line of code should be a Python expression that can be called with the `eval()` function.\n3. The code should represent a solution to the query.\n4. PRINT ONLY THE EXPRESSION.\n5. Do not use quotes in the expression.\n', 'df_str':     Age  Gender  Height  Weight Alcohol_Consulption High_Calorie_Food  \
0  21.0  Female    1.62    64.0                  no                no   
1  21.0  Female    1.52    56.0           Sometimes                no   
2  23.0    Male    1.80    77.0          Frequently                no   
3  27.0    Male    1.80    87.0          Frequently                no   
4  22.0    Male    1.78    89.8           Sometimes                no   

   eat_vegetables  Number_of_Meals monitor_calorie_intake Smoke  \
0             2.0             

### Build Query Pipeline

In [34]:
# Register every pipeline stage under the name that the linking cell refers to.
# The same `llm` object is registered twice (as llm1 and llm2) so it can be
# used at two different points of the pipeline.
qp = QP(
    modules=dict(
        input=InputComponent(),  # default input component; receives the query string
        pandas_prompt=pandas_prompt,
        llm1=llm,  # first use of the LLM
        pandas_output_parser=pandas_output_parser,
        response_synthesis_prompt=response_synthesis_prompt,
        llm2=llm,  # second use of the same LLM
    ),
    verbose=True,
)

### The Pipeline

<img src="Pipeline.png" alt="Diagram of the query pipeline" style="width:1000px;height:250px;">

In [35]:
# Main chain: input -> pandas_prompt -> llm1 -> pandas_output_parser.
qp.add_chain(["input", "pandas_prompt", "llm1", "pandas_output_parser"])

# The response synthesis prompt has three template variables, each fed by a
# different upstream module: (source module, destination key).
qp.add_links(
    [
        Link(source, "response_synthesis_prompt", dest_key=prompt_key)
        for source, prompt_key in (
            ("input", "query_str"),
            ("llm1", "pandas_instructions"),
            ("pandas_output_parser", "pandas_output"),
        )
    ]
)

# Finally, hand the filled-in synthesis prompt to llm2.
qp.add_link("response_synthesis_prompt", "llm2")

In [38]:
# Run the pipeline end to end for a single natural-language question.
# `query_str` is the input passed to the pipeline's "input" module.
response = qp.run(
    query_str="What is the correlation between vegetable Intake and Obesity level? Show the correlation coefficient along with explanation."
    # Alternative queries to try (swap one in for the line above):
    # query_str="What is the distribution between males and females?",
    # query_str="What is the distribution of people consuming alcohol and not?"
)
# Print the response returned by the pipeline run.
print(response)

[1;3;38;2;155;135;227m> Running module input with input: 
query_str: What is the correlation between vegetable Intake and Obesity level? Show the correlation coefficient along with explanation.

[0m[1;3;38;2;155;135;227m> Running module pandas_prompt with input: 
query_str: What is the correlation between vegetable Intake and Obesity level? Show the correlation coefficient along with explanation.

[0m[1;3;38;2;155;135;227m> Running module llm1 with input: 
messages: You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
    Age  Gender  Height  Weight Alcohol_Consulption High_Calorie_Food  \...

[0m[1;3;38;2;155;135;227m> Running module pandas_output_parser with input: 
input: assistant: df['eat_vegetables'].corr(df['Obesity_Level'].astype('category').cat.codes)

[0m[1;3;38;2;155;135;227m> Running module response_synthesis_prompt with input: 
query_str: What is the correlation between vegetable Intake and