In [1]:
import os

import pandas as pd
from dotenv import load_dotenv
from langchain.agents import AgentExecutor
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain_experimental.tools.python.tool import PythonAstREPLTool
from langchain_openai import OpenAI, ChatOpenAI

from custom_react_agent import create_react_agent
from prompt_template import llama_pandas_template

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Read the OpenAI key from the environment; the value is None when it is unset.
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [6]:
# Hosted chat model: OpenAI "gpt-4-turbo-preview" with temperature fixed at 0.
# NOTE(review): `llm` is also assigned by the LlamaCpp cell in this notebook,
# so whichever of the two cells ran last decides which model the agent uses.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4-turbo-preview",
)

In [2]:
# Local GGUF checkpoint to load. Alternatives that can be swapped in:
#   models/llama-2-7b-chat.Q5_K_M.gguf
#   models/llama-2-13b-chat.Q4_K_M.gguf
model_path = "models/mistral-7b-instruct-v0.2.Q5_K_M.gguf"

# Tunables for the local model.
context_window = 4096
n_batch = 32        # Should be between 1 and n_ctx; consider the amount of VRAM in your GPU.
n_gpu_layers = 25   # Change this value based on your model and your GPU VRAM pool.

# Collect the constructor arguments in one place, then build the model.
llama_options = {
    "model_path": model_path,
    "n_ctx": context_window,
    "n_gpu_layers": n_gpu_layers,
    "n_batch": n_batch,
}

# NOTE(review): this rebinds `llm`, which the ChatOpenAI cell also assigns;
# whichever of the two cells ran last decides which model the agent uses.
llm = LlamaCpp(**llama_options)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [49]:
# Load the dataset the agent will answer questions about.
df = pd.read_csv("datasets/anonymized/student.csv")

# Python REPL tool with the dataframe exposed under the name `df`.
# In the shell all packages from this env are available.
shell = PythonAstREPLTool(locals={"df": df})

# Pre-import the common analysis libraries inside the REPL so that code run
# through the tool can refer to np / plt / pd / sns directly.
preload_imports = "import numpy as np; import matplotlib.pyplot as plt; import pandas as pd; import seaborn as sns"
shell.run(preload_imports)

tools = [shell]

# Prompt built from the template imported from the prompt_template module.
prompt = PromptTemplate(
    template=llama_pandas_template,
    input_variables=["agent_scratchpad", "input", "tools", "dfhead"],
)

# ReAct agent (project-local factory) wired to the current `llm`.
agent = create_react_agent(llm, tools, prompt)

# Executor that drives the agent loop with verbose tracing enabled and
# handle_parsing_errors switched on.
executor_options = {
    "agent": agent,
    "tools": tools,
    "verbose": True,
    "handle_parsing_errors": True,
}
agent_executor = AgentExecutor(**executor_options)

In [50]:
# Natural-language question handed to the agent under the "input" key.
question = "Extract the top 5 highest values from the third column."
answer = agent_executor.invoke({"input": question})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo extract the top 5 highest values from the third column, I can sort the dataframe based on the third column in descending order and then select the top 5 values.
Action: python_repl_ast
Action Input: df['att_3'].nlargest(5)
[0m[36;1m[1;3m46     22.0
302    21.0
114    20.0
166    20.0
228    20.0
Name: att_3, dtype: float64[0m[32;1m[1;3mI now know the final answer. 
Final Answer: The top 5 highest values from the third column are 22.0, 21.0, 20.0, 20.0, and 20.0.[0m

[1m> Finished chain.[0m


In [44]:
# Ask the REPL tool to print `X_test`, then echo what the tool returned.
# NOTE(review): `X_test` is not assigned in any cell visible here; presumably
# it was created inside the REPL during an earlier run. Verify that it exists
# after a fresh Restart & Run All.
repl_tool = tools[0]
x_test_dump = repl_tool.run("print(X_test)")
print(x_test_dump)

     att_3  att_7  att_8  att_13  att_14  att_15  att_24  att_25  att_26  \
227   17.0    2.0    3.0     2.0     1.0     0.0     3.0     3.0     3.0   
42    15.0    4.0    2.0     1.0     4.0     0.0     3.0     3.0     3.0   
256   19.0    1.0    1.0     1.0     3.0     2.0     4.0     1.0     3.0   
182   16.0    2.0    2.0     2.0     4.0     0.0     5.0     3.0     5.0   
56    16.0    1.0    3.0     1.0     2.0     3.0     4.0     3.0     5.0   
..     ...    ...    ...     ...     ...     ...     ...     ...     ...   
249   15.0    4.0    4.0     2.0     2.0     2.0     4.0     3.0     4.0   
297   18.0    3.0    2.0     2.0     1.0     1.0     2.0     5.0     3.0   
222   17.0    2.0    2.0     2.0     2.0     0.0     4.0     5.0     2.0   
351   17.0    4.0    1.0     2.0     1.0     0.0     4.0     5.0     4.0   
202   16.0    4.0    3.0     1.0     3.0     0.0     5.0     3.0     5.0   

     att_27  ...  att_18_b  att_19_a  att_19_b  att_20_a  att_20_b  att_21_a  \
227    