In [1]:
import os

import pandas as pd
from dotenv import load_dotenv
from langchain.agents import AgentExecutor
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from langchain_experimental.tools.python.tool import PythonAstREPLTool
from langchain_openai import OpenAI

from custom_react_agent import create_react_agent
from prompt_template import llama_pandas_template

# Load variables from a local .env file (if one exists) into the process
# environment before the key is looked up.
load_dotenv()

# Evaluates to None when OPENAI_API_KEY is not set. The name is not referenced
# again in the cells visible here.
openai_api_key = os.environ.get("OPENAI_API_KEY")

In [6]:
# OpenAI completion model used as the agent's LLM; temperature 0 asks for the
# least random sampling so repeated runs stay comparable.
llm = OpenAI(
    temperature=0,
    model="gpt-3.5-turbo-instruct",
)

In [3]:
# Local GGUF checkpoint loaded through llama.cpp. Alternative checkpoints are
# kept below for quick switching:
# model_path = "models/llama-2-7b-chat.Q5_K_M.gguf"
# model_path = "models/llama-2-13b-chat.Q4_K_M.gguf"
model_path = "models/mistral-7b-instruct-v0.2.Q5_K_M.gguf"

n_gpu_layers = 25       # Change this value based on your model and your GPU VRAM pool.
n_batch = 32            # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
context_window = 4096   # Forwarded to LlamaCpp as n_ctx.

# NOTE(review): this rebinds `llm`, the same name the OpenAI cell assigns, so
# whichever of the two cells was executed last is the model the agent is built
# with. Run only the backend you intend to use.
llm = LlamaCpp(
    model_path=model_path,
    n_ctx=context_window,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    # Match the OpenAI backend (temperature=0) so both interchangeable models
    # sample the same way and agent runs are repeatable.
    temperature=0,
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [7]:
# Dataset the agent will be asked questions about.
df = pd.read_csv("datasets/anonymized/student.csv")

# Python REPL tool with `df` pre-bound in its namespace.
# In the shell all packages from this env are available.
shell = PythonAstREPLTool(locals={"df": df})

# Run the common analysis imports inside the REPL up front, so the aliases
# np / plt / pd / sns are already defined there.
shell.run(
    "import numpy as np; import matplotlib.pyplot as plt; import pandas as pd; import seaborn as sns"
)

# The REPL is the only tool handed to the agent.
tools = [shell]

# NOTE(review): the invoke call only supplies "input"; presumably the custom
# create_react_agent fills "agent_scratchpad", "tools" and "dfhead" - confirm
# in custom_react_agent.
prompt = PromptTemplate(
    template=llama_pandas_template,
    input_variables=["agent_scratchpad", "input", "tools", "dfhead"],
)

# Project-local ReAct agent factory (imported from custom_react_agent).
agent = create_react_agent(llm, tools, prompt)

# verbose=True prints the reasoning trace; handle_parsing_errors=True lets the
# executor handle LLM output it cannot parse instead of raising.
agent_executor = AgentExecutor(
    tools=tools,
    agent=agent,
    handle_parsing_errors=True,
    verbose=True,
)

In [8]:
# Ask the agent a natural-language question about `df`; the result of the run
# is kept in `answer`.
answer = agent_executor.invoke(
    input={
        "input": "Which columns have missing data and what is the percentage of missing data in those columns?"
    },
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I can use the isnull() and sum() functions to check for missing data in each column and then calculate the percentage of missing data.
Action: python_repl_ast
Action Input: df.isnull().sum() / len(df) * 100
[0m[36;1m[1;3matt_1     5.617978
att_2     2.247191
att_3     1.966292
att_4     8.707865
att_5     2.528090
att_6     8.988764
att_7     2.247191
att_8     2.528090
att_9     2.247191
att_10    3.651685
att_11    7.584270
att_12    3.932584
att_13    6.179775
att_14    5.056180
att_15    9.269663
att_16    2.247191
att_17    5.617978
att_18    6.741573
att_19    1.404494
att_20    5.337079
att_21    9.831461
att_22    6.460674
att_23    1.685393
att_24    2.528090
att_25    5.337079
att_26    5.056180
att_27    2.247191
att_28    1.123596
att_29    1.404494
att_30    7.865169
att_31    2.808989
att_32    3.651685
att_33    7.022472
dtype: float64[0m[32;1m[1;3m I now know the final answer
Final Answer: The columns w