From 294e006ac9a35975bce896b4a525c732f7aee159 Mon Sep 17 00:00:00 2001 From: Chi Wang Date: Mon, 16 Oct 2023 16:23:27 -0700 Subject: [PATCH] Lancgain tool bridge (#262) (#263) * Added a LangChain tooling bridge example * Updated Doco * Update agentchat_langchain.ipynb remove key --------- Co-authored-by: Elliot Wood Co-authored-by: Li Jiang --- notebook/agentchat_langchain.ipynb | 740 +++++++++++++++++++++++++++++ 1 file changed, 740 insertions(+) create mode 100644 notebook/agentchat_langchain.ipynb diff --git a/notebook/agentchat_langchain.ipynb b/notebook/agentchat_langchain.ipynb new file mode 100644 index 00000000000..76afa8fcd5d --- /dev/null +++ b/notebook/agentchat_langchain.ipynb @@ -0,0 +1,740 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "ae1f50ec", + "metadata": { + "id": "ae1f50ec" + }, + "source": [ + "\"Open" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9a71fa36", + "metadata": { + "id": "9a71fa36" + }, + "source": [ + "# Auto Generated Agent Chat: Task Solving with Langchain Provided Tools as Functions\n", + "\n", + "AutoGen offers conversable agents powered by LLM, tool or human, which can be used to perform tasks collectively via automated chat. This framwork allows tool use and human participance through multi-agent conversation. Please find documentation about this feature [here](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat).\n", + "\n", + "In this notebook, we demonstrate how to use `AssistantAgent` and `UserProxyAgent` to make function calls with the new feature of OpenAI models (in model version 0613) with a set of Langchain provided tools and toolkits, to demonstrate how to leverage the 35+ tools available. \n", + "A specified prompt and function configs need to be passed to `AssistantAgent` to initialize the agent. The corresponding functions need to be passed to `UserProxyAgent`, which will be responsible for executing any function calls made by `AssistantAgent`. Besides this requirement of matching descriptions with functions, we recommend checking the system message in the `AssistantAgent` to make sure the instructions align with the function call descriptions.\n", + "\n", + "## Requirements\n", + "\n", + "AutoGen requires `Python>=3.8`. To run this notebook example, please install the [mathchat] option since we will import functions from `MathUserProxyAgent`:\n", + "```bash\n", + "pip install \"pyautogen[mathchat]\"\n", + "```\n", + "```bash\n", + "%pip install Langchain\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b803c17", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2b803c17", + "outputId": "2e12aa3f-e46c-4b82-cc2e-1495f70a2961" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pyautogen[mathchat]~=0.1.0 in /usr/local/lib/python3.10/dist-packages (0.1.4)\n", + "Requirement already satisfied: diskcache in /usr/local/lib/python3.10/dist-packages (from pyautogen[mathchat]~=0.1.0) (5.6.3)\n", + "Requirement already satisfied: flaml in /usr/local/lib/python3.10/dist-packages (from pyautogen[mathchat]~=0.1.0) (2.1.0)\n", + "Requirement already satisfied: openai in /usr/local/lib/python3.10/dist-packages (from pyautogen[mathchat]~=0.1.0) (0.28.1)\n", + "Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from pyautogen[mathchat]~=0.1.0) (2.3.0)\n", + "Requirement already satisfied: pydantic==1.10.9 in /usr/local/lib/python3.10/dist-packages (from pyautogen[mathchat]~=0.1.0) (1.10.9)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from pyautogen[mathchat]~=0.1.0) (1.12)\n", + "Requirement already satisfied: wolframalpha in /usr/local/lib/python3.10/dist-packages (from pyautogen[mathchat]~=0.1.0) (5.0.0)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic==1.10.9->pyautogen[mathchat]~=0.1.0) (4.5.0)\n", + "Requirement already satisfied: NumPy>=1.17.0rc1 in /usr/local/lib/python3.10/dist-packages (from flaml->pyautogen[mathchat]~=0.1.0) (1.23.5)\n", + "Requirement already satisfied: requests>=2.20 in /usr/local/lib/python3.10/dist-packages (from openai->pyautogen[mathchat]~=0.1.0) (2.31.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from openai->pyautogen[mathchat]~=0.1.0) (4.66.1)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from openai->pyautogen[mathchat]~=0.1.0) (3.8.5)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->pyautogen[mathchat]~=0.1.0) (1.3.0)\n", + "Requirement already satisfied: xmltodict in /usr/local/lib/python3.10/dist-packages (from wolframalpha->pyautogen[mathchat]~=0.1.0) (0.13.0)\n", + "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from wolframalpha->pyautogen[mathchat]~=0.1.0) (10.1.0)\n", + "Requirement already satisfied: jaraco.context in /usr/local/lib/python3.10/dist-packages (from wolframalpha->pyautogen[mathchat]~=0.1.0) (4.3.0)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai->pyautogen[mathchat]~=0.1.0) (3.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai->pyautogen[mathchat]~=0.1.0) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai->pyautogen[mathchat]~=0.1.0) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.20->openai->pyautogen[mathchat]~=0.1.0) (2023.7.22)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->pyautogen[mathchat]~=0.1.0) (23.1.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->pyautogen[mathchat]~=0.1.0) (6.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->pyautogen[mathchat]~=0.1.0) (4.0.3)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->pyautogen[mathchat]~=0.1.0) (1.9.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->pyautogen[mathchat]~=0.1.0) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->openai->pyautogen[mathchat]~=0.1.0) (1.3.1)\n" + ] + } + ], + "source": [ + "%pip install \"pyautogen[mathchat]~=0.1.0\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "5ebd2397", + "metadata": { + "id": "5ebd2397" + }, + "source": [ + "## Set your API Endpoint\n", + "\n", + "The [`config_list_from_models`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_models) function tries to create a list of configurations using Azure OpenAI endpoints and OpenAI endpoints for the provided list of models. It assumes the api keys and api bases are stored in the corresponding environment variables or local txt files:\n", + "\n", + "- OpenAI API key: os.environ[\"OPENAI_API_KEY\"] or `openai_api_key_file=\"key_openai.txt\"`.\n", + "- Azure OpenAI API key: os.environ[\"AZURE_OPENAI_API_KEY\"] or `aoai_api_key_file=\"key_aoai.txt\"`. Multiple keys can be stored, one per line.\n", + "- Azure OpenAI API base: os.environ[\"AZURE_OPENAI_API_BASE\"] or `aoai_api_base_file=\"base_aoai.txt\"`. Multiple bases can be stored, one per line.\n", + "\n", + "It's OK to have only the OpenAI API key, or only the Azure OpenAI API key + base.\n", + "If you open this notebook in google colab, you can upload your files by click the file icon on the left panel and then choose \"upload file\" icon.\n", + "\n", + "The following code excludes Azure OpenAI endpoints from the config list because some endpoints don't support functions yet. Remove the `exclude` argument if they do." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dca301a4", + "metadata": { + "id": "dca301a4" + }, + "outputs": [], + "source": [ + "import autogen\n", + "\n", + "config_list = autogen.config_list_from_models(model_list=[\"gpt-4\", \"gpt-3.5-turbo\", \"gpt-3.5-turbo-16k\"], exclude=\"aoai\", openai_api_key_file=\"key_openai.txt\")\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2b9526e7", + "metadata": { + "id": "2b9526e7" + }, + "source": [ + "## Making Function Calls\n", + "\n", + "In this example, we demonstrate function call execution with `AssistantAgent` and `UserProxyAgent`. With the default system prompt of `AssistantAgent`, we allow the LLM assistant to perform tasks with code, and the `UserProxyAgent` would extract code blocks from the LLM response and execute them. With the new \"function_call\" feature, we define functions and specify the description of the function in the OpenAI config for the `AssistantAgent`. Then we register the functions in `UserProxyAgent`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ngpFkWh5ad9M", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ngpFkWh5ad9M", + "outputId": "d33b51c6-5dfe-402d-dce4-6b40fca85d3d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: Langchain in /usr/local/lib/python3.10/dist-packages (0.0.305)\n", + "Requirement already satisfied: PyYAML>=5.3 in /usr/local/lib/python3.10/dist-packages (from Langchain) (6.0.1)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /usr/local/lib/python3.10/dist-packages (from Langchain) (2.0.20)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /usr/local/lib/python3.10/dist-packages (from Langchain) (3.8.5)\n", + "Requirement already satisfied: anyio<4.0 in /usr/local/lib/python3.10/dist-packages (from Langchain) (3.7.1)\n", + "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from Langchain) (4.0.3)\n", + "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from Langchain) (0.6.1)\n", + "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /usr/local/lib/python3.10/dist-packages (from Langchain) (1.33)\n", + "Requirement already satisfied: langsmith<0.1.0,>=0.0.38 in /usr/local/lib/python3.10/dist-packages (from Langchain) (0.0.41)\n", + "Requirement already satisfied: numexpr<3.0.0,>=2.8.4 in /usr/local/lib/python3.10/dist-packages (from Langchain) (2.8.5)\n", + "Requirement already satisfied: numpy<2,>=1 in /usr/local/lib/python3.10/dist-packages (from Langchain) (1.23.5)\n", + "Requirement already satisfied: pydantic<3,>=1 in /usr/local/lib/python3.10/dist-packages (from Langchain) (1.10.9)\n", + "Requirement already satisfied: requests<3,>=2 in /usr/local/lib/python3.10/dist-packages (from Langchain) (2.31.0)\n", + "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /usr/local/lib/python3.10/dist-packages (from Langchain) (8.2.3)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->Langchain) (23.1.0)\n", + "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->Langchain) (3.2.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->Langchain) (6.0.4)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->Langchain) (1.9.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->Langchain) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0.0,>=3.8.3->Langchain) (1.3.1)\n", + "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->Langchain) (3.4)\n", + "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->Langchain) (1.3.0)\n", + "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<4.0->Langchain) (1.1.3)\n", + "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->Langchain) (3.20.1)\n", + "Requirement already satisfied: typing-inspect<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from dataclasses-json<0.7,>=0.5.7->Langchain) (0.9.0)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /usr/local/lib/python3.10/dist-packages (from jsonpatch<2.0,>=1.33->Langchain) (2.4)\n", + "Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1->Langchain) (4.5.0)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->Langchain) (2.0.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2->Langchain) (2023.7.22)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /usr/local/lib/python3.10/dist-packages (from SQLAlchemy<3,>=1.4->Langchain) (2.0.2)\n", + "Requirement already satisfied: packaging>=17.0 in /usr/local/lib/python3.10/dist-packages (from marshmallow<4.0.0,>=3.18.0->dataclasses-json<0.7,>=0.5.7->Langchain) (23.1)\n", + "Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->Langchain) (1.0.0)\n" + ] + } + ], + "source": [ + "%pip install Langchain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "qCzNbbVajvpc", + "metadata": { + "id": "qCzNbbVajvpc" + }, + "outputs": [], + "source": [ + "# Import things that are needed generically\n", + "from langchain.pydantic_v1 import BaseModel, Field\n", + "from langchain.tools import BaseTool\n", + "from typing import Optional, Type\n", + "import math\n", + "\n", + "class CircumferenceToolInput(BaseModel):\n", + " radius: float = Field()\n", + "\n", + "class CircumferenceTool(BaseTool):\n", + " name = \"circumference_calculator\"\n", + " description = \"Use this tool when you need to calculate a circumference using the radius of a circle\"\n", + " args_schema: Type[BaseModel] = CircumferenceToolInput\n", + "\n", + " def _run(self, radius: float):\n", + " return float(radius) * 2.0 * math.pi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "COlL5_98atDs", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "COlL5_98atDs", + "outputId": "24ce236d-8993-4a69-99e2-65453574d61e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "user_proxy (to chatbot):\n", + "\n", + "Read the file with the path 'Test.txt', then calculate the circumference of a circle that has a radius of that files contents.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "chatbot (to user_proxy):\n", + "\n", + "***** Suggested function Call: read_file *****\n", + "Arguments: \n", + "{\n", + " \"file_path\": \"Test.txt\"\n", + "}\n", + "**********************************************\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + ">>>>>>>> EXECUTING FUNCTION read_file...\n", + "user_proxy (to chatbot):\n", + "\n", + "***** Response from calling function \"read_file\" *****\n", + "7.81mm\n", + "******************************************************\n", + "\n", + "--------------------------------------------------------------------------------\n", + "chatbot (to user_proxy):\n", + "\n", + "***** Suggested function Call: circumference_calculator *****\n", + "Arguments: \n", + "{\n", + " \"radius\": 7.81\n", + "}\n", + "*************************************************************\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + ">>>>>>>> EXECUTING FUNCTION circumference_calculator...\n", + "user_proxy (to chatbot):\n", + "\n", + "***** Response from calling function \"circumference_calculator\" *****\n", + "49.071677249072565\n", + "*********************************************************************\n", + "\n", + "--------------------------------------------------------------------------------\n", + "chatbot (to user_proxy):\n", + "\n", + "The circumference of a circle with a radius of 7.81mm is approximately 49.071mm.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "user_proxy (to chatbot):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "chatbot (to user_proxy):\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "from langchain.tools.file_management.read import ReadFileTool\n", + "\n", + "# Define a function to generate llm_config from a LangChain tool\n", + "def generate_llm_config(tool):\n", + " # Define the function schema based on the tool's args_schema\n", + " function_schema = {\n", + " \"name\": tool.name.lower().replace (' ', '_'),\n", + " \"description\": tool.description,\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {},\n", + " \"required\": [],\n", + " },\n", + " }\n", + "\n", + " if tool.args is not None:\n", + " function_schema[\"parameters\"][\"properties\"] = tool.args\n", + "\n", + " return function_schema\n", + "\n", + "# Instantiate the ReadFileTool\n", + "read_file_tool = ReadFileTool()\n", + "custom_tool = CircumferenceTool()\n", + "\n", + "# Construct the llm_config\n", + "llm_config = {\n", + " #Generate functions config for the Tool\n", + " \"functions\":[\n", + " generate_llm_config(custom_tool),\n", + " generate_llm_config(read_file_tool),\n", + " ],\n", + " \"config_list\": config_list, # Assuming you have this defined elsewhere\n", + " \"request_timeout\": 120,\n", + "}\n", + "\n", + "user_proxy = autogen.UserProxyAgent(\n", + " name=\"user_proxy\",\n", + " is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n", + " human_input_mode=\"NEVER\",\n", + " max_consecutive_auto_reply=10,\n", + " code_execution_config={\"work_dir\": \"coding\"},\n", + ")\n", + "\n", + "# Register the tool and start the conversation\n", + "user_proxy.register_function(\n", + " function_map={\n", + " custom_tool.name: custom_tool._run,\n", + " read_file_tool.name: read_file_tool._run,\n", + " }\n", + ")\n", + "\n", + "chatbot = autogen.AssistantAgent(\n", + " name=\"chatbot\",\n", + " system_message=\"For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.\",\n", + " llm_config=llm_config,\n", + ")\n", + "\n", + "user_proxy.initiate_chat(\n", + " chatbot,\n", + " message=\"Read the file with the path 'Test.txt', then calculate the circumference of a circle that has a radius of that files contents.\", #\" 7.81mm\" in the file\n", + " llm_config=llm_config,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "11cc4e60", + "metadata": {}, + "source": [ + "# A PySpark Examle" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "Y-ozf9EFCegw", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Y-ozf9EFCegw", + "outputId": "db7b73a8-6129-4dfb-9d5c-ac3536f310d7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pyspark in /usr/local/lib/python3.10/dist-packages (3.5.0)\n", + "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.10/dist-packages (from pyspark) (0.10.9.7)\n" + ] + } + ], + "source": [ + "%pip install pyspark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7iFp-Sm4CYq_", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7iFp-Sm4CYq_", + "outputId": "2e1a2a70-53e6-4896-9232-63db6d097d63" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+\n", + "|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|\n", + "+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+\n", + "| -114.31| 34.19| 15.0| 5612.0| 1283.0| 1015.0| 472.0| 1.4936| 66900.0|\n", + "| -114.47| 34.4| 19.0| 7650.0| 1901.0| 1129.0| 463.0| 1.82| 80100.0|\n", + "| -114.56| 33.69| 17.0| 720.0| 174.0| 333.0| 117.0| 1.6509| 85700.0|\n", + "| -114.57| 33.64| 14.0| 1501.0| 337.0| 515.0| 226.0| 3.1917| 73400.0|\n", + "| -114.57| 33.57| 20.0| 1454.0| 326.0| 624.0| 262.0| 1.925| 65500.0|\n", + "| -114.58| 33.63| 29.0| 1387.0| 236.0| 671.0| 239.0| 3.3438| 74000.0|\n", + "| -114.58| 33.61| 25.0| 2907.0| 680.0| 1841.0| 633.0| 2.6768| 82400.0|\n", + "| -114.59| 34.83| 41.0| 812.0| 168.0| 375.0| 158.0| 1.7083| 48500.0|\n", + "| -114.59| 33.61| 34.0| 4789.0| 1175.0| 3134.0| 1056.0| 2.1782| 58400.0|\n", + "| -114.6| 34.83| 46.0| 1497.0| 309.0| 787.0| 271.0| 2.1908| 48100.0|\n", + "| -114.6| 33.62| 16.0| 3741.0| 801.0| 2434.0| 824.0| 2.6797| 86500.0|\n", + "| -114.6| 33.6| 21.0| 1988.0| 483.0| 1182.0| 437.0| 1.625| 62000.0|\n", + "| -114.61| 34.84| 48.0| 1291.0| 248.0| 580.0| 211.0| 2.1571| 48600.0|\n", + "| -114.61| 34.83| 31.0| 2478.0| 464.0| 1346.0| 479.0| 3.212| 70400.0|\n", + "| -114.63| 32.76| 15.0| 1448.0| 378.0| 949.0| 300.0| 0.8585| 45000.0|\n", + "| -114.65| 34.89| 17.0| 2556.0| 587.0| 1005.0| 401.0| 1.6991| 69100.0|\n", + "| -114.65| 33.6| 28.0| 1678.0| 322.0| 666.0| 256.0| 2.9653| 94900.0|\n", + "| -114.65| 32.79| 21.0| 44.0| 33.0| 64.0| 27.0| 0.8571| 25000.0|\n", + "| -114.66| 32.74| 17.0| 1388.0| 386.0| 775.0| 320.0| 1.2049| 44000.0|\n", + "| -114.67| 33.92| 17.0| 97.0| 24.0| 29.0| 15.0| 1.2656| 27500.0|\n", + "+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "source": [ + "#Starndard Langchain example\n", + "from langchain.agents import create_spark_sql_agent\n", + "from langchain.agents.agent_toolkits import SparkSQLToolkit\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.utilities.spark_sql import SparkSQL\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "schema = \"langchain_example\"\n", + "spark.sql(f\"CREATE DATABASE IF NOT EXISTS {schema}\")\n", + "spark.sql(f\"USE {schema}\")\n", + "csv_file_path = \"./sample_data/california_housing_train.csv\"\n", + "table = \"california_housing_train\"\n", + "spark.read.csv(csv_file_path, header=True, inferSchema=True).write.option(\"path\", \"file:/content/spark-warehouse/langchain_example.db/california_housing_train\").mode(\"overwrite\").saveAsTable(table)\n", + "spark.table(table).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "iLtSTHoJD7Jn", + "metadata": { + "id": "iLtSTHoJD7Jn" + }, + "outputs": [], + "source": [ + "# Note, you can also connect to Spark via Spark connect. For example:\n", + "# db = SparkSQL.from_uri(\"sc://localhost:15002\", schema=schema)\n", + "spark_sql = SparkSQL(schema=schema)\n", + "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-16k\")\n", + "toolkit = SparkSQLToolkit(db=spark_sql, llm=llm)\n", + "agent_executor = create_spark_sql_agent(llm=llm, toolkit=toolkit, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "VzqNYlVjCqQa", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 608 + }, + "id": "VzqNYlVjCqQa", + "outputId": "dd4de772-7b0c-4650-d106-c83d4593158e" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3mAction: list_tables_sql_db\n", + "Action Input: \"\"\u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mcalifornia_housing_train\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mI see that there is a table called \"california_housing_train\" in the database. I can use the \"schema_sql_db\" tool to get more information about this table.\n", + "Action: schema_sql_db\n", + "Action Input: \"california_housing_train\"\u001b[0m\n", + "Observation: \u001b[33;1m\u001b[1;3mCREATE TABLE spark_catalog.langchain_example.california_housing_train (\n", + " longitude DOUBLE,\n", + " latitude DOUBLE,\n", + " housing_median_age DOUBLE,\n", + " total_rooms DOUBLE,\n", + " total_bedrooms DOUBLE,\n", + " population DOUBLE,\n", + " households DOUBLE,\n", + " median_income DOUBLE,\n", + " median_house_value DOUBLE)\n", + ";\n", + "\n", + "/*\n", + "3 rows from california_housing_train table:\n", + "longitude\tlatitude\thousing_median_age\ttotal_rooms\ttotal_bedrooms\tpopulation\thouseholds\tmedian_income\tmedian_house_value\n", + "-114.31\t34.19\t15.0\t5612.0\t1283.0\t1015.0\t472.0\t1.4936\t66900.0\n", + "-114.47\t34.4\t19.0\t7650.0\t1901.0\t1129.0\t463.0\t1.82\t80100.0\n", + "-114.56\t33.69\t17.0\t720.0\t174.0\t333.0\t117.0\t1.6509\t85700.0\n", + "*/\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3mThe \"california_housing_train\" table has the following columns: longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, and median_house_value. It contains information about housing in California. I can now answer the question.\n", + "Action: None\n", + "Final Answer: The \"california_housing_train\" table has the following columns: longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, and median_house_value.\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'The \"california_housing_train\" table has the following columns: longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, and median_house_value.'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Starndard Langchain example\n", + "agent_executor.run(\"Describe the california_housing_train table\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d94d45a9", + "metadata": {}, + "outputs": [], + "source": [ + "# LangChain direct tool usage instead of toolkit example\n", + "# from langchain.tools.spark_sql.tool import (\n", + "# InfoSparkSQLTool,\n", + "# ListSparkSQLTool,\n", + "# QueryCheckerTool,\n", + "# QuerySparkSQLTool,\n", + "# )\n", + "# debug_toolkit = [\n", + "# QuerySparkSQLTool(db=spark_sql),\n", + "# InfoSparkSQLTool(db=spark_sql),\n", + "# ListSparkSQLTool(db=spark_sql),\n", + "# QueryCheckerTool(db=spark_sql, llm=llm),\n", + "#]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "r7PFvDS7Ev-E", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "r7PFvDS7Ev-E", + "outputId": "53d9c45d-058e-4e37-ba73-556591aaab42" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'query_sql_db', 'description': '\\n Input to this tool is a detailed and correct SQL query, output is a result from the Spark SQL.\\n If the query is not correct, an error message will be returned.\\n If an error is returned, rewrite the query, check the query, and try again.\\n ', 'parameters': {'type': 'object', 'properties': {'query': {'title': 'Query', 'type': 'string'}}, 'required': []}}\n", + "{'name': 'schema_sql_db', 'description': '\\n Input to this tool is a comma-separated list of tables, output is the schema and sample rows for those tables.\\n Be sure that the tables actually exist by calling list_tables_sql_db first!\\n\\n Example Input: \"table1, table2, table3\"\\n ', 'parameters': {'type': 'object', 'properties': {'table_names': {'title': 'Table Names', 'type': 'string'}}, 'required': []}}\n", + "{'name': 'list_tables_sql_db', 'description': 'Input is an empty string, output is a comma separated list of tables in the Spark SQL.', 'parameters': {'type': 'object', 'properties': {'tool_input': {'title': 'Tool Input', 'default': '', 'type': 'string'}}, 'required': []}}\n", + "{'name': 'query_checker_sql_db', 'description': '\\n Use this tool to double check if your query is correct before executing it.\\n Always use this tool before executing a query with query_sql_db!\\n ', 'parameters': {'type': 'object', 'properties': {'query': {'title': 'Query', 'type': 'string'}}, 'required': []}}\n", + "user_proxy (to chatbot):\n", + "\n", + "Describe the table names california_housing_train\n", + "\n", + "--------------------------------------------------------------------------------\n", + "chatbot (to user_proxy):\n", + "\n", + "***** Suggested function Call: schema_sql_db *****\n", + "Arguments: \n", + "{\n", + " \"table_names\": \"california_housing_train\"\n", + "}\n", + "**************************************************\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\n", + ">>>>>>>> EXECUTING FUNCTION schema_sql_db...\n", + "user_proxy (to chatbot):\n", + "\n", + "***** Response from calling function \"schema_sql_db\" *****\n", + "CREATE TABLE spark_catalog.langchain_example.california_housing_train (\n", + " longitude DOUBLE,\n", + " latitude DOUBLE,\n", + " housing_median_age DOUBLE,\n", + " total_rooms DOUBLE,\n", + " total_bedrooms DOUBLE,\n", + " population DOUBLE,\n", + " households DOUBLE,\n", + " median_income DOUBLE,\n", + " median_house_value DOUBLE)\n", + ";\n", + "\n", + "/*\n", + "3 rows from california_housing_train table:\n", + "longitude\tlatitude\thousing_median_age\ttotal_rooms\ttotal_bedrooms\tpopulation\thouseholds\tmedian_income\tmedian_house_value\n", + "-114.31\t34.19\t15.0\t5612.0\t1283.0\t1015.0\t472.0\t1.4936\t66900.0\n", + "-114.47\t34.4\t19.0\t7650.0\t1901.0\t1129.0\t463.0\t1.82\t80100.0\n", + "-114.56\t33.69\t17.0\t720.0\t174.0\t333.0\t117.0\t1.6509\t85700.0\n", + "*/\n", + "**********************************************************\n", + "\n", + "--------------------------------------------------------------------------------\n", + "chatbot (to user_proxy):\n", + "\n", + "The california_housing_train table has the following schema:\n", + "\n", + "- longitude: DOUBLE\n", + "- latitude: DOUBLE\n", + "- housing_median_age: DOUBLE\n", + "- total_rooms: DOUBLE\n", + "- total_bedrooms: DOUBLE\n", + "- population: DOUBLE\n", + "- households: DOUBLE\n", + "- median_income: DOUBLE\n", + "- median_house_value: DOUBLE\n", + "\n", + "Here are sample rows from the table:\n", + "\n", + "| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value |\n", + "|-----------|----------|--------------------|-------------|----------------|------------|------------|---------------|--------------------|\n", + "| -114.31 | 34.19 | 15.0 | 5612.0 | 1283.0 | 1015.0 | 472.0 | 1.4936 | 66900.0 |\n", + "| -114.47 | 34.4 | 19.0 | 7650.0 | 1901.0 | 1129.0 | 463.0 | 1.82 | 80100.0 |\n", + "| -114.56 | 33.69 | 17.0 | 720.0 | 174.0 | 333.0 | 117.0 | 1.6509 | 85700.0 |\n", + "\n", + "--------------------------------------------------------------------------------\n", + "user_proxy (to chatbot):\n", + "\n", + "\n", + "\n", + "--------------------------------------------------------------------------------\n", + "chatbot (to user_proxy):\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "\n", + "# Now use AutoGen with Langchain Tool Bridgre\n", + "tools = []\n", + "function_map = {}\n", + "\n", + "for tool in toolkit.get_tools(): #debug_toolkit if you want to use tools directly\n", + " tool_schema = generate_llm_config(tool)\n", + " print(tool_schema)\n", + " tools.append(tool_schema)\n", + " function_map[tool.name] = tool._run\n", + "\n", + "# Construct the llm_config\n", + "llm_config = {\n", + " \"functions\": tools,\n", + " \"config_list\": config_list, # Assuming you have this defined elsewhere\n", + " \"request_timeout\": 120,\n", + "}\n", + "\n", + "user_proxy = autogen.UserProxyAgent(\n", + " name=\"user_proxy\",\n", + " is_termination_msg=lambda x: x.get(\"content\", \"\") and x.get(\"content\", \"\").rstrip().endswith(\"TERMINATE\"),\n", + " human_input_mode=\"NEVER\",\n", + " max_consecutive_auto_reply=10,\n", + " code_execution_config={\"work_dir\": \"coding\"},\n", + ")\n", + "\n", + "print(function_map)\n", + "\n", + "# Register the tool and start the conversation\n", + "user_proxy.register_function(\n", + " function_map = function_map\n", + ")\n", + "\n", + "chatbot = autogen.AssistantAgent(\n", + " name=\"chatbot\",\n", + " system_message=\"For coding tasks, only use the functions you have been provided with. Reply TERMINATE when the task is done.\",\n", + " llm_config=llm_config,\n", + ")\n", + "\n", + "user_proxy.initiate_chat(\n", + " chatbot,\n", + " message=\"Describe the table names california_housing_train\",\n", + " llm_config=llm_config,\n", + ")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "flaml_dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.17" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}