In [1]:
pip install -qU "databricks-sdk" "langchain-core" "langgraph>0.3"

Note: you may need to restart the kernel to use updated packages.


In [None]:
import os

# Terminate the kernel process immediately; this is the notebook's trick for
# forcing a kernel restart after the install cell above.
os._exit(0)

## Install the Langchain Foundational Model Provider

In [1]:
pip install -qU "langchain[anthropic]"

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import getpass

In [3]:
# set the environment variables
os.environ['LANGSMITH_TRACING'] = "true"
# Prompt only when the key is not already exported (same pattern as the Anthropic
# key cell) and label the prompt so it is clear which secret is being requested.
if not os.environ.get('LANGSMITH_API_KEY'):
    os.environ['LANGSMITH_API_KEY'] = getpass.getpass("Enter API Key for LangSmith: ")

 ········


In [None]:
#os.environ.get('LANGSMITH_TRACING')
#os.environ.get('LANGSMITH_API_KEY')

## Provide Bindings to the Unity Catalog API
> The following examples utilize the official Databricks SDK and the non-OSS Unity Catalog (UC).
> The catalog, schema, and table APIs can mostly be used interchangeably. You will just have more metadata to access using the official Databricks UC.

In [4]:
# this is the full host: https://{workspace-name}.cloud.databricks.com
# Prompt only when the host is not already exported, and label the prompt so it
# is clear which value is being requested.
if not os.environ.get('DATABRICKS_HOST'):
    os.environ['DATABRICKS_HOST'] = getpass.getpass("Enter the Databricks workspace host URL: ")

 ········


In [None]:
#os.environ.get('DATABRICKS_HOST')

In [6]:
os.environ['DATABRICKS_LLM_ENDPOINT'] = 'databricks-llama-4-maverick'
# Prompt only when the token is not already exported, and label the prompt so it
# is clear which secret is being requested.
if not os.environ.get('DATABRICKS_TOKEN'):
    os.environ['DATABRICKS_TOKEN'] = getpass.getpass("Enter the Databricks access token: ")

 ········


## Connect to the Databricks Workspace Client

In [8]:
from databricks.sdk import WorkspaceClient

# Read back the workspace URL and access token stored in the environment above.
host = os.getenv('DATABRICKS_HOST')
token = os.getenv('DATABRICKS_TOKEN')

# Client handle used by the Unity Catalog tools defined later in this notebook.
workspace_client = WorkspaceClient(
    host=host,
    token=token,
)

In [None]:
# Check that you can connect and that you are bound to the right groups to view metadata and execute 

# workspace_client.current_user.me()

## Provide your Anthropic API Key and Environment Variables

In [10]:
# Ask for the Anthropic key only when the environment does not already hold a non-empty value.
if not os.getenv('ANTHROPIC_API_KEY'):
    os.environ['ANTHROPIC_API_KEY'] = getpass.getpass(prompt="Enter API Key for Anthropic: ")

Enter API Key for Anthropic:  ········


## Finally, We're Ready to Rock and Roll. 

In [11]:
# let's get our imports out of the way
from langchain_anthropic import ChatAnthropic
from typing import Sequence, Annotated, List, Dict, Optional
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import START, MessagesState, StateGraph
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from langchain_core.messages import BaseMessage, SystemMessage, trim_messages
from langgraph.graph.message import add_messages

from typing_extensions import Annotated, TypedDict

from langchain_core.tools import tool, ToolException

from pydantic import BaseModel, Field

In [12]:
def clean_string(s: str) -> str:
    """
    Trim surrounding whitespace from ``s`` and then drop every "." character.

    NOTE(review): presumably dots are removed so that a single name part cannot
    introduce extra catalog.schema.table separators - confirm the intent.

    :param s: str is the string to clean
    :return: the trimmed string with all "." characters removed
    """
    trimmed = s.strip()
    # Splitting on "." and re-joining with nothing removes every dot.
    return "".join(trimmed.split("."))

In [13]:
from langchain.chat_models import init_chat_model

# Chat model handed to create_react_agent further down; the provider is named explicitly.
model = init_chat_model(
    model="claude-3-5-sonnet-latest",
    model_provider="anthropic",
)

## Creating our Agentic Tools
> The following tools can be registered with a "tool calling agent" to provide important context for our data engineering "persona".

## Catalogs
With Unity Catalog we have the option of having many "root" catalogs. It helps to provide tools that work "at various levels", to prevent "turtles all the way down". In most cases, there will be a finite number of "root" or "base" catalogs, so the LLM can start at the top and work its way down along with you.

> todo: add catalog tool call.

In [14]:
# Arguments accepted by the list_catalogs tool.
class ListCatalogsRequest(BaseModel):
    # The description previously said "schema's" (copied from the schema request);
    # this tool lists catalogs, and the text is what describes the argument to the model.
    max_results: Optional[int] = Field(default=1, description="The number of catalogs to return from the request", gt=0, lt=100)

# Result returned by the list_catalogs tool.
class ListCatalogsResponse(BaseModel):
    # One entry per catalog name.
    catalogs: List[str] = Field(
        description="The name of the base catalog",
    )

@tool("list_catalogs",
      description="returns a list of base catalogs. Individually, each catalog name can be used as the parent catalog reference for schema or table discovery",
      args_schema=ListCatalogsRequest,
      return_direct=True)
def list_catalogs(max_results: Optional[int] = 1) -> ListCatalogsResponse:
    """
    List the names of the catalogs returned by the workspace client.

    :param max_results: upper bound on the number of catalog names returned; None returns all of them
    :return: a ListCatalogsResponse holding the cleaned catalog names
    :raises ToolException: if listing the catalogs or building the response fails
    """
    from itertools import islice

    print(f"limit to max_results:{max_results}")
    try:
        # catalogs.list() returns an Iterator[CatalogInfo]
        # (browse_only, catalog_type, comment, metastore_id, securable_type, properties).
        # max_results was previously only printed; stop consuming the iterator once the
        # requested number of catalogs has been collected.
        limited_catalogs = islice(workspace_client.catalogs.list(), max_results)
        return ListCatalogsResponse(
            catalogs=[clean_string(catalog.name) for catalog in limited_catalogs])
    except Exception as e:
        # Chain the original exception so the root cause stays visible in tracebacks.
        raise ToolException(f"Failed to List the Catalogs: {e}") from e


In [None]:
# Tools are Runnables: call them through .invoke() rather than the deprecated BaseTool.__call__.
list_catalogs.invoke({'max_results': 10})

### Schemas
Within Unity Catalog, schemas define a container that stores tables, functions, and files (via Volumes), as well as AI models, and a whole lot more. For this example, we will be sticking with "schemas" as a 1:1 to the older hive metastore based databases. 

In [17]:
# Arguments accepted by the list_schemas tool.
class ListSchemaRequest(BaseModel):
    catalog_name: str = Field(
        description="The name of the parent catalog (Unity Catalog) containing this schema",
    )
    # Constrained to values strictly between 0 and 100; defaults to 1.
    max_results: Optional[int] = Field(
        default=1,
        description="The number of schema's to return from the request",
        gt=0,
        lt=100,
    )

# Result returned by the list_schemas tool.
class ListSchemaResponse(BaseModel):
    catalog_name: str = Field(
        description="The name of the parent catalog (Unity Catalog) containing this schema",
    )
    # Defaults to None when no schema list is supplied.
    schemas: Optional[List[str]] = Field(
        default=None,
        description="The name of the schemas. Schemas are also referred to as databases in older systems",
    )

@tool(
    "list_schemas",
    description="returns a list of schema names, if they exist, from the associated unity catalog name", 
    args_schema=ListSchemaRequest,
    return_direct=True)
def list_schemas(catalog_name: str, max_results: Optional[int] = 1) -> ListSchemaResponse:
    """
    List schema names that live under a catalog.

    :param catalog_name: name of the parent catalog; it is passed through clean_string first
    :param max_results: upper bound on the number of schema names returned; None returns all of them
    :return: a ListSchemaResponse with the cleaned catalog name and the schema names
    :raises ToolException: if listing the schemas or building the response fails
    """
    from itertools import islice

    print(f"limit to max_results:{max_results}")
    try:
        # returns an Iterator[SchemaInfo]
        schemas = workspace_client.schemas.list(
            catalog_name=clean_string(catalog_name),
            include_browse=False,
            max_results=max_results
        )
        # NOTE(review): the SDK iterator follows pagination tokens, so max_results on the
        # request only bounds the page size. Cap the consumed items here so the tool
        # really returns at most max_results schemas.
        limited_schemas = islice(schemas, max_results)
        return ListSchemaResponse(
            catalog_name=clean_string(catalog_name),
            schemas=[schema.name for schema in limited_schemas]
        )
    except Exception as e:
        # Chain the original exception so the root cause stays visible in tracebacks.
        raise ToolException(f"Failed to List the Schemas: {e}") from e

### Tables
Tables represent exactly what you would think they would. In the case of the following tools, tables represent the **metadata** required to **teach** our LLM enough context to **learn** how to work with a given table.
> The table is based on the **TableInfo** data structure in the databricks.sdk.

In [18]:
# Arguments accepted by the list_tables tool.
class ListTablesRequest(BaseModel):
    catalog_name: str = Field(
        description="The name of the parent catalog (Unity Catalog) containing this schema",
    )
    schema_name: str = Field(
        description="The name of the schema containing the tables to list. The schema is a child of the parent catalog named catalog_name",
    )
    # Constrained to values strictly between 0 and 100; defaults to 10.
    max_results: Optional[int] = Field(
        default=10,
        description="The number of table's to list in this request",
        gt=0,
        lt=100,
    )

# Result returned by the list_tables tool.
class ListTablesResponse(BaseModel):
    catalog_name: str = Field(
        description="The name of the parent catalog (Unity Catalog) containing this schema",
    )
    schema_name: str = Field(
        description="The name of the schema containing the tables to list. The schema is a child of the parent catalog named catalog_name",
    )
    # Defaults to None when no table list is supplied.
    tables: Optional[List[str]] = Field(
        default=None,
        description="The list of full table names. Each table's full name includes the catalog.schema.table_name.",
    )

@tool(
    "list_tables",
    description="returns a list of table names, if any exist, from the associated catalog_name.schema_name provided in the request",
    args_schema=ListTablesRequest,
    return_direct=True)
def list_tables(catalog_name: str, schema_name: str, max_results: Optional[int] = 10) -> ListTablesResponse:
    """
    List the full names of the tables under ``catalog_name.schema_name``.

    :param catalog_name: name of the parent catalog; it is passed through clean_string first
    :param schema_name: name of the schema to list; it is passed through clean_string first
    :param max_results: upper bound on the number of tables inspected; None inspects all of them
    :return: a ListTablesResponse with the cleaned names and every non-None table full name
    :raises ToolException: if listing the tables or building the response fails
    """
    from itertools import islice

    print(f"limit to max_results:{max_results}")
    try:
        # returns an Iterator[TableInfo]; the omit_*/include_* flags keep each entry small
        tables = workspace_client.tables.list(
            catalog_name=clean_string(catalog_name),
            schema_name=clean_string(schema_name),
            max_results=max_results,
            include_delta_metadata=False,
            include_manifest_capabilities=False,
            omit_username=True,
            omit_columns=True,
            omit_properties=True,
        )
        # NOTE(review): the SDK iterator follows pagination tokens, so max_results on the
        # request only bounds the page size. Cap the consumed items here so the tool
        # really looks at no more than max_results tables.
        limited_tables = islice(tables, max_results)
        return ListTablesResponse(
            catalog_name=clean_string(catalog_name),
            schema_name=clean_string(schema_name),
            tables=[table.full_name for table in limited_tables if table.full_name is not None]
        )
    except Exception as e:
        # Include the underlying error (as list_catalogs/list_schemas do) instead of discarding it.
        raise ToolException(f"Failed to List the Tables under the {catalog_name}.{schema_name}: {e}") from e

### Tables: Extended Metadata Harvesting
Once we've isolated the table or tables we want to work with, we can utilize the following data classes and tools in order to really learn about a table. This is where the hard work of adding **comments**, and other **descriptive metadata** really helps.

In [19]:
# TableColumn is a lighter-weight stand-in for the column data carried by "TableInfo", which lets us
# tune how much data we provide to the LLM. It can be extended if you want to "query" system tables
# to understand which "tags" are associated with a table, or to learn from the lineage which other
# tables are "commonly" constructed from "this" table.
class TableColumn(BaseModel):
    name: str = Field(
        description="The name of the column. This is used for querying the table",
    )
    sql_type: str = Field(
        description="Full data type specification as SQL/catalogString text",
    )
    comment: Optional[str] = Field(
        default=None,
        description="Describes what the column represents. The comment provides a semantic understanding for better queries",
    )
    nullable: Optional[bool] = Field(
        default=True,
        description="If the value is True, then this field isn't required and we can't count on it to always be present",
    )

In [20]:
# Arguments accepted by the get_table_details tool: the three parts of a table's full name.
class TableDetailsRequest(BaseModel):
    catalog_name: str = Field(
        description="The name of the parent catalog (Unity Catalog) containing this schema",
    )
    schema_name: str = Field(
        description="The name of the schema containing the tables to list. The schema is a child of the parent catalog named catalog_name",
    )
    table_name: str = Field(
        description="The name of the table to inspect and discover. This is a child of the associated schema",
    )

# Result returned by the get_table_details tool. The column entries are fed from the
# ColumnInfo class via the Databricks SDK.
class TableDetailsResponse(BaseModel):
    full_table_name: str = Field(
        description="The full name of the table. The full name includes the catalog.schema.table_name",
    )
    comment: Optional[str] = Field(
        default=None,
        description="The table description if it has been provided",
    )
    columns: Optional[List[TableColumn]] = Field(
        default=None,
        description="The columns of the table. These are important for querying the table",
    )

@tool(
    "get_table_details",
    description="Provides metadata about a table within Unity Catalog. The details include the table name, comment, description, columns, cluster or partition information, as well as table properties",
    args_schema=TableDetailsRequest,
    return_direct=True)
def get_table_details(catalog_name: str, schema_name: str, table_name: str) -> TableDetailsResponse:
    """
    Fetch the comment and column metadata of one table.

    :param catalog_name: name of the parent catalog; it is passed through clean_string first
    :param schema_name: name of the schema holding the table; it is passed through clean_string first
    :param table_name: name of the table to inspect; it is passed through clean_string first
    :return: a TableDetailsResponse with the full table name, the table comment, and one
             TableColumn per column (columns is None when the table reports no column list)
    :raises ToolException: if fetching the table or building the response fails
    """
    full_name = f"{clean_string(catalog_name)}.{clean_string(schema_name)}.{clean_string(table_name)}"
    print(f"getting table details: for `{full_name}`")
    try:
        table_details = workspace_client.tables.get(full_name=full_name, include_delta_metadata=True, include_browse=True, include_manifest_capabilities=True)
        # The column list is optional on the SDK's table info; iterating None would raise a
        # TypeError that was previously reported as a generic lookup failure.
        columns = None
        if table_details.columns is not None:
            columns = [
                TableColumn(
                    name=column.name,
                    sql_type=column.type_text,
                    comment=column.comment,
                    nullable=column.nullable
                ) for column in table_details.columns]
        return TableDetailsResponse(
            full_table_name=full_name,
            comment=table_details.comment,
            columns=columns
        )
    except Exception as e:
        # Include the underlying error (as list_catalogs/list_schemas do) instead of discarding it.
        raise ToolException(f"Failed to collect the Table details and metadata for {full_name}: {e}") from e


## Testing our Tools
> Tip: It is much cheaper to test your tools like you would any other "microservice" API.
1. Does the **tool** function as expected?
2. Can the tool **Handle Failures?** without hitting "eject"?
3. Would the tool metadata (comments, args) describe how to use the tool to a stranger?

In [None]:
# Tools are Runnables: call them through .invoke() rather than the deprecated BaseTool.__call__.
list_catalogs.invoke({'max_results': 10})

In [None]:
catalog_name = "development"
# Tools are Runnables: call them through .invoke() rather than the deprecated BaseTool.__call__.
list_schemas.invoke({"catalog_name": catalog_name, "max_results": 10})

In [None]:
# Print, in order, the metadata of the list_schemas tool object.
for attribute in ("name", "description", "args", "return_direct"):
    print(getattr(list_schemas, attribute))

In [None]:
schema_name = "consumer_behavior_events"
# Tools are Runnables: call them through .invoke() rather than the deprecated BaseTool.__call__.
list_tables.invoke({"catalog_name": catalog_name, "schema_name": schema_name, "max_results": 10})

In [None]:
focus_table_name = "caas_mobile"
# Tools are Runnables: call them through .invoke() rather than the deprecated BaseTool.__call__.
get_table_details.invoke({"catalog_name": catalog_name, "schema_name": schema_name, "table_name": focus_table_name})

# Create and Run the Actual Tool Calling Agent
----
1. What we did before was just bind tools to our chat model. However, this just allows us to create the arguments for tool calling....
2. We need to actually react to what the "user" is requesting in order to "execute" an "action" on their behalf.
3. We'll need Memory just like we used before. This will allow our "chat" to reference prior context.

In [30]:
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver

# Checkpointer instance that is passed to create_react_agent in the next cell.
memory = MemorySaver()

In [38]:
import base64
import hashlib

# Tools offered to the agent, from the broadest scope (catalogs) down to a single table.
tools = [
    list_catalogs,
    list_schemas,
    list_tables,
    get_table_details,
]

# Build the agent from the chat model and the tools, with `memory` as its checkpointer.
agent_executor = create_react_agent(model, tools, checkpointer=memory)

# Derive a URL-safe thread id from a readable session key; it points at this "session's memory".
session_hash = hashlib.md5(b'scott_haines:agent_session:2').digest()
thread_id = base64.urlsafe_b64encode(session_hash).decode('ascii')
agent_session = {
    "configurable": {
        "thread_id": thread_id,
    },
}

In [39]:
# Display the config dict that is passed as the second argument to each agent_executor.invoke call.
agent_session

{'configurable': {'thread_id': 'PricQ9unSv_0TjXZX-XD2g=='}}

## Agents: Providing the Persona
Given that an LLM is trained on what seems like literally everything, we need to reduce the "scope" of what the LLM is focused on to provide clarity.

> I like to think about "context" as "focus" and the persona helps "provide" the boundaries like a horse with blinders :) 

In [40]:
from langchain_core.messages import BaseMessage, SystemMessage, HumanMessage, trim_messages

# First turn of the session: a system persona followed by the user's introduction.
input_messages = [
    (
        "system",
        "You are a senior principal engineer. "
        "You specialize in data intensive applications and stream processing at scale. "
        "You prefer to have short conversations and your brief responses contain mostly code examples. "
        "You are an expert in the python programming language. "
        "All code examples must utilize python",
    ),
    HumanMessage("Hi! I'm Scott. I just joined the team. Can you help me find some tables to work with?"),
]
response = agent_executor.invoke(
    {"messages": input_messages},
    agent_session,
)

limit to max_results:10


In [None]:
# View the last two messages of the response: the AI message and the tool-calling message.
response["messages"][-2:]

In [48]:
# Understand the MemorySaver configuration object
# memory.config_specs

[ConfigurableFieldSpec(id='thread_id', annotation=<class 'str'>, name='Thread ID', description=None, default='', is_shared=True, dependencies=None),
 ConfigurableFieldSpec(id='checkpoint_ns', annotation=<class 'str'>, name='Checkpoint NS', description='Checkpoint namespace. Denotes the path to the subgraph node the checkpoint originates from, separated by `|` character, e.g. `"child|grandchild"`. Defaults to "" (root graph).', default='', is_shared=True, dependencies=None),
 ConfigurableFieldSpec(id='checkpoint_id', annotation=typing.Optional[str], name='Checkpoint ID', description='Pass to fetch a past checkpoint. If None, fetches the latest checkpoint.', default=None, is_shared=True, dependencies=None)]

In [None]:
# view what is currently stored in memory for the conversation
# memory.blobs

In [55]:
catalog = 'development'
# Next turn in the same session (agent_session): ask for schemas in the chosen catalog.
list_schemas_response = agent_executor.invoke(
    {"messages": [HumanMessage(f"Can you show me the first 20 schemas in the {catalog} catalog?")]},
    agent_session,
)

limit to max_results:20


In [57]:
# Display the last two messages produced by the schema-listing turn.
list_schemas_response["messages"][-2:]

[AIMessage(content=[{'text': "I'll list the schemas in the development catalog.", 'type': 'text'}, {'id': 'toolu_01D2KhiM4ngf8gL6kiyCtihj', 'input': {'catalog_name': 'development', 'max_results': 20}, 'name': 'list_schemas', 'type': 'tool_use'}], additional_kwargs={}, response_metadata={'id': 'msg_01VdmmS2FQ4NZLSmND9Akbgr', 'model': 'claude-3-5-sonnet-20241022', 'stop_reason': 'tool_use', 'stop_sequence': None, 'usage': {'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 1310, 'output_tokens': 85, 'server_tool_use': None, 'service_tier': 'standard'}, 'model_name': 'claude-3-5-sonnet-20241022'}, id='run--d590c38a-a80d-4543-b517-be5dcf5d22c2-0', tool_calls=[{'name': 'list_schemas', 'args': {'catalog_name': 'development', 'max_results': 20}, 'id': 'toolu_01D2KhiM4ngf8gL6kiyCtihj', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1310, 'output_tokens': 85, 'total_tokens': 1395, 'input_token_details': {'cache_read': 0, 'cache_creation': 0}}),
 ToolMessage(

In [58]:
schema_name = 'consumer_behavior_events'
# Follow-up turn: the catalog is referenced only as "the same parent catalog" in the prompt.
list_tables_response = agent_executor.invoke(
    {"messages": [HumanMessage(f"Thank you. Using the same parent catalog, can you show me the first 10 tables under the {schema_name} schema?")]},
    agent_session,
)

limit to max_results:10


In [65]:
# Content of the final message of the table-listing turn.
list_tables_response["messages"][-1].content

"catalog_name='development' schema_name='consumer_behavior_events' tables=['development.consumer_behavior_events.caas_mobile']"

In [66]:
table = 'caas_mobile'
# commenting out the table creates an interesting conversation
table_details_response = agent_executor.invoke(
    {"messages": [HumanMessage(f"Thank you. Using the same parent catalog and current schema, can you find more details about the {table} table?")]},
    agent_session,
)

In [68]:
# Display the final message of the table-details turn (kept as a one-element list).
table_details_response["messages"][-1:]

[AIMessage(content="I notice you've used a placeholder {inspect_table} in your request. Based on the previous results, I can see there's only one table available in this schema: 'caas_mobile'. Would you like me to get the details for the caas_mobile table? Please confirm or specify a different table name you'd like to inspect.", additional_kwargs={}, response_metadata={'id': 'msg_012LHiS5PkDQ8eLThHj6wZK6', 'model': 'claude-3-5-sonnet-20241022', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'cache_creation_input_tokens': 0, 'cache_read_input_tokens': 0, 'input_tokens': 1670, 'output_tokens': 75, 'server_tool_use': None, 'service_tier': 'standard'}, 'model_name': 'claude-3-5-sonnet-20241022'}, id='run--a6217cc4-fa05-4dd9-86e3-5b1d8b894233-0', usage_metadata={'input_tokens': 1670, 'output_tokens': 75, 'total_tokens': 1745, 'input_token_details': {'cache_read': 0, 'cache_creation': 0}})]

In [None]:
# Accept whatever the agent proposed in its previous reply, without restating it.
auto_selection_response = agent_executor.invoke(
    {"messages": [HumanMessage("Thank you. Let's do what you recommended")]},
    agent_session,
)

In [None]:
# Pretty-print the final message of the turn.
auto_selection_response["messages"][-1].pretty_print()

In [73]:
# Ask the agent for a PySpark / Delta Lake streaming example built on the current table.
provide_app_suggestions = agent_executor.invoke(
    {
        "messages": [
            HumanMessage(
                "Thank you. Using the current table can you show me how to use it as a streaming data source using PySpark and Delta Lake. "
                "Use startingVersion of 1 and write out to another delta table called mock_stream in the same catalog.schema"
            ),
        ],
    },
    agent_session,
)

In [74]:
# Pretty-print the final message of the turn.
provide_app_suggestions["messages"][-1].pretty_print()


Here's a concise PySpark example to stream from the caas_mobile table to a mock_stream table:

```python
from pyspark.sql.functions import *

# Source table path
source_table = "development.consumer_behavior_events.caas_mobile"
# Target table path
target_table = "development.consumer_behavior_events.mock_stream"

# Read as stream from version 1
stream_df = spark.readStream \
    .format("delta") \
    .option("startingVersion", 1) \
    .table(source_table)

# Write stream to target delta table
query = stream_df.writeStream \
    .format("delta") \
    .option("checkpointLocation", "/tmp/checkpoint/mock_stream") \
    .option("mergeSchema", "true") \
    .table(target_table)

# Wait for termination (in production you'd handle this differently)
query.awaitTermination()
```

Key points about this code:
- Uses readStream with Delta format
- Specifies startingVersion=1
- Writes to another Delta table
- Includes checkpoint location for fault tolerance
- Enables schema merging for flexibili

In [76]:
# Follow-up turn in the same session: ask for the column names to be written out as an explicit select.
delta_app_demo_cont = agent_executor.invoke(
    {
        "messages": [
            HumanMessage("""
            Can we add a simple transformation? 
            I'd like to explicitly call out the column names as a select operation
            """)
        ]
    }, agent_session)

In [None]:
# Pretty-print the final message of the turn.
delta_app_demo_cont["messages"][-1].pretty_print()