In [None]:
# !pip install weaviate-client


## Lesson 1:  Creating a Vector Database and Exploring Queries

### Before we create an agent that can help us with our vector database queries, let's figure out what we might need help with.

## Get keys and URLs to connect to the Weaviate Client

In [None]:
import os
from dotenv import load_dotenv

# Read settings from a local .env file (if one exists) into the environment.
load_dotenv()

WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, which would otherwise surface later as an obscure error.
_required_settings = {
    "WEAVIATE_URL": WEAVIATE_URL,
    "WEAVIATE_KEY": WEAVIATE_KEY,
    "OPENAI_API_KEY": OPENAI_API_KEY,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

print("Weaviate URL:", WEAVIATE_URL)
# Do not echo any part of a secret: notebook outputs are saved with the file
# and are easily committed or shared.
print("Weaviate API Key: set")
print("OpenAI API Key: set")

## Connect to Weaviate

You need to pass in your Weaviate Cloud URL and KEY.

In [None]:
import weaviate
from weaviate.classes.init import Auth

# Cluster credentials, plus the OpenAI key passed along as an extra header.
weaviate_auth = Auth.api_key(WEAVIATE_KEY)
extra_headers = {"X-OpenAI-Api-Key": OPENAI_API_KEY}

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=weaviate_auth,
    headers=extra_headers,
)

# Report whether the cluster answers the readiness check.
print("Client ready:", client.is_ready())

## Load the financial contracts dataset

Let's load the pre-vectorized financial contracts dataset from HuggingFace. This dataset is what we will load into our Weaviate collection.

This dataset comes with vectors already created by the Snowflake/snowflake-arctic-embed-l-v2.0 embedding model. When we upload data to Weaviate, the embeddings are created for us by default, but since we already have them, we will upload them along with our original data to save time.

In [None]:
from datasets import load_dataset

# Open the financial contracts dataset from HuggingFace in streaming mode.
dataset = load_dataset(
    "weaviate/agents",
    "query-agent-financial-contracts",
    streaming=True,
    split="train",
)

print("Dataset loaded successfully!")
print("\n--- Sample contract data ---")

# Preview only the first two records; zip() stops once range(1, 3) is used up.
for position, item in zip(range(1, 3), dataset):
    embedding = item["vector"]
    if embedding:
        vector_info = len(embedding)
    else:
        vector_info = "No vector"

    print(f"\nContract {position}:")
    print("Properties:", item["properties"])
    print("Vector length:", vector_info)

## Create a collection for contracts

In [None]:
from weaviate.classes.config import Configure

# Start from a clean slate: drop any earlier copy of the collection.
if client.collections.exists("FinancialContract"):
    client.collections.delete("FinancialContract")

# Vectorizer settings: the "contract_text" property is the embedding source.
contract_vectorizer = Configure.Vectors.text2vec_weaviate(
    source_properties=["contract_text"],
    model="Snowflake/snowflake-arctic-embed-l-v2.0",
)

# The description is stored with the collection so the agent can read it later.
contracts = client.collections.create(
    name="FinancialContract",
    description="A collection of financial contracts with terms, conditions, and legal clauses",
    vector_config=contract_vectorizer,
)

print("Collection 'FinancialContract' created successfully!")

## Load data into Weaviate

Now we'll stream the data from HuggingFace directly into our Weaviate collection.

In [None]:
# Open a fresh stream of the dataset for the import.
dataset = load_dataset(
    "weaviate/agents",
    "query-agent-financial-contracts",
    split="train",
    streaming=True,
)

# Get a handle to the target collection.
contracts = client.collections.get("FinancialContract")

# Send objects in fixed-size batches; each object carries its pre-computed
# vector, so it is passed explicitly alongside the properties.
with contracts.batch.fixed_size(batch_size=100) as batch:
    for item in dataset:
        batch.add_object(
            properties=item["properties"],
            vector=item["vector"],
        )

# Per-object failures are collected on the batch handle rather than raised,
# so check them before reporting success.
failed_objects = contracts.batch.failed_objects
if failed_objects:
    print(f"WARNING: {len(failed_objects)} object(s) failed to import.")
    print("First failure:", failed_objects[0])

print("Data import completed!")
print(f"Total contracts in collection: {len(contracts)}")

## Basic contract exploration

Let's explore what's in our contract collection.

In [None]:
# Headline number: how many objects the collection currently holds.
print("=== Collection Stats ===")
print(f"Total contracts: {len(contracts)}")

# Pull a few objects to see which properties each contract carries.
response = contracts.query.fetch_objects(limit=3)

print("\n=== Sample Contracts ===")
for position, contract in enumerate(response.objects, start=1):
    print(f"\nContract {position}:")
    for prop, value in contract.properties.items():
        # Long strings are cut to 200 characters so the listing stays readable.
        needs_truncation = isinstance(value, str) and len(value) > 200
        shown = f"{value[:200]}..." if needs_truncation else value
        print(f"  {prop}: {shown}")

## Vector search

Now we will write a basic vector search to find contracts by meaning.

In [None]:
# This is a simple function to make our outputs a little prettier
import json
def print_properties(item):
    """Print ``item.properties`` as indented JSON with alphabetically sorted keys.

    Values that ``json`` cannot encode natively are converted with ``str``.
    """
    rendered = json.dumps(item.properties, default=str, sort_keys=True, indent=2)
    print(rendered)

In [None]:
from weaviate.classes.query import MetadataQuery

# Semantic search: look for employment contracts covering roles, pay and benefits.
employment_query = "Employment contracts with job roles, salaries, and employee benefits"

response = contracts.query.near_text(
    query=employment_query,
    return_metadata=MetadataQuery(distance=True),
    limit=3,
)

# Show each hit's properties followed by its distance from the query.
for hit in response.objects:
    print_properties(hit)
    print(hit.metadata.distance)


### But what if we need something more recent from an author we trust?

## Vector search with filters

Let's add some filters to narrow our search to a specific author and date range.


In [None]:
from weaviate.classes.query import Filter
from datetime import datetime, timezone

# Restrict the semantic search to one author and to contracts dated after
# 1 January 2023 (UTC); both conditions must hold.
by_author = Filter.by_property("author").equal("Edward Elric")
after_cutoff = Filter.by_property("date").greater_than(
    datetime(2023, 1, 1, tzinfo=timezone.utc)
)

response = contracts.query.near_text(
    query="I need contracts that have good info on what I need to be looking for when signing a new contract for a job I am going to be getting",
    filters=by_author & after_cutoff,
    return_metadata=MetadataQuery(distance=True),
    limit=3,
)

# Show each hit's properties followed by its distance from the query.
for hit in response.objects:
    print_properties(hit)
    print(hit.metadata.distance)


Can our queries improve? Or is this as good as it gets?

## Query optimization with an LLM

Let's leverage an LLM to help us improve our queries.

In [None]:
import openai
# Initialize the OpenAI client under its own name. Reusing `client` here would
# overwrite the Weaviate connection that other cells rely on.
openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)

# Ask the model to rephrase our rambling search query into better ones.
response = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "user", "content": """I'm searching a vector database of 
  contracts. 
      
  My current query is: "I need contracts that have good info on what I need to be looking for when signing a new contract for a job I am going to be getting. I want to make lot sof money and not be taken advantage of by the man"

  Can you suggest 2-3 better ways to phrase this that 
  would find more relevant results? 
  Just give me the improved queries, nothing else."""}
    ]
)

# Print the text of the first (and only requested) completion.
print(response.choices[0].message.content)

In [None]:
# Re-run the semantic search with one of the LLM-suggested phrasings.
improved_query = "Seeking resources or contracts that detail important factors to review when accepting a new job offer."

response = contracts.query.near_text(
    query=improved_query,
    return_metadata=MetadataQuery(distance=True),
    limit=3,
)

# Show each hit's properties followed by its distance from the query.
for hit in response.objects:
    print_properties(hit)
    print(hit.metadata.distance)

## Generative search - Ask questions about contracts

Now let's use generative search to get explanations about contracts.

In [None]:
from weaviate.classes.config import Reconfigure

# Reuse the existing `contracts` collection handle instead of going through
# `client`: an earlier cell may have rebound `client` to a non-Weaviate object,
# which would make `client.collections` fail here.
financialcontract = contracts

# Attach an OpenAI generative model to the collection so the
# `contracts.generate.*` queries have a model to call.
financialcontract.config.update(
    generative_config=Reconfigure.Generative.openai(
        model="gpt-4o-mini"  # Update the generative model
    )
)

In [None]:
# Per-object generation: the prompt template is filled in from each retrieved
# contract's `contract_type` and `contract_text` properties.
risk_prompt = "Based on this contract content: {contract_type} {contract_text}, what are the main risks or potential issues a business should be aware of? Provide 3 key concerns."

response = contracts.generate.near_text(
    query="contract risks liability issues problems",
    single_prompt=risk_prompt,
    limit=3,
)

print("\nSource contracts and generated outputs:")
for position, contract in enumerate(response.objects, start=1):
    property_names = list(contract.properties.keys())
    print(f"Contract {position}: {property_names}")
    print(f"Generated output: {contract.generative.text}")

In [None]:
# Grouped generation: one answer produced from all retrieved contracts together.
risk_task = "Based on this contract content, what are the main risks or potential issues a business should be aware of? Provide 3 key concerns."
prompt_properties = ["contract_type", "contract_text"]  # Optional, to limit prompt length

response = contracts.generate.near_text(
    query="contract risks liability issues problems",
    grouped_task=risk_task,
    grouped_properties=prompt_properties,
    limit=3,
)

# The grouped result lives on the response itself, not on individual objects.
print("Generated output for all contracts:")
print(response.generative.text)



In [None]:
# Clean up: close whatever object the global name `client` is bound to.
# NOTE(review): this is meant to release the Weaviate connection, so `client`
# must still refer to it here. Confirm no earlier cell has rebound the name
# (for example to an OpenAI client); otherwise the Weaviate connection is left open.
client.close()