In [1]:
import weaviate
from weaviate.classes.config import Configure, Property, DataType
from weaviate.classes.query import Filter
from weaviate.util import generate_uuid5
import subprocess
from contextlib import contextmanager
from typing import List, Union
from tqdm import tqdm
import joblib
import re
from pprint import pprint
import os

### The Weaviate Client

```Python
import weaviate
client = weaviate.connect_to_embedded(
    version="1.26.1",
    headers={
        "X-OpenAI-Api-Key": YOUR_OPENAI_API_KEY
    },
)
```

In [3]:
@contextmanager
def suppress_subprocess_output():
    """
    Temporarily silence every process started through ``subprocess.Popen``.

    While the context is active, ``subprocess.Popen`` is replaced by a wrapper
    that forces ``stdout`` and ``stderr`` to ``subprocess.DEVNULL`` (overriding
    any value the caller supplied). The original ``Popen`` is put back when the
    context exits, even if the body raises.
    """
    real_popen = subprocess.Popen

    def silent_popen(*args, **kwargs):
        # Discard both output streams regardless of what the caller requested.
        kwargs.update(stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        return real_popen(*args, **kwargs)

    # Install the wrapper, hand control to the `with` body, then always restore.
    subprocess.Popen = silent_popen
    try:
        yield
    finally:
        subprocess.Popen = real_popen


In [19]:
# Connect to an embedded Weaviate instance, hiding the output of any
# subprocess started while connecting.
with suppress_subprocess_output():
    client = weaviate.connect_to_embedded(
        persistence_data_path="./.collections",
        environment_variables={
            "ENABLE_API_BASED_MODULES": "true",
            # We will be using a transformer model (vectorizer and reranker modules)
            "ENABLE_MODULES": 'text2vec-transformers, reranker-transformers',
            # The endpoint the Weaviate API will be using to vectorize
            "TRANSFORMERS_INFERENCE_API": "http://127.0.0.1:5000/",
            # The endpoint the Weaviate API will be using to rerank
            "RERANKER_INFERENCE_API": "http://127.0.0.1:5000/",
        },
    )

In [4]:
def print_properties(item):
    """
    Pretty-print the ``properties`` of ``item`` as indented, key-sorted JSON.

    Args:
        item: Any object exposing a ``properties`` attribute that ``json.dumps``
            can serialize. Values JSON cannot encode natively are rendered
            with ``str``.
    """
    # Imported locally because the notebook's import cell does not bring in
    # `json`; without this the call below raises NameError.
    import json

    print(
        json.dumps(
            item.properties,
            indent=2, sort_keys=True, default=str
        )
    )

In [5]:

# Maximum number of leading characters/elements shown for bulky properties.
_TRUNCATION_LIMITS = {'article_content': 100, 'main_vector': 30, 'chunk': 100}


def _format_properties(properties: dict) -> str:
    """Render one properties dict as ``key: value`` lines, shortening bulky values."""
    lines = []
    for key, value in properties.items():
        limit = _TRUNCATION_LIMITS.get(key)
        # Only flag a value as truncated when something was actually cut off.
        if limit is not None and len(value) > limit:
            lines.append(f'{key}: {value[:limit]}...(truncated)\n')
        else:
            lines.append(f'{key}: {value}\n')
    return ''.join(lines)


def print_object_properties(obj: Union[dict, list]) -> None:
    """
    Print the properties of one object, or of several objects, in readable form.

    Each property is written as ``key: value`` on its own line. The values of
    ``article_content`` and ``chunk`` are cut to 100 characters and the value
    of ``main_vector`` to 30 elements; a ``...(truncated)`` marker is appended
    only when the value was really shortened.

    Args:
        obj: A single properties dict, or an iterable of such dicts. When an
            iterable is given, each entry is followed by two blank lines.
    """
    if isinstance(obj, dict):
        text = _format_properties(obj)
    else:
        text = ''.join(_format_properties(item) + "\n\n" for item in obj)

    print(text)

Creating a collection, adding elements to it and querying over it.

## Configuring the database

### Creating a Collection

In [6]:
# Load the example dataset from disk and display its first record.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load data.joblib when it comes from a trusted source.
data = joblib.load("data.joblib")
data[0]

{'place': 'Grand Canyon',
 'state': 'Arizona',
 'description': 'A stunning canyon with vast vistas and incredible geology.',
 'best_season_to_visit': 'Spring, Fall',
 'attractions': 'South Rim, Havasu Falls, Skywalk',
 'budget': 'Moderate',
 'user_ratings': 4.8,
 'last_updated': '2023-10-01T00:00:00Z'}

In [7]:
# Number of records in the dataset
len(data)

20

In [9]:
# Check the Python type of a single record
type(data[0])

dict

In [13]:
# Print the first record, one "key: value" line per property
print_object_properties(data[0])

place: Grand Canyon
state: Arizona
description: A stunning canyon with vast vistas and incredible geology.
best_season_to_visit: Spring, Fall
attractions: South Rim, Havasu Falls, Skywalk
budget: Moderate
user_ratings: 4.8
last_updated: 2023-10-01T00:00:00Z



The dataset is a set of places to visit, with some properties describing each location. The properties here are `place, state, description, best_season_to_visit, attractions, budget, user_ratings, last_updated`. When creating a collection, we must create one property for each key in this dictionary and add the expected datatype. 

### Configuring the Vectorizer

Not every property needs to be vectorized; which ones to include depends on the data and on the information we want to retrieve.

In this case, let's use the following properties to be vectorized:

`place, state, description, best_season_to_visit, attractions, budget`

The values of these properties are appended to each other and then vectorized as a single text. When defining each property, we can choose whether or not its name is also included in the text to be vectorized.

In [9]:
vectorizer_config = [
    Configure.NamedVectors.text2vec_transformers(
        # Name needed to access the vectors of the objects in our collection
        name="vector",
        # Properties used to generate a vector; they are appended to each
        # other when vectorizing
        source_properties=[
            'place',
            'state',
            'description',
            'best_season_to_visit',
            'attractions',
            'budget',
        ],
        # Do not vectorize the collection name. If True, it would be appended
        # at the beginning of the text to be vectorized
        vectorize_collection_name=False,
        # URL used for the calls to the API-based vectorizer
        # (set up in our Flask application)
        inference_url="http://127.0.0.1:5000",
    )
]

### The Properties

In a collection, the features of each data point are called Properties.

In [15]:
# Start from a clean slate: delete the collection if it already exists,
# so the creation cell below always builds it from scratch
if client.collections.exists("example_collection"):
    client.collections.delete("example_collection")

In [16]:
if client.collections.exists('example_collection'):
    # The collection is already there: just fetch a handle to it
    collection = client.collections.get("example_collection")
else:
    # Create the collection only when it does not exist yet
    collection = client.collections.create(
        name='example_collection',
        # The vectorizer config we defined before
        vectorizer_config=vectorizer_config,
        # The reranker config
        reranker_config=Configure.Reranker.transformers(),
        properties=[
            # Text properties: the property name is vectorized together with the value
            *(
                Property(name=text_field, vectorize_property_name=True, data_type=DataType.TEXT)
                for text_field in (
                    "place",
                    "state",
                    "description",
                    "best_season_to_visit",
                    "attractions",
                    "budget",
                )
            ),
            # Non-text properties
            Property(name="user_ratings", data_type=DataType.NUMBER),
            Property(name="last_updated", data_type=DataType.DATE),
        ],
    )

Running it creates a collection and returns the collection. Printing it shows the collection configuration.

In [17]:
# Printing the collection shows its configuration
print(collection)

<weaviate.Collection config={
  "name": "Example_collection",
  "description": null,
  "generative_config": null,
  "inverted_index_config": {
    "bm25": {
      "b": 0.75,
      "k1": 1.2
    },
    "cleanup_interval_seconds": 60,
    "index_null_state": false,
    "index_property_length": false,
    "index_timestamps": false,
    "stopwords": {
      "preset": "en",
      "additions": null,
      "removals": null
    }
  },
  "multi_tenancy_config": {
    "enabled": false,
    "auto_tenant_creation": false,
    "auto_tenant_activation": false
  },
  "properties": [
    {
      "name": "place",
      "description": null,
      "data_type": "text",
      "index_filterable": true,
      "index_range_filters": false,
      "index_searchable": true,
      "nested_properties": null,
      "tokenization": "word",
      "vectorizer_config": null,
      "vectorizer": null,
      "vectorizer_configs": {
        "text2vec-transformers": {
          "skip": false,
          "vectorize_property_

If we try to create a collection that already exists, an exception will be thrown:

In [18]:
# Demonstration: creating a collection whose name is already taken fails.
try:
    collection = client.collections.create(
        name='example_collection',
        # The vectorizer config we defined before
        vectorizer_config=vectorizer_config,
        properties=[
            *(
                Property(name=text_field, vectorize_property_name=True, data_type=DataType.TEXT)
                for text_field in (
                    "place",
                    "state",
                    "description",
                    "best_season_to_visit",
                    "attractions",
                    "budget",
                )
            ),
            Property(name="user_ratings", data_type=DataType.NUMBER),
            Property(name="last_updated", data_type=DataType.DATE),
        ],
    )
except Exception as err:
    # Show the error message instead of stopping the notebook
    print(err)

Collection may not have been created properly.! Unexpected status code: 422, with response body: {'error': [{'message': 'class name Example_collection already exists'}]}.


In [19]:
# Names of all collections currently known to the client
# (the output shows the name stored with a capital first letter)
client.collections.list_all().keys()

dict_keys(['Example_collection'])

### Adding elements into a Collection

When we add an element, two important steps happen in the background:

1. The information is vectorized (as configured in the collection definition)
2. The HNSW index is updated to optimize search. 

In [20]:
# Set up a batch process with a fixed size (one object per batch) and a single
# concurrent request
with collection.batch.fixed_size(batch_size=1, concurrent_requests=1) as batch:
    # Iterate over every document in the dataset
    for document in tqdm(data):
        # Generate a UUID from the whole document dict for unique identification
        uuid = generate_uuid5(document)

        # Add the object to the batch with properties and UUID.
        # properties expects a dictionary with the keys being the properties.
        batch.add_object(
            properties=document,
            uuid=uuid,
        )

# Per the Weaviate client docs, objects that could not be imported are
# collected on the batch rather than raised, so report them explicitly.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

100%|██████████| 20/20 [00:06<00:00,  3.01it/s]


In [21]:
# Number of objects now stored in the collection
len(collection)

20

## Querying on a collection

- Query on metadata
- Query with semantic search
- Query with BM25
- Query with filtering

### Filters

In [22]:
# Fetch at most 2 objects, keeping only those whose 'user_ratings' value is
# greater than or equal to 3.5
result = collection.query.fetch_objects(
    limit=2,
    filters=Filter.by_property('user_ratings').greater_or_equal(3.5),
)

In [23]:
# Display the raw query return value
result

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('c99763a3-46a0-59d4-831b-af9bc290260c'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'state': 'California', 'best_season_to_visit': 'Spring', 'place': 'Hollywood', 'description': 'Famous district in Los Angeles known as the entertainment capital of the world.', 'user_ratings': 4.2, 'budget': 'Moderate', 'attractions': 'Walk of Fame, Hollywood Sign', 'last_updated': datetime.datetime(2023, 10, 1, 0, 0, tzinfo=datetime.timezone.utc)}, references=None, vector={}, collection='Example_collection'), Object(uuid=_WeaviateUUIDInt('9e5ba590-8c75-5b53-9b0a-8a9c161004ad'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'state': 'New York', 'best_season_to_visit': 'Winter', 'place

In [24]:
# The matched objects are available as a list on `.objects`
result.objects

[Object(uuid=_WeaviateUUIDInt('c99763a3-46a0-59d4-831b-af9bc290260c'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'state': 'California', 'best_season_to_visit': 'Spring', 'place': 'Hollywood', 'description': 'Famous district in Los Angeles known as the entertainment capital of the world.', 'user_ratings': 4.2, 'budget': 'Moderate', 'attractions': 'Walk of Fame, Hollywood Sign', 'last_updated': datetime.datetime(2023, 10, 1, 0, 0, tzinfo=datetime.timezone.utc)}, references=None, vector={}, collection='Example_collection'),
 Object(uuid=_WeaviateUUIDInt('9e5ba590-8c75-5b53-9b0a-8a9c161004ad'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'state': 'New York', 'best_season_to_visit': 'Winter', 'place': 'Times Square', 

In [25]:
# Take the first returned object to inspect it on its own
obj = result.objects[0]

In [26]:
# The property values of the object, as a dictionary
obj.properties

{'state': 'California',
 'best_season_to_visit': 'Spring',
 'place': 'Hollywood',
 'description': 'Famous district in Los Angeles known as the entertainment capital of the world.',
 'user_ratings': 4.2,
 'budget': 'Moderate',
 'attractions': 'Walk of Fame, Hollywood Sign',
 'last_updated': datetime.datetime(2023, 10, 1, 0, 0, tzinfo=datetime.timezone.utc)}

### Semantic Search

In [27]:
# Semantic search: retrieve up to 4 objects for the given query text
result = collection.query.near_text(
    query='I want suggestions to travel during Winter. I want cheap places.',
    limit=4,
)

In [28]:
# Iterate over the result objects and print the properties of each one
for obj in result.objects:
    print_object_properties(obj.properties)

state: New York
budget: Low
place: Times Square
description: Bustling pedestrian intersection and major commercial hub.
user_ratings: 4.3
best_season_to_visit: Winter
attractions: Broadway Theaters, New Year’s Eve Ball Drop
last_updated: 2023-10-01 00:00:00+00:00

state: Montana
best_season_to_visit: Summer
place: Glacier National Park
description: Park known for its rugged mountains and alpine forests.
user_ratings: 4.8
budget: Moderate
attractions: Going-to-the-Sun Road, Grinnell Glacier
last_updated: 2023-10-01 00:00:00+00:00

state: Utah
best_season_to_visit: Spring, Fall
place: Zion National Park
description: Beautiful park known for its impressive canyons and towering cliffs.
user_ratings: 4.7
budget: Moderate
attractions: The Narrows, Angels Landing
last_updated: 2023-10-01 00:00:00+00:00

state: Massachusetts
budget: Moderate
place: Cape Cod
description: Popular tourist destination known for its beaches and quaint towns.
user_ratings: 4.5
best_season_to_visit: Summer
attraction

Now let's run the same query, restricted to the elements with `budget = Low`:

In [29]:
# Same semantic query, restricted to objects whose 'budget' equals 'Low'
result = collection.query.near_text(
    query='I want suggestions to travel during Winter. I want cheap places.',
    filters=Filter.by_property('budget').equal('Low'),
    limit=4,
)

In [30]:
# Iterate over the result objects and print the properties of each one
for obj in result.objects:
    print_object_properties(obj.properties)

state: New York
best_season_to_visit: Winter
place: Times Square
description: Bustling pedestrian intersection and major commercial hub.
user_ratings: 4.3
budget: Low
attractions: Broadway Theaters, New Year’s Eve Ball Drop
last_updated: 2023-10-01 00:00:00+00:00

state: California
best_season_to_visit: Spring, Summer
place: Alcatraz Island
description: Famed former prison island located in San Francisco Bay.
user_ratings: 4.4
budget: Low
attractions: Cellhouse Tour, Alcatraz Lighthouse
last_updated: 2023-10-01 00:00:00+00:00

state: Pennsylvania
best_season_to_visit: Spring, Fall
place: Gettysburg National Military Park
description: Historic site of a major Civil War battle.
user_ratings: 4.6
budget: Low
attractions: Gettysburg Museum, Battlefield Tours
last_updated: 2023-10-01 00:00:00+00:00

state: New York
best_season_to_visit: Spring, Fall
place: Statue of Liberty
description: Iconic symbol of freedom and democracy in the United States.
user_ratings: 4.5
budget: Low
attractions: E

A filter can also accept several possible values by using `.contains_any`:

In [31]:
# Same semantic query, restricted to objects whose 'budget' is 'Low' or 'Moderate'
result = collection.query.near_text(
    query='I want suggestions to travel during Winter. I want cheap places.',
    filters=Filter.by_property('budget').contains_any(['Low', 'Moderate']),
    limit=4,
)

In [32]:
# Iterate over the result objects and print the properties of each one
for obj in result.objects:
    print_object_properties(obj.properties)

state: New York
budget: Low
place: Times Square
description: Bustling pedestrian intersection and major commercial hub.
user_ratings: 4.3
best_season_to_visit: Winter
attractions: Broadway Theaters, New Year’s Eve Ball Drop
last_updated: 2023-10-01 00:00:00+00:00

state: Montana
budget: Moderate
place: Glacier National Park
description: Park known for its rugged mountains and alpine forests.
user_ratings: 4.8
best_season_to_visit: Summer
attractions: Going-to-the-Sun Road, Grinnell Glacier
last_updated: 2023-10-01 00:00:00+00:00

state: Utah
best_season_to_visit: Spring, Fall
place: Zion National Park
description: Beautiful park known for its impressive canyons and towering cliffs.
user_ratings: 4.7
budget: Moderate
attractions: The Narrows, Angels Landing
last_updated: 2023-10-01 00:00:00+00:00

state: Massachusetts
best_season_to_visit: Summer
place: Cape Cod
description: Popular tourist destination known for its beaches and quaint towns.
user_ratings: 4.5
budget: Moderate
attraction

### BM25 search

In [33]:
# BM25 search with the same query text and the same budget filter
result = collection.query.bm25(
    query='I want suggestions to travel during Winter. I want cheap places.',
    filters=Filter.by_property('budget').contains_any(['Low', 'Moderate']),
    limit=4,
)

In [34]:
# Iterate over the result objects and print the properties of each one
for obj in result.objects:
    print_object_properties(obj.properties)

state: New York
budget: Low
place: Times Square
description: Bustling pedestrian intersection and major commercial hub.
user_ratings: 4.3
best_season_to_visit: Winter
attractions: Broadway Theaters, New Year’s Eve Ball Drop
last_updated: 2023-10-01 00:00:00+00:00



### Hybrid Search -- RRF

In [35]:
# Hybrid search with the same query text and the same budget filter.
# Per the Weaviate docs, `alpha` weights vector search against keyword search
# (0 = keyword only, 1 = vector only) - confirm for the client version in use.
# NOTE(review): the section title mentions RRF, but no `fusion_type` is passed
# here, so the server's default fusion method applies - confirm it is the
# intended one.
result = collection.query.hybrid(
    query='I want suggestions to travel during Winter. I want cheap places.',
    filters=Filter.by_property('budget').contains_any(['Low', 'Moderate']),
    alpha=0.3,
    limit=4,
)

In [36]:
# Iterate over the result objects and print the properties of each one
for obj in result.objects:
    print_object_properties(obj.properties)

state: New York
best_season_to_visit: Winter
place: Times Square
description: Bustling pedestrian intersection and major commercial hub.
user_ratings: 4.3
budget: Low
attractions: Broadway Theaters, New Year’s Eve Ball Drop
last_updated: 2023-10-01 00:00:00+00:00

state: Montana
best_season_to_visit: Summer
place: Glacier National Park
description: Park known for its rugged mountains and alpine forests.
user_ratings: 4.8
budget: Moderate
attractions: Going-to-the-Sun Road, Grinnell Glacier
last_updated: 2023-10-01 00:00:00+00:00

state: Utah
best_season_to_visit: Spring, Fall
place: Zion National Park
description: Beautiful park known for its impressive canyons and towering cliffs.
user_ratings: 4.7
budget: Moderate
attractions: The Narrows, Angels Landing
last_updated: 2023-10-01 00:00:00+00:00

state: Massachusetts
budget: Moderate
place: Cape Cod
description: Popular tourist destination known for its beaches and quaint towns.
user_ratings: 4.5
best_season_to_visit: Summer
attraction

### Reranking

In [37]:
from weaviate.classes.query import Rerank

# Semantic search followed by a reranking step over the retrieved objects.
# The query text previously carried a stray pair of single quotes inside the
# string literal, which were sent to the vectorizer as part of the text.
response = collection.query.near_text(
    query="I want suggestions to travel during Winter. I want cheap and fun places.",
    limit=5,
    rerank=Rerank(
        prop="attractions",  # The property to rerank on
        query="Fun places",  # If not provided, the original query will be used
    ),
)

In [44]:
# Iterate over the reranked result objects and print the properties of each one
for obj in response.objects:
    print_object_properties(obj.properties)

state: Florida
best_season_to_visit: Winter
place: Epcot Center
description: Theme park at Walt Disney World Resort known for celebrating human achievement.
user_ratings: 4.6
budget: High
attractions: Spaceship Earth, World Showcase
last_updated: 2023-10-01 00:00:00+00:00

state: Montana
budget: Moderate
place: Glacier National Park
description: Park known for its rugged mountains and alpine forests.
user_ratings: 4.8
best_season_to_visit: Summer
attractions: Going-to-the-Sun Road, Grinnell Glacier
last_updated: 2023-10-01 00:00:00+00:00

state: Utah
budget: Moderate
place: Zion National Park
description: Beautiful park known for its impressive canyons and towering cliffs.
user_ratings: 4.7
best_season_to_visit: Spring, Fall
attractions: The Narrows, Angels Landing
last_updated: 2023-10-01 00:00:00+00:00

state: Massachusetts
best_season_to_visit: Summer
place: Cape Cod
description: Popular tourist destination known for its beaches and quaint towns.
user_ratings: 4.5
budget: Moderate
a

In [45]:
# Close the client connection now that the walkthrough is finished
client.close()