In [66]:
import os
import weaviate
from weaviate.client import WeaviateClient
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure, Property, DataType
from dotenv import load_dotenv
import pandas as pd
from typing import List, Dict, Any


# Load environment variables (`DEMO_WEAVIATE_URL` and `DEMO_WEAVIATE_RO_KEY`)
# from the provided `.env` file.
load_dotenv("weaviate-setup/.env")

True

In [8]:
def connect_to_demo_db() -> WeaviateClient:
    """
    Helper function to connect to the demo Weaviate database.
    This database instance has the necessary data loaded.

    Reads the cluster URL from `DEMO_WEAVIATE_URL` and the API key from
    `DEMO_WEAVIATE_RO_KEY`. When `OPENAI_API_KEY` is set, it is forwarded as
    the `X-OpenAI-Api-Key` header; when it is not set, no extra headers are sent.

    Returns:
        The client returned by `weaviate.connect_to_weaviate_cloud`. It is
        returned without being closed.

    Raises:
        ValueError: If `DEMO_WEAVIATE_URL` or `DEMO_WEAVIATE_RO_KEY` is unset or empty.
    """
    cluster_url = os.getenv("DEMO_WEAVIATE_URL")
    api_key = os.getenv("DEMO_WEAVIATE_RO_KEY")

    # Fail early with a readable message instead of handing `None` to the client.
    required = (("DEMO_WEAVIATE_URL", cluster_url), ("DEMO_WEAVIATE_RO_KEY", api_key))
    missing = [name for name, value in required if not value]
    if missing:
        raise ValueError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    # Only send the OpenAI header when a key is configured; a header whose
    # value is `None` is not a valid HTTP header.
    openai_key = os.getenv("OPENAI_API_KEY")
    headers = {"X-OpenAI-Api-Key": openai_key} if openai_key else None

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=cluster_url,
        auth_credentials=Auth.api_key(api_key),
        headers=headers,
    )
    print(client.is_ready())  # Should print: True
    return client

def connect_to_local_db() -> WeaviateClient:
    """
    Open a connection to a Weaviate instance via `weaviate.connect_to_local()`.

    A local Weaviate instance with the necessary data loaded is expected to be
    running. The result of the readiness check is printed, then the client is
    returned.
    """
    local_client = weaviate.connect_to_local()
    ready = local_client.is_ready()
    print(ready)
    return local_client

def pandas_csv_to_dict(file_path: str, delimiter: str = ',', encoding: str = 'utf-8') -> List[Dict[str, Any]]:
    """
    Read a CSV file with pandas and convert it to a list of dictionaries.

    Each row becomes one dictionary keyed by column name. Missing cells
    (which pandas reads as NaN) are returned as `None`, so callers can slice
    or test values without special-casing float NaN.

    Args:
        file_path: Path to CSV file
        delimiter: Column delimiter
        encoding: File encoding
    Returns:
        List of dictionaries, one per row, with missing values set to `None`
    Raises:
        FileNotFoundError: If `file_path` does not exist.
        Exception: For any other error raised while reading or converting the file.
    """
    try:
        df = pd.read_csv(file_path, delimiter=delimiter, encoding=encoding)
        records = df.to_dict(orient='records', into=dict)

        # Replace pandas' missing-value markers with None, cell by cell.
        return [
            {column: (None if pd.isna(value) else value) for column, value in record.items()}
            for record in records
        ]

    except FileNotFoundError as e:
        # Chain the original error so the full traceback stays available.
        raise FileNotFoundError(f"File not found: {file_path}") from e
    except Exception as e:
        raise Exception(f"Error reading CSV with pandas: {e}") from e

### EXTRACT DATA 

In [96]:
# Location of the movies CSV. Set the MOVIES_CSV_PATH environment variable to
# point at your own copy; otherwise the original author's path is used.
file_path = os.getenv(
    'MOVIES_CSV_PATH',
    '/Users/phlip7/Works/repos/data-ai-training/machine-learning/vector-databases/data/movies_data.csv',
)

lst_data = pandas_csv_to_dict(file_path, delimiter=',', encoding='utf-8')
print('original file nb records', len(lst_data))

# Keep only the movie title and description, renamed to the keys that are
# later passed as object properties.
data_objects = [
    {
        'Title': item.get('Movie Title'),
        'Description': item.get('Description')
    }
    for item in lst_data
]

print(f"Extracted {len(data_objects)} records")
print("\nFirst 3 records:")
for i, record in enumerate(data_objects[:3], start=1):
    description = record['Description']
    # A missing CSV cell is not a string and cannot be sliced; preview it as empty.
    preview = description[:100] if isinstance(description, str) else ''
    print(f"\n{i}. {record['Title']}")
    print(f"   Description: {preview}...")  # Show first 100 chars


original file nb records 120
Extracted 120 records

First 3 records:

1. Arctic Chuckles
   Description: Penguins trying stand-up comedy to uplift spirits in the cold winter....

2. Ballad of the Lonely Lighthouse
   Description: A reclusive lighthouse keeper's life is illuminated by unexpected friendships, challenging his solit...

3. Ballet & Bullets
   Description: An unexpected crime comedy about a mob boss who enrolls in a ballet class as a disguise....


### LOAD DATA INTO WEAVIATE

In [None]:
# Name of the collection used by this cell and by every cell below.
collection_name = 'Movies'

# Connect to your local Weaviate instance
client = connect_to_local_db()

# Everything that talks to the server runs inside try/finally so the
# connection is closed even when a step fails.
try:
    # List all collections in the Weaviate instance
    response = client.collections.list_all(simple=False)
    print(response)

    # Delete the collection if it already exists, so the import starts clean
    if client.collections.exists(collection_name):
        client.collections.delete(collection_name)

    # Create the collection; the returned handle is used directly for the import
    collec = client.collections.create(
        name=collection_name,
        vector_config=Configure.Vectors.text2vec_ollama(  # Configure the Ollama embedding integration
            api_endpoint="http://ollama:11434",  # If using Docker you might need: http://host.docker.internal:11434
            model="nomic-embed-text",  # The model to use
        ),
        generative_config=Configure.Generative.ollama(
            api_endpoint="http://ollama:11434",
            model="llama3"  # or any other model you have available
        ),
        properties=[
            Property(name="Title", data_type=DataType.TEXT),
            Property(name="Description", data_type=DataType.TEXT)
        ]
    )

    # Insert data into the collection
    with collec.batch.fixed_size(batch_size=200) as batch:
        for obj in data_objects:
            batch.add_object(properties=obj)

    # Objects rejected during the batch are collected on the batch wrapper
    # (see the Weaviate batch import docs); report them rather than ignoring them.
    failed_objects = collec.batch.failed_objects
    if failed_objects:
        print(f"Failed to import {len(failed_objects)} objects. First failure: {failed_objects[0]}")

    print(f"Imported & vectorized {len(collec)} objects into the {collection_name}  collection")
finally:
    client.close()

True
{'Jeopardy': _CollectionConfig(name='Jeopardy', description=None, generative_config=None, inverted_index_config=_InvertedIndexConfig(bm25=_BM25Config(b=0.75, k1=1.2), cleanup_interval_seconds=60, index_null_state=False, index_property_length=False, index_timestamps=False, stopwords=_StopwordsConfig(preset=<StopwordsPreset.EN: 'en'>, additions=None, removals=None)), multi_tenancy_config=_MultiTenancyConfig(enabled=False, auto_tenant_creation=False, auto_tenant_activation=False), properties=[_Property(name='round', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer=None, vectorizer_configs={'text2vec-ollama': _PropertyVectorizerConfig(skip=False, vectorize_property_name=False)}), _Property(name='question', description=None, data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, in

### CHECK LOADED DATA

In [103]:
# Re-open the connection that the previous cell closed; always close it again,
# even if one of the queries below raises.
client.connect()
try:
    collec = client.collections.use(collection_name)

    # Count records
    response = collec.aggregate.over_all(total_count=True)
    print(response.total_count)

    # Select 3 records. Property names are written in lowercase here, matching
    # the keys of the returned objects and the other query cells.
    result = collec.query.fetch_objects(
        limit=3,
        return_properties=["title", "description"]
    )

    for obj in result.objects:
        print(obj.properties)
finally:
    client.close()

120
{'description': 'A postman discovers a hidden town where letters come alive, each with stories yearning to be delivered.', 'title': 'Labyrinth of Letters'}
{'description': 'An AI designed for serious tasks starts experiencing humor, leading to unexpected and comical situations.', 'title': "Robo's First Laugh"}
{'description': 'Set in the year 2200, Stellar Horizon follows a crew of astronauts on a daring mission to explore a distant galaxy and uncover its hidden mysteries.', 'title': 'Stellar Horizon'}


### Vector searches (semantic search)

In [107]:
import weaviate.classes as wvc

# Re-open the connection for this cell; the finally block closes it even if a
# query raises.
client.connect()
try:
    collec = client.collections.use(collection_name)

    # Run one vector search per query term and show the two closest matches.
    for query in ["love", "amorous", "adventure movie set in outer galaxy"]:
        response = collec.query.near_text(  # Vector search
            query=query,
            limit=2,
            return_metadata=wvc.query.MetadataQuery(distance=True),
        )

        print(f"===== Search results for '{query}'. =====")  # Print the query term
        for o in response.objects:
            print(o.properties["title"])            # Show which titles were found
            print(o.properties["description"])      # Show the description
            print(f"{o.metadata.distance:.3f}\n")   # What was the distance?
finally:
    client.close()

===== Search results for 'love'. =====
Love in Binary
Two engineers find love while developing a groundbreaking communication technology.
0.300

Love Beyond Codes
Two AI robots fall in love, creating chaos and comedy in their tech world.
0.312

===== Search results for 'amorous'. =====
Love in Binary
Two engineers find love while developing a groundbreaking communication technology.
0.441

Love Beyond Codes
Two AI robots fall in love, creating chaos and comedy in their tech world.
0.450

===== Search results for 'adventure movie set in outer galaxy'. =====
Galactic Odyssey
In a future where humanity has colonized distant planets, a space explorer embarks on a perilous journey to save Earth from an alien threat.
0.252

Stellar Horizon
Set in the year 2200, Stellar Horizon follows a crew of astronauts on a daring mission to explore a distant galaxy and uncover its hidden mysteries.
0.268



### Keyword filtering and Keyword searches

In [110]:
import weaviate.classes as wvc
from weaviate.classes.query import Filter

# Re-open the connection for this cell; the finally block closes it even if a
# query raises.
client.connect()
try:
    collec = client.collections.use(collection_name)

    # Keyword filtering: results are returned without a relevance score.
    # Named `title_filter` so the builtin `filter` is not shadowed.
    title_filter = Filter.by_property("title").like("love")
    response = collec.query.fetch_objects(
        limit=3,
        filters=title_filter,
        return_properties=["title", "description"]
    )
    print("===== Keyword filter results for 'love' in title =====")
    for o in response.objects:
        print(o.properties["title"])                # Show which titles were found
        print(o.properties["description"], "\n")    # Show the description

    # Keyword searches produce a score indicating relevance of the result
    response_search = collec.query.bm25(
        query="love",
        limit=3,
        return_metadata=wvc.query.MetadataQuery(score=True)
    )
    print("===== Keyword search results for 'love' =====")
    for obj in response_search.objects:
        print(obj.properties["title"])
        print(obj.properties["description"])
        print(f"{obj.metadata.score:.3f}\n")  # What was the score?
finally:
    client.close()

===== Keyword filter results for 'love' in title =====
Labyrinths of Love
An Italian romance drama where two estranged lovers find themselves lost in the same maze. 

Love Beyond Codes
Two AI robots fall in love, creating chaos and comedy in their tech world. 

Love in Binary
Two engineers find love while developing a groundbreaking communication technology. 

===== Keyword search results for 'love' =====
Love in Binary
Two engineers find love while developing a groundbreaking communication technology.
2.778

Love Beyond Codes
Two AI robots fall in love, creating chaos and comedy in their tech world.
2.622

Love in Venice
A romantic drama set against the backdrop of the picturesque canals of Venice, where two strangers find love in the most unexpected way.
2.365



### Hybrid search

In [94]:
# Hybrid search performs 2 searches under the hood
# - vector search
# - keyword search
# and a fusion algorithm then combines the two result sets

import weaviate.classes as wvc


def print_hybrid_results(objects):
    """Print title, score and score explanation for each object, followed by a blank line."""
    for o in objects:
        print(o.properties["title"])                        # Show which titles were found
        print(f"score: {o.metadata.score:.3f}")             # What was the score?
        print(f"explain_score: {o.metadata.explain_score}") # How the score was computed
        print()


# Re-open the connection for this cell; the finally block closes it even if a
# query raises.
client.connect()
try:
    collec = client.collections.use(collection_name)

    # Hybrid search with the default weighting between vector and keyword results
    response = collec.query.hybrid(
        query="stellar",
        limit=3,
        # Fetch the score and explain_score
        return_metadata=wvc.query.MetadataQuery(score=True, explain_score=True),
    )
    print_hybrid_results(response.objects)

    alpha = 0  # Effectively a keyword search
    response = collec.query.hybrid(
        query="stellar",
        limit=3,
        alpha=alpha,
        return_metadata=wvc.query.MetadataQuery(score=True, explain_score=True),
    )

    print(f"===== Results with alpha: {alpha} =====")
    print_hybrid_results(response.objects)
finally:
    client.close()

Stellar Horizon
score: 1.000
explain_score: 
Hybrid (Result Set keyword,bm25) Document f0aec210-51e3-4732-b073-6f0ee99ffe9a: original score 3.9484837, normalized score: 0.3 - 
Hybrid (Result Set vector,hybridVector) Document f0aec210-51e3-4732-b073-6f0ee99ffe9a: original score 0.62502515, normalized score: 0.7

Celestial Canvas
score: 0.661
explain_score: 
Hybrid (Result Set vector,hybridVector) Document 93b3d3ca-6d87-4795-95b9-7f23fb2761ac: original score 0.61160654, normalized score: 0.661443

Glittering Gossamer Galaxy
score: 0.502
explain_score: 
Hybrid (Result Set vector,hybridVector) Document b0a38f28-4484-4696-a9f7-4a93ab3123a6: original score 0.55611145, normalized score: 0.5019836

===== Results with alpha: 0 =====
Stellar Horizon
score: 1.000
explain_score: 
Hybrid (Result Set keyword,bm25) Document f0aec210-51e3-4732-b073-6f0ee99ffe9a: original score 3.9484837, normalized score: 1



### RAG (Retrieval-Augmented Generation)

In [None]:
# RAG (Retrieval-Augmented Generation)
# Search for relevant documents and use them to answer a question with an LLM.
# Two-step process:
# - Perform search
# - Send retrieved data and task to generative model
# Reduces generative AI model shortcomings such as:
# - Hallucinations
# - Lack of data

# Re-open the connection for this cell; the finally block closes it even if a
# query or generation call raises.
client.connect()
try:
    collec = client.collections.use(collection_name)

    # Single prompt: the prompt is filled in per retrieved object, and each
    # object carries its own generated text.
    response = collec.generate.near_text(
        query="science fiction",
        limit=3,
        single_prompt="""
    Summarize the description:
    {description} for this movie {title}.
    """
    )
    # The original cell had a bare `print` here, which prints nothing.
    print("===== RAG single prompt output =====")
    for o in response.objects:
        print(o.properties["title"])    # Show which titles were found
        print(o.generated)              # RAG output
        print()

    # Grouped task: one generated answer for the whole result set.
    response = collec.generate.near_text(
        query="science fiction",
        limit=10,
        grouped_task="""
    Are there any common themes in these movies?
    Explain 2 in very short points,  and list the relevant movies:
    """
    )
    print("===== RAG grouped task output =====")
    print(response.generated)  # Print the generated text
finally:
    client.close()

Starstruck Lovers
The movie "Starstruck Lovers" is a romantic story that takes place in space. An astronaut and an alien scientist meet and discover that despite being from different species, they have more in common than they thought. The film follows their journey as they fall in love, exploring the possibility of cosmic romance between humans and aliens.

Solaris Ascendant
Here is a summary of the description:

"Solaris Ascendant" is a science fiction film set in a futuristic world where advanced technology and magical/mystical forces coexist. The story follows a rebellious "technomancer" (a person who combines magic with technology) as they search for a hidden truth.

The Clockwork Constellations
The movie "The Clockwork Constellations" is set in a steampunk universe and follows the story of an inventor who tries to realign the stars, leading to a cosmic adventure.

Here are 2 common themes found in these movies, explained in very short points and listed with relevant movie titles: