In [451]:
import weaviate
import weaviate.classes as wvc
import os
import pandas as pd
import requests
import json
from dotenv import load_dotenv
from sklearn.decomposition import PCA

In [452]:
# Read key/value pairs from a local .env file into the process environment so the
# connection cell below can look up its credentials via os.getenv / os.environ.
load_dotenv()

True

In [453]:
# -- connect to Weaviate Cloud -- #
# The credentials are read from environment variables; the .env file has already
# been loaded by the load_dotenv() call in the previous cell.
REQUIRED_ENV_VARS = ("WCD_URL2", "WCD_API_KEY2", "COHERE_APIKEY2")

# Fail early with an explicit message instead of handing None to the client,
# which would otherwise surface as an unrelated connection/auth error.
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.getenv(name)]
if missing_env_vars:
    raise KeyError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

# Seconds to wait for the startup (REST + gRPC) health checks. Raised above the
# client default because the gRPC health check is the step that fails on a slow
# or high-latency connection.
INIT_TIMEOUT_SECONDS = 30

client = weaviate.connect_to_wcs(
    cluster_url=os.environ["WCD_URL2"],
    auth_credentials=weaviate.auth.AuthApiKey(os.environ["WCD_API_KEY2"]),
    headers={
        # Forwarded to Weaviate so the Cohere vectorizer/generative modules can authenticate.
        "X-Cohere-Api-Key": os.environ["COHERE_APIKEY2"],
    },
    additional_config=wvc.init.AdditionalConfig(
        timeout=wvc.init.Timeout(init=INIT_TIMEOUT_SECONDS)
    ),
)

WeaviateGRPCUnavailableError: 
Weaviate v1.25.2 makes use of a high-speed gRPC API as well as a REST API.
Unfortunately, the gRPC health check against Weaviate could not be completed.

This error could be due to one of several reasons:
- The gRPC traffic at the specified port is blocked by a firewall.
- gRPC is not enabled or incorrectly configured on the server or the client.
    - Please check that the server address and port (pacant1-sandbox-z4lk9273.grpc.weaviate.network:443) are correct.
- your connection is unstable or has a high latency. In this case you can:
    - increase init-timeout in `weaviate.connect_to_local(additional_config=wvc.init.AdditionalConfig(timeout=wvc.init.Timeout(init=X)))`
    - disable startup checks by connecting using `skip_init_checks=True`


In [None]:
# Load the raw CSV exports; both paths are relative to the notebook's working directory.
# movie.csv supplies the movieId / title / genres columns used by the merge and import cells.
movies = pd.read_csv('movie.csv')
# tag.csv supplies the movieId / tag columns; its timestamp / userId columns are dropped
# in the data-preparation cell.
tags = pd.read_csv('tag.csv')

In [None]:
# Default upper bound on how many distinct tags are kept per movie.
max_tags = 5


def truncate_and_remove_duplicates(tag_list, limit=None):
    """Return up to ``limit`` distinct tags from ``tag_list``, in first-seen order.

    Duplicates are removed *before* truncating. Truncating first would throw
    away distinct tags whenever the leading entries repeat (e.g. five copies of
    the same tag would collapse to a single tag even if others follow).

    Args:
        tag_list: Sequence of hashable tags (strings in this notebook).
        limit: Maximum number of tags to keep. Defaults to the module-level
            ``max_tags`` when omitted.

    Returns:
        A new list with at most ``limit`` unique tags; ``tag_list`` is not modified.
    """
    if limit is None:
        limit = max_tags
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_tags = list(dict.fromkeys(tag_list))
    return unique_tags[:limit]

In [None]:
# -- data preparation -- #
# Keep only what is needed to attach tags to movies. errors='ignore' makes the
# cell safe to re-run: on a second run the columns are already gone from `tags`
# and a plain drop would raise KeyError.
tags = tags.drop(columns=['timestamp', 'userId'], errors='ignore')
# Discard rows with any missing value.
tags = tags.dropna()

# One row per movie, holding the list of tags applied to it, then trimmed
# and de-duplicated by truncate_and_remove_duplicates.
tags_grouped = tags.groupby('movieId')['tag'].apply(list).reset_index()
tags_grouped['tag'] = tags_grouped['tag'].apply(truncate_and_remove_duplicates)

# Left join so movies without any tag are kept; movieId is only the join key
# and is not imported, so it is dropped afterwards.
movies_tag = pd.merge(movies, tags_grouped, on='movieId', how='left')
movies_tag = movies_tag.drop(columns=['movieId'])

# The left join leaves NaN in `tag` for untagged movies. Normalise those to an
# empty list so every row carries a list, matching the TEXT_ARRAY property the
# import cell writes this column into.
movies_tag['tag'] = movies_tag['tag'].apply(
    lambda tag_value: tag_value if isinstance(tag_value, list) else []
)

In [None]:
# Cap the dataset at its first 3000 rows before importing.
movies_tag = movies_tag.head(3000)

In [None]:
try:
    # -- create Movies collection -- #
    # Two named vectors ("title" and "tag"), each embedded with Cohere from its own
    # source property and each with its own HNSW index using binary quantization.
    # The vector_index_config must be passed *inside* each text2vec_cohere(...) call;
    # placing it as a sibling list element is a syntax error.
    # NOTE(review): HAMMING is kept from the original configuration; confirm it is the
    # intended distance metric for these Cohere text embeddings.
    movies_collection = client.collections.create(
        name="Movies_new",
        properties=[
            wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="genres", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="tag", data_type=wvc.config.DataType.TEXT_ARRAY),
        ],
        vectorizer_config=[
            wvc.config.Configure.NamedVectors.text2vec_cohere(
                name="title",
                source_properties=["title"],
                vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
                    quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(rescore_limit=50),
                    distance_metric=wvc.config.VectorDistances.HAMMING,
                ),
            ),
            wvc.config.Configure.NamedVectors.text2vec_cohere(
                name="tag",
                source_properties=["tag"],
                vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
                    quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(rescore_limit=50),
                    distance_metric=wvc.config.VectorDistances.HAMMING,
                ),
            ),
        ],
        generative_config=wvc.config.Configure.Generative.cohere(),
    )

    # -- import data into Weaviate -- #
    # Movies without tags carry NaN in `tag` after the left merge; send an empty
    # list instead so the value matches the TEXT_ARRAY property type.
    movie_objs = [
        {
            "title": record["title"],
            "genres": record["genres"],
            "tag": record["tag"] if isinstance(record["tag"], list) else [],
        }
        for record in movies_tag.to_dict("records")
    ]

    # Insert through the collection handle returned by create(). A distinct name is
    # used so the `movies` DataFrame is not overwritten, which would break a re-run
    # of the data-preparation cell.
    insert_result = movies_collection.data.insert_many(movie_objs)
    if insert_result.has_errors:
        # Per-object failures do not raise; report them instead of ignoring them.
        print(f"{len(insert_result.errors)} of {len(movie_objs)} objects failed to import")
finally:
    client.close()  # Close client gracefully

In [454]:
try:
    client.connect()
    # NOTE(review): this reads the "Movies" collection, while the import cell creates
    # "Movies_new" — confirm which collection is meant to be queried.
    movies_collection = client.collections.get("Movies")

    # Step 1: find the single movie whose title vector is closest to the query text.
    response = movies_collection.query.near_text(
        query="taxi driver",
        target_vector="title",
        limit=1
    )

    if not response.objects:
        # Guard against an empty result instead of failing with IndexError.
        print("No movie matched the title query.")
    else:
        seed_tags = response.objects[0].properties.get('tag')
        if not seed_tags:
            # Without tags the second search would run on the text "None" / "[]".
            print("The matched movie has no tags to search with.")
        else:
            # Step 2: use that movie's tags as the query text against the tag vectors.
            response_tag = movies_collection.query.near_text(
                query=str(seed_tags),
                target_vector="tag",
                limit=4
            )
            # Index 1 is the second-closest hit; presumably the closest one is the
            # seed movie itself — TODO confirm.
            if len(response_tag.objects) > 1:
                print(response_tag.objects[1].properties)
            else:
                print("Fewer than two movies matched the tag query.")
finally:
    client.close()

WeaviateGRPCUnavailableError: 
Weaviate v1.25.2 makes use of a high-speed gRPC API as well as a REST API.
Unfortunately, the gRPC health check against Weaviate could not be completed.

This error could be due to one of several reasons:
- The gRPC traffic at the specified port is blocked by a firewall.
- gRPC is not enabled or incorrectly configured on the server or the client.
    - Please check that the server address and port (pacant1-sandbox-z4lk9273.grpc.weaviate.network:443) are correct.
- your connection is unstable or has a high latency. In this case you can:
    - increase init-timeout in `weaviate.connect_to_local(additional_config=wvc.init.AdditionalConfig(timeout=wvc.init.Timeout(init=X)))`
    - disable startup checks by connecting using `skip_init_checks=True`
