In [439]:
import weaviate
import weaviate.classes as wvc
import os
import pandas as pd
import requests
import json
from dotenv import load_dotenv
from sklearn.decomposition import PCA

In [440]:
# Load variables from a local .env file into the process environment.
# Presumably this is where the WCD_* / COHERE_* settings read by the
# connection cell are defined -- confirm against your .env.
load_dotenv()

True

In [441]:
# Re-load .env so this cell also works when run on its own.
load_dotenv()

# Connect to the Weaviate Cloud cluster. Every setting is read with
# os.environ[...] so a missing variable fails immediately with a KeyError that
# names it, instead of silently passing None into the client.
client = weaviate.connect_to_wcs(
    cluster_url=os.environ["WCD_URL2"],
    auth_credentials=weaviate.auth.AuthApiKey(os.environ["WCD_API_KEY2"]),
    headers={
        # Forwarded with each request for the Cohere vectorizer/generative modules.
        "X-Cohere-Api-Key": os.environ["COHERE_APIKEY2"],
    },
)

In [442]:
# Read the two raw tables from the working directory:
#   movie.csv -> `movies` (joined on movieId; title/genres are used later)
#   tag.csv   -> `tags`   (tag rows keyed by movieId)
movies, tags = pd.read_csv('movie.csv'), pd.read_csv('tag.csv')

In [443]:
# Default upper bound on the number of tags kept per movie.
max_tags = 5


def truncate_and_remove_duplicates(tag_list, limit=None):
    """Return at most ``limit`` distinct tags, preserving first-seen order.

    Duplicates are removed *before* truncating. Truncating first (the previous
    behaviour) let repeated tags at the head of the list crowd out later unique
    tags, so a movie could end up with fewer tags than it actually has.

    Args:
        tag_list: Iterable of hashable tag values.
        limit: Maximum number of tags to keep. Defaults to the module-level
            ``max_tags`` when omitted.

    Returns:
        A new list with up to ``limit`` unique tags.
    """
    if limit is None:
        limit = max_tags
    # dict.fromkeys de-duplicates while keeping insertion order.
    unique_tags = list(dict.fromkeys(tag_list))
    return unique_tags[:limit]

In [444]:
# -- data preparation -- #
# Drop the columns not needed for the import, then every row that still has a
# missing value. `tags` is rebound here, so errors='ignore' keeps the cell
# re-runnable: on a second run the columns are already gone and a plain
# drop() would raise KeyError.
tags = tags.drop(columns=['timestamp', 'userId'], errors='ignore').dropna()

# One row per movieId with its tags collected into a list, then trimmed and
# de-duplicated by truncate_and_remove_duplicates.
tags_grouped = tags.groupby('movieId')['tag'].apply(list).reset_index()
tags_grouped['tag'] = tags_grouped['tag'].apply(truncate_and_remove_duplicates)

# Left join keeps every movie; movies without any tag row get NaN in `tag`.
# movieId is only needed as the join key, so it is dropped afterwards.
movies_tag = (
    movies
    .merge(tags_grouped, on='movieId', how='left')
    .drop(columns=['movieId'])
)

In [445]:
# Keep only the first 3000 rows (presumably to bound the size of the import
# below -- confirm). Re-running this cell is a no-op.
movies_tag = movies_tag.head(3000)

In [446]:
try:
    # -- create Movies collection -- #
    # Two named vectors ("title" and "tag"), each embedding only its own
    # property. The HNSW/BQ index settings must be passed *inside* each
    # text2vec_cohere(...) call via vector_index_config; a bare
    # `vector_index_config=...` entry in the list is a SyntaxError.
    # NOTE(review): HAMMING is kept as the configured distance metric next to
    # binary quantization -- confirm this is intended rather than the default.
    questions = client.collections.create(
        name="Movies",
        properties=[
            wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="genres", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="tag", data_type=wvc.config.DataType.TEXT_ARRAY),
        ],
        vectorizer_config=[
            wvc.config.Configure.NamedVectors.text2vec_cohere(
                name="title",
                source_properties=["title"],
                vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
                    quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(rescore_limit=50),
                    distance_metric=wvc.config.VectorDistances.HAMMING,
                ),
            ),
            wvc.config.Configure.NamedVectors.text2vec_cohere(
                name="tag",
                source_properties=["tag"],
                vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
                    quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq(rescore_limit=50),
                    distance_metric=wvc.config.VectorDistances.HAMMING,
                ),
            ),
        ],
        generative_config=wvc.config.Configure.Generative.cohere(),
    )

    # -- import data into Weaviate -- #
    # Movies without any tag carry NaN in `tag` after the left merge; NaN is
    # not a valid TEXT_ARRAY value, so send an empty list for those rows.
    question_objs = [
        {
            "title": row["title"],
            "genres": row["genres"],
            "tag": row["tag"] if isinstance(row["tag"], list) else [],
        }
        for _, row in movies_tag.iterrows()
    ]

    # Insert through the collection handle returned by create(). Binding the
    # collection to `movies` would overwrite the `movies` DataFrame that the
    # data-preparation cell needs when it is re-run.
    insert_result = questions.data.insert_many(question_objs)

    # insert_many reports per-object failures on its return value; surface
    # them instead of silently ending up with a partially filled collection.
    if insert_result.has_errors:
        print(f"{len(insert_result.errors)} of {len(question_objs)} objects failed to import")
finally:
    client.close()  # Close client gracefully

In [450]:
try:
    client.connect()
    questions = client.collections.get("Movies")

    # Step 1: find the single movie whose title vector is closest to the query.
    response = questions.query.near_text(
        query="taxi driver",
        target_vector="title",
        limit=1
    )

    # Guard against an empty result and against a seed movie without tags;
    # otherwise indexing raises, or the literal text "None" is used as the query.
    seed_tags = response.objects[0].properties.get('tag') if response.objects else None

    if not seed_tags:
        print("No tagged movie matched the title query; nothing to recommend.")
    else:
        # Step 2: search the "tag" vector space with the seed movie's tags.
        response_tag = questions.query.near_text(
            query=str(seed_tags),
            target_vector="tag",
            limit=4
        )
        # Index 1 is printed rather than 0 -- presumably because the best hit
        # is the seed movie itself (confirm).
        if len(response_tag.objects) > 1:
            print(response_tag.objects[1].properties)
        else:
            print("No other movie with similar tags was found.")
finally:
    client.close()

{'tag': ['assassin', 'dark comedy', 'killer-as-protagonist', 'mafia', 'classic'], 'title': 'Grosse Pointe Blank (1997)', 'genres': 'Comedy|Crime|Romance'}
