In [489]:
import weaviate
import weaviate.classes as wvc
import os
import pandas as pd
import requests
import json
from dotenv import load_dotenv
from sklearn.decomposition import PCA

In [499]:
# Load variables from a local .env file into the process environment so the
# os.getenv / os.environ lookups further down can read the service credentials.
load_dotenv()

True

In [458]:
# Read the raw movie and tag tables; both paths are relative to the
# directory the notebook kernel was started in.
movies = pd.read_csv('movie.csv')
tags = pd.read_csv('tag.csv')

In [459]:
# Maximum number of distinct tags kept per movie.
max_tags = 5
def truncate_and_remove_duplicates(tag_list):
    """Return at most ``max_tags`` distinct tags, keeping first-seen order.

    Duplicates are removed *before* the list is capped. Capping first would
    let repeated tags among the leading entries use up the quota, leaving a
    movie with fewer than ``max_tags`` tags even though more distinct ones
    were available.

    Parameters
    ----------
    tag_list : list
        Tags of a single movie, possibly containing repeats.

    Returns
    -------
    list
        The first ``max_tags`` unique tags in their original order.
    """
    # dict.fromkeys keeps insertion order, so this is an order-preserving dedupe.
    unique_tags = list(dict.fromkeys(tag_list))
    return unique_tags[:max_tags]

In [460]:
# -- dropping useless columns and rows with null values -- #
# errors='ignore' keeps this cell re-runnable: `tags` is rebound here, so on a
# second run the two columns are already gone and a plain drop() would raise
# KeyError.
tags = tags.drop(columns=['timestamp', 'userId'], errors='ignore').dropna()

# -- grouping tags of the same movie into one de-duplicated, capped list -- #
tags_grouped = (
    tags.groupby('movieId')['tag']
    .apply(list)
    .apply(truncate_and_remove_duplicates)
    .reset_index()
)

# -- creating a dataframe that contains movie title, genre and tags -- #
# Left join: movies without any tag are kept and get NaN in the `tag` column.
movies_tag = (
    movies.merge(tags_grouped, on='movieId', how='left')
    .drop(columns=['movieId'])
)

In [461]:
# -- the free plan has limited capacity, so only the first 3000 movies are kept -- #
movies_tag = movies_tag.head(3000)

In [500]:
# -- connection to Weaviate client -- #
# The connection is opened *before* the try block: if connecting (or reading a
# credential) fails there is no client to close, and the real error is shown
# instead of a NameError raised by `client.close()` in the finally clause.
# All three credentials are read with os.environ[...] so a missing variable
# fails immediately with a KeyError naming it.
client = weaviate.connect_to_wcs(
    cluster_url=os.environ["WCD_URL"],
    auth_credentials=weaviate.auth.AuthApiKey(os.environ["WCD_API_KEY"]),
    headers={
        "X-Cohere-Api-Key": os.environ["COHERE_APIKEY"]
    }
)
try:
    # -- create Movies collection -- #
    movies_collection = client.collections.create(
        name="Movies_new",
        properties=[
            wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="genres", data_type=wvc.config.DataType.TEXT),
            wvc.config.Property(name="tag", data_type=wvc.config.DataType.TEXT_ARRAY),
        ],
        # -- using 2 named vectors with binary quantization for title and tag -- #
        vectorizer_config=[
            wvc.config.Configure.NamedVectors.text2vec_cohere(
                name="title",
                source_properties=["title"],
                vector_index_config=wvc.config.Configure.VectorIndex.flat(
                    quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq()
                ),
            ),
            wvc.config.Configure.NamedVectors.text2vec_cohere(
                name="tag",
                source_properties=["tag"],
                vector_index_config=wvc.config.Configure.VectorIndex.flat(
                    quantizer=wvc.config.Configure.VectorIndex.Quantizer.bq()
                ),
            ),
        ],
        generative_config=wvc.config.Configure.Generative.cohere()
    )

    # -- import data into Weaviate -- #
    movies_objs = []
    for _, row in movies_tag.iterrows():
        movie_obj = {
            "title": row["title"],
            "genres": row["genres"],
        }
        # Movies without tags carry NaN (a float) in `tag` after the left
        # merge; that is not a valid TEXT_ARRAY value, so the property is
        # simply left unset for them.
        if isinstance(row["tag"], list):
            movie_obj["tag"] = row["tag"]
        movies_objs.append(movie_obj)

    # Insert through the handle returned by create(); the `movies` DataFrame is
    # deliberately not rebound to a collection so earlier cells stay re-runnable.
    insert_result = movies_collection.data.insert_many(movies_objs)

    # insert_many reports per-object failures in its return value instead of
    # raising, so surface them rather than discarding the result.
    if insert_result.has_errors:
        first_error = next(iter(insert_result.errors.values()))
        print(
            f"{len(insert_result.errors)} of {len(movies_objs)} objects failed to import; "
            f"first error: {first_error}"
        )
finally:
    client.close()