In [20]:
import os
from langchain_astradb import AstraDBVectorStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from datasets import load_dataset
from dotenv import load_dotenv

load_dotenv()

ASTRA_DB_APPLICATION_TOKEN = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_API_ENDPOINT = os.getenv("ASTRA_DB_API_ENDPOINT")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [41]:
embedding = OpenAIEmbeddings()
vstore = AstraDBVectorStore(
    embedding=embedding,
    collection_name="movies",
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_DB_API_ENDPOINT
)

movies_dataset = load_dataset("csv", data_files="../project/data/rotten_tomatoes_movies.csv")["train"]
print("An example entry:")
print(movies_dataset[16])

Generating train split: 0 examples [00:00, ? examples/s]

An example entry:
{'rotten_tomatoes_link': 'm/10002673-prowler', 'movie_title': 'The Prowler (Cost of Living )', 'movie_info': 'After being frightened by a peeping Tom at her mansion in the suburbs the beautiful Susan Gilvray Evelyn Keyes calls the police for help. When a policeman Webb Garwood Van Heflin arrives he becomes infatuated with Susan and the two engage in an affair. Susan soon ends their relationship choosing to remain with her husband John Sherry Hall. However Webbs obsession with her continues to grow until he begins plotting to kill John and cash in on his life insurance policy', 'critics_consensus': None, 'content_rating': 'PG', 'genres': 'Drama, Mystery & Suspense', 'directors': 'Joseph Losey', 'authors': 'Robert Thoeren, Hugo Butler, Dalton Trumbo', 'actors': 'Van Heflin, Evelyn Keyes, Katherine Warren, John Maxwell, Emerson Treacy, Madge Blake, Wheaton Chambers, Robert Osterloh, Sherry Hall, Louise Lorimer, George Nader, Benny Burt, Louise M. Bates, Steve Carruthers,

In [42]:
# Process metadata and convert to LangChain documents
docs = []
for entry in movies_dataset:
    metadata = {"genres": entry["genres"]}
    if entry['tomatometer_status']:
        # Add metadata movie status to the metadata dictionary
        for rating in str(round(entry['tomatometer_rating'])).split(";"):
            metadata[rating] = round(entry['tomatometer_rating']) 
        for description in str(entry['movie_info']).split(";"):
            metadata[description] = entry['movie_info']
    # Add a LangChain document with the movie title and metadata tags
    # print(entry['movie_title'])
    doc = Document(page_content=str(entry['movie_title']), metadata=metadata)
    docs.append(doc)
 
# Compute embeddings for each document and store in the database   
inserted_ids = vstore.add_documents(docs)
print(f"\nInserted {len(inserted_ids)} documents.")

# Show movie titles that are similar to a specific quote
results = vstore.similarity_search("Spiderman is amazing", k=3)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

ValueError: API Exception while running bulk insertion: {'errors': [{'message': "Document key name constraints violated: property name ('Always trouble prone the life of teenager Percy Jackson Logan Lerman gets a lot more complicated when he learns hes the son of the Greek god Poseidon. At a training ground for the children of deities Percy learns to harness his divine powers and prepare for the adventure of a lifetime  he must prevent a feud among the Olympians from erupting into a devastating war on Earth and rescue his mother from the clutches of Hades god of the underworld') contains character(s) not allowed", 'errorCode': 'SHRED_DOC_KEY_NAME_VIOLATION'}]}