In [13]:
from astrapy.db import AstraDB
from langchain_astradb import AstraDBVectorStore
import langchain
from langchain_openai import OpenAIEmbeddings
import json
import os
from dotenv import load_dotenv

load_dotenv()

token = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
dbEndpoint = os.getenv("ASTRA_DB_API_ENDPOINT")
openaiToken = os.getenv("OPENAI_API_KEY")
csvPath = ("../project/data/rotten_tomatoes_movies.csv")
collectionName = "vector_movies"

# Configurar el motor de OpenAI
engine = "gpt-4"
embeddings = OpenAIEmbeddings(api_key=openaiToken, model="text-embedding-3-large")

def get_embeddings(text):
    query_results = embeddings.embed_query(text)
    print(query_results)
    return query_results

In [16]:
# Initialization
db = AstraDB(
  token=token,
  api_endpoint=dbEndpoint)
# Create a collection
collection = db.create_collection(collectionName, dimension=3072, metric="cosine")

# Initialize Langchain vector store
vstore = AstraDBVectorStore (
    embedding=embeddings,
    collection_name=collectionName,
    astra_db_client=db
)
print (f"Connected to AstraDB: {db.get_collections()}")

# Procesar cada línea y obtener los embeddings
# Leer el documento de texto
documents = []
with open(csvPath, 'r', encoding="utf8") as file:
    lines = file.readlines()
    
if os.path.exists('documents.json'):
    with open('documents.json', 'r') as file:
        documents = json.load(file)
    for document in documents:
        res = collection.upsert_one(document)
else:
    for index, line in enumerate(lines):
        # Extraer el texto de cada línea
        # Obtener el embedding para el texto
        vector = get_embeddings(line)
        # Crear el documento
        document = {
            "_id": str(index + 1),
            "text": line,
            "vector": vector
        }
        # Insertar el documento en la base de datos
        # Reemplaza 'collection' con tu objeto de colección de la base de datos 
        # res = vstore.add_documents(document)  # Aqui usamos upsert si existe se actualiza si no, se crea # This errors out with 'str' object has no attribute 'page_content'
        res = collection.upsert(document)
        documents.append(document)
        
        # Guardar los documentos en un archivo JSON
        with open ('documents.json', 'w') as file:
            json.dump(documents, file)
        print (documents)

Connected to AstraDB: {'status': {'collections': ['vector_movies']}}
[-0.01827823634906148, 0.017645415671140023, -0.030493162010102614, 0.019264261128468053, -0.026401897155604943, -0.053775104193310025, -0.03749834906031978, 0.009234777114216542, -0.02350269171983711, 0.024194381300210444, 0.05351019878888966, -0.055070179069056265, -0.02235478444360749, 0.00030422337137173935, -0.03567346710102325, -0.011184750601779576, -0.011015507862335464, 0.009808731683653968, 0.03514366374276343, -0.023355524120320492, 0.0519502222340135, 0.020426887027294564, 0.006136895884762582, 0.055217346668572885, 0.016070720493666982, 0.01677712497134675, -0.02032386970763293, 0.008719690515908379, 0.03817532001809622, -0.008454787905455853, 0.03169993818878411, -0.026019261396861736, 0.019617463367307936, -0.01745409779176842, -0.009977974423098078, 0.04724085532419127, -0.004698331202503719, 0.03373085106211344, 0.02145706022391089, 0.0015093894387722161, 0.013973579406587331, -0.014444516656363123, -

  res = collection.upsert(document)


ValueError: {'errors': [{'message': "Document size limitation violated: number of elements an indexable Array (property 'vector') has (3072) exceeds maximum allowed (1000)", 'errorCode': 'SHRED_DOC_LIMIT_VIOLATION'}]}