In [1]:
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentCleaner
from haystack import Pipeline


from haystack_integrations.document_stores.pinecone import PineconeDocumentStore
from haystack_integrations.components.embedders.cohere.document_embedder import CohereDocumentEmbedder

# Serverless deployment target for the Pinecone index (AWS, us-east-1).
pinecone_spec = {"serverless": {"region": "us-east-1", "cloud": "aws"}}

# Vector store backed by the "prototype" Pinecone index, compared with cosine
# similarity.
# NOTE(review): dimension=384 presumably has to match the output size of the
# embedding model used when writing documents -- confirm before changing either.
document_store = PineconeDocumentStore(
    index="prototype",
    metric="cosine",
    dimension=384,
    spec=pinecone_spec,
)


  from tqdm.autonotebook import tqdm


In [2]:
# Components are created in the order documents flow through them:
# clean -> embed -> write.

# Text clean-up step, using the component's default settings.
document_cleaner = DocumentCleaner()

# Cohere embedder configured for indexing ("search_document" input type).
document_embedder = CohereDocumentEmbedder(
    model="embed-multilingual-light-v3.0",
    input_type="search_document",
)

# Writer that persists the embedded documents into the Pinecone-backed store.
document_writer = DocumentWriter(document_store)


In [4]:
# Indexing pipeline: document_cleaner -> document_embedder -> document_writer.
preprocessing_pipeline = Pipeline()

# Register each component under the name used to address it in connect()/run().
preprocessing_pipeline.add_component("document_cleaner", document_cleaner)
preprocessing_pipeline.add_component("document_embedder", document_embedder)
preprocessing_pipeline.add_component("document_writer", document_writer)

# Wire the stages together by component name.
preprocessing_pipeline.connect("document_cleaner", "document_embedder")
preprocessing_pipeline.connect("document_embedder", "document_writer")

In [13]:
import ast
from pathlib import Path
from haystack import Document


# Path of the text file holding the summaries to index.
output = "../data/summaries.txt"

# The file is read as a Python literal and then iterated, one Document per item.
# ast.literal_eval is used instead of eval so that the file's contents are only
# ever parsed as data (strings, lists, ...) and can never execute code.
# NOTE(review): assumes the file contains a list literal of strings -- confirm
# against whatever writes summaries.txt.
with open(output, "r", encoding="utf-8") as f:
    sum_list = ast.literal_eval(f.read())

# Wrap each summary in a Haystack Document so the pipeline can process it.
documents = [Document(content=summary) for summary in sum_list]

# Feed the documents into the first stage; the run result is the cell's output.
preprocessing_pipeline.run({"document_cleaner": {"documents": documents}})


Calculating embeddings: 100%|██████████| 6/6 [00:02<00:00,  2.87it/s]
Upserted vectors: 100%|██████████| 178/178 [00:02<00:00, 68.55it/s]


 'document_writer': {'documents_written': 178}}