In [1]:
import uuid
from pinecone import Pinecone, ServerlessSpec
import cohere
import openai
import PyPDF2
import os
from dotenv import load_dotenv
load_dotenv()

# Pinecone: read the key from the environment (None if the variable is unset)
# and build the client used for index operations.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Cohere: same lookup, then the client used for embedding calls.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
co = cohere.Client(api_key=COHERE_API_KEY)
# Name of the Cohere model passed to co.embed(...) in later cells.
embedding_model_name = "embed-v4.0"

# OpenAI: no client object is constructed in this cell; only the key is
# looked up here so that it is available to later cells.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [2]:
def extract_text_from_pdf(pdf_path):
    """Extract the text content of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines, with leading/trailing
        whitespace stripped, or None if the file could not be opened or
        parsed (the error is printed, not raised).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Guard against a page whose extract_text() yields None: treat it
            # as empty so that one such page does not raise a TypeError and
            # discard the text of the whole document.
            page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    except Exception as e:
        # Best-effort by design: report the problem and signal failure with None.
        print(f"Error reading PDF: {e}")
        return None
    # Joining with "\n" and stripping gives the same result as appending
    # "\n" after every page and stripping.
    return "\n".join(page_texts).strip()

In [3]:
# Build the path from its components: the previous literal relied on the
# invalid escape sequences "\k" and "\L" and on a Windows-only separator.
doc_text = extract_text_from_pdf(pdf_path=os.path.join("..", "kb", "Lecture_0.pdf"))

In [5]:
# extract_text_from_pdf returns None when the PDF could not be read; stop here
# with a clear message instead of sending an invalid payload to the embed API.
if doc_text is None:
    raise ValueError("doc_text is None: PDF text extraction failed, nothing to embed")

# Embed the whole document as a single "search_document" input and request
# float vectors (read back below via response.embeddings.float_).
response = co.embed(
    texts=[doc_text],
    model=embedding_model_name,
    input_type="search_document",
    embedding_types=["float"]
)

In [6]:
# Length of the first float embedding in the response, i.e. the vector
# dimensionality; displayed as the cell's output.
len(response.embeddings.float_[0]) # 1536 in the recorded run

1536

In [None]:
import cohere
import pinecone
import uuid

# 1. Clients
# Reuse the `pc` (Pinecone) and `co` (Cohere) clients built in the setup cell
# from environment variables. The module-level `pinecone.init(...)` API is not
# available in the SDK generation that exposes the `Pinecone` class imported
# at the top of the notebook, and no API key is hard-coded here.

# 2. Ensure index exists (or create it)
INDEX_NAME = "example-index"
EMBEDDING_DIM = 1024  # must match the embedding size of the model used in step 4
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIM,
        metric="cosine",
        # Serverless indexes need a cloud and region; both can be overridden
        # through the environment without editing the notebook.
        spec=ServerlessSpec(
            cloud=os.getenv("PINECONE_CLOUD", "aws"),
            region=os.getenv("PINECONE_REGION", "us-east-1"),
        ),
    )
index = pc.Index(INDEX_NAME)

# 3. Your text to embed
texts = [
    "First document text",
    "Second doc text",
    # ...
]

# 4. Generate embeddings with Cohere
response = co.embed(
    texts=texts,
    model="embed-english-v3.0",
    input_type="search_document",
    truncate="END"
)
embeddings = response.embeddings  # one vector per input text

# Fail early with a readable message if the model's output size does not match
# the dimension the index is created with above.
for vec in embeddings:
    if len(vec) != EMBEDDING_DIM:
        raise ValueError(
            f"Embedding has {len(vec)} dimensions, expected {EMBEDDING_DIM}"
        )

# 5. Prepare records for upsert
vectors = []
for text, vec in zip(texts, embeddings):
    # Deterministic IDs derived from the text, so re-running the cell updates
    # existing records instead of creating duplicates.
    vid = str(uuid.uuid5(uuid.NAMESPACE_URL, text))
    vectors.append((vid, vec, {"text": text}))

# 6. Upsert in batches so a large `vectors` list is not sent as one request
UPSERT_BATCH_SIZE = 100
for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
    index.upsert(
        vectors=vectors[start:start + UPSERT_BATCH_SIZE],
        namespace="my-namespace",
    )

print(f"Upserted {len(vectors)} vectors into {INDEX_NAME}")
