In [None]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os
import time

# Load variables from a .env file (if one is found) into the process
# environment before PINECONE_API_KEY is read via os.getenv below.
load_dotenv()

def initialize_pinecone():
    """Build a Pinecone client using the PINECONE_API_KEY environment variable."""
    api_key = os.getenv("PINECONE_API_KEY")
    client = Pinecone(api_key=api_key)
    return client


def get_or_create_index(pc, index_name="quickstart", dimension=1024):
    """Return a handle to ``index_name``, creating the index first if needed.

    Args:
        pc: Client object exposing ``list_indexes``, ``create_index``,
            ``describe_index`` and ``Index``.
        index_name: Name of the index to look up or create.
        dimension: Vector dimension used only when the index is created.

    Returns:
        The object returned by ``pc.Index(index_name)``.
    """
    known_names = {entry.name for entry in pc.list_indexes()}

    # Fast path: nothing to create, just hand back the handle.
    if index_name in known_names:
        print(f"Using existing index: {index_name}")
        return pc.Index(index_name)

    print(f"Creating new index: {index_name}")
    serverless = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=serverless,
    )

    # Poll once per second until the index reports itself as ready.
    while True:
        description = pc.describe_index(index_name)
        if description.status['ready']:
            break
        time.sleep(1)

    return pc.Index(index_name)


def store_embeddings(data, index, namespace="ns1", model="multilingual-e5-large", batch_size=100):
    """Embed the ``text`` of each record and upsert the vectors into ``index``.

    Records are upserted ``batch_size`` at a time. Within each batch the
    embedding requests are further split so that no single request exceeds
    the embedding API's per-request input limit.

    Args:
        data: Sliceable sequence of dicts, each with an ``id`` and a ``text`` key.
        index: Index handle exposing ``upsert`` and ``describe_index_stats``.
        namespace: Namespace the vectors are written to.
        model: Embedding model name passed to ``inference.embed``.
        batch_size: Number of records sent per ``upsert`` call; must be >= 1.

    Returns:
        The result of ``index.describe_index_stats()`` after all upserts.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Validate before touching the network; a negative size would otherwise
    # silently process nothing.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Per Pinecone's model documentation, hosted embedding models such as
    # multilingual-e5-large accept at most 96 inputs per request, so the
    # default batch_size of 100 must not be sent to embed() in one call.
    max_embed_inputs = 96

    pc = initialize_pinecone()

    # Process in batches
    for batch_number, start in enumerate(range(0, len(data), batch_size), start=1):
        batch = data[start:start + batch_size]

        # Generate embeddings for the batch, one API-sized chunk at a time
        vectors = []
        for offset in range(0, len(batch), max_embed_inputs):
            chunk = batch[offset:offset + max_embed_inputs]
            embeddings = pc.inference.embed(
                model=model,
                inputs=[d['text'] for d in chunk],
                parameters={"input_type": "passage", "truncate": "END"}
            )
            # Pair every record with its embedding; the original text is kept
            # as metadata so query results can show it.
            vectors.extend(
                {
                    "id": d['id'],
                    "values": e['values'],
                    "metadata": {'text': d['text']}
                }
                for d, e in zip(chunk, embeddings)
            )

        # Upload batch to Pinecone
        index.upsert(vectors=vectors, namespace=namespace)

        print(f"Processed batch {batch_number}")

    return index.describe_index_stats()


def query_embeddings(query_text, index, namespace="ns1", top_k=10, model="multilingual-e5-large"):
    """Embed ``query_text`` and return the closest matches from ``index``.

    Args:
        query_text: Text to search for.
        index: Index handle exposing ``query``.
        namespace: Namespace to search in.
        top_k: Number of matches requested.
        model: Embedding model name passed to ``inference.embed``.

    Returns:
        The result of ``index.query`` (metadata included, vector values omitted).
    """
    client = initialize_pinecone()

    # Embed the query text with the "query" input type.
    embed_params = {"input_type": "query"}
    embedded = client.inference.embed(
        model=model,
        inputs=[query_text],
        parameters=embed_params,
    )
    query_vector = embedded[0].values

    # Search the index with the single query vector.
    return index.query(
        namespace=namespace,
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
    )


In [8]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os
import time

# Load variables from a .env file (if one is found) so PINECONE_API_KEY is
# available through os.getenv.
load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))


index_name = "quickstart"

# Create the index only when it is missing, so re-running this cell does not
# fail on an index that already exists.
if index_name not in [existing.name for existing in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=1024, # Replace with your model dimensions
        metric="cosine", # Replace with your model metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

# Sample records: each has a unique id and the text to embed.
data = [
    {"id": "vec1", "text": "Apple is a popular fruit known for its sweetness and crisp texture."},
    {"id": "vec2", "text": "The tech company Apple is known for its innovative products like the iPhone."},
    {"id": "vec3", "text": "Many people enjoy eating apples as a healthy snack."},
    {"id": "vec4", "text": "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces."},
    {"id": "vec5", "text": "An apple a day keeps the doctor away, as the saying goes."},
    {"id": "vec6", "text": "Apple Computer Company was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership."}
]

# Embed every record's text as a "passage" (the stored-document side of search).
embeddings = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[d['text'] for d in data],
    parameters={"input_type": "passage", "truncate": "END"}
)
print(embeddings[0])


# Wait for the index to be ready
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

index = pc.Index(index_name)

# Pair each record with its embedding; the original text is kept as metadata
# so it comes back with query results.
vectors = []
for d, e in zip(data, embeddings):
    vectors.append({
        "id": d['id'],
        "values": e['values'],
        "metadata": {'text': d['text']}
    })

index.upsert(
    vectors=vectors,
    namespace="ns1"
)

print(index.describe_index_stats())


query = "Tell me about the tech company known as Apple."

# Embed the query with the "query" input type, using the same model as the
# stored passages.
embedding = pc.inference.embed(
    model="multilingual-e5-large",
    inputs=[query],
    parameters={
        "input_type": "query"
    }
)


# Fetch the three closest matches, returning metadata but not raw vectors.
results = index.query(
    namespace="ns1",
    vector=embedding[0].values,
    top_k=3,
    include_values=False,
    include_metadata=True
)

print(results)