<a href="https://colab.research.google.com/github/muniappabalaji/TimeSeries/blob/main/LLM/RAG/PineCone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q pinecone pinecone-client sentence-transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/587.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/587.6 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.6/587.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.3/259.3 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from google.colab import userdata
import time

In [None]:
# Read the Pinecone API key from Colab's secret store so it never appears in
# the notebook source or its saved outputs.
try:
    PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
except userdata.SecretNotFoundError as err:
    print("ERROR: PINECONE_API_KEY not found in Colab secrets.")
    print("Please follow Step 1 in the instructions to add your key.")
    # Raise instead of calling exit(): an exception stops this cell (and a
    # "Run all") with a traceback while leaving the kernel alive, so later
    # cells do not run against a missing PINECONE_API_KEY.
    raise RuntimeError(
        "PINECONE_API_KEY is missing from Colab secrets; cannot continue."
    ) from err

In [None]:
# Name of the Pinecone index that the following cells create, query and delete.
index_name = "colab-quickstart"

# Authenticated client; all index management calls below go through `pc`.
pc = Pinecone(
    api_key=PINECONE_API_KEY,
)
print(f"--- Pinecone initialized for index: '{index_name}' ---")

In [None]:
# Upper bound (seconds) on how long to wait for a freshly created index to
# report ready, so a stuck provisioning request cannot hang the notebook.
INDEX_READY_TIMEOUT_S = 120

model = SentenceTransformer('all-MiniLM-L6-v2')
# This model creates 384-dimensional vectors; the value is read from the model
# so the index dimension always matches the embeddings it will store.
embedding_dim = model.get_sentence_embedding_dimension()

# Check if the index already exists. If not, create it.
if index_name not in pc.list_indexes().names():
    print(f"Index '{index_name}' not found. Creating a new one...")
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric="cosine", # Cosine similarity is great for semantic search
        spec=ServerlessSpec(cloud='aws', region='us-east-1') # Use ServerlessSpec directly
    )
    # Wait for the index to be ready, polling once per second, but give up
    # with a clear error after INDEX_READY_TIMEOUT_S instead of looping forever.
    ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() >= ready_deadline:
            raise TimeoutError(
                f"Index '{index_name}' was not ready after "
                f"{INDEX_READY_TIMEOUT_S} seconds."
            )
        time.sleep(1)
    print(f"--- Index '{index_name}' created successfully with dimension {embedding_dim} ---")
else:
    print(f"--- Index '{index_name}' already exists. Connecting to it. ---")

# Connect to your index; `index` is the handle used for upserts and queries.
index = pc.Index(index_name)
print("\nIndex Stats:")
print(index.describe_index_stats())

In [None]:
# Demo corpus: five standalone sentences, each on a different topic.
documents = [
    "The capital of France is Paris, a city known for its art and culture.",
    "Photosynthesis is the process by which plants use sunlight to create food.",
    "The Python programming language is widely used for web development and data science.",
    "Mount Everest is the Earth's highest mountain above sea level.",
    "A black hole is a region of spacetime where gravity is so strong that nothing can escape.",
]

# Encode the whole corpus in one call; `embeddings` holds one vector per
# entry of `documents`, in the same order.
print("\n--- Creating vector embeddings for our documents ---")
embeddings = model.encode(documents)

In [None]:
# Upper bound (seconds) on how long to wait for the upserted vectors to be
# reflected in the index statistics.
UPSERT_VISIBLE_TIMEOUT_S = 30

# One (id, values, metadata) tuple per document. The original text is stored
# as metadata so query results can show it without a separate lookup.
vectors_to_upsert = [
    (f"doc_{i}", emb.tolist(), {"text": doc})
    for i, (doc, emb) in enumerate(zip(documents, embeddings))
]

print("\n--- Upserting vectors into the Pinecone index ---")
index.upsert(vectors=vectors_to_upsert)

# Pinecone writes are eventually consistent: stats (and queries) issued right
# after an upsert may not include the new vectors yet. Poll the stats for a
# bounded time so the numbers printed below, and the search that follows,
# see the data. This is best-effort: on timeout the latest stats are printed.
visible_deadline = time.monotonic() + UPSERT_VISIBLE_TIMEOUT_S
stats = index.describe_index_stats()
while (
    stats.total_vector_count < len(vectors_to_upsert)
    and time.monotonic() < visible_deadline
):
    time.sleep(1)
    stats = index.describe_index_stats()

print("\nUpsert complete. New Index Stats:")
print(stats)

In [None]:
print("PERFORMING SEMANTIC SEARCH")

query = "What is the name of the tallest mountain?"

# Embed the question with the same model that encoded the documents, then
# convert it to a plain list for the Pinecone request.
query_embedding = model.encode(query).tolist()

# Ask for the two nearest vectors and include their metadata so the stored
# document text comes back with each match.
results = index.query(vector=query_embedding, top_k=2, include_metadata=True)

# Report each match as a score line followed by the matched text.
print(f"Query: '{query}'\n")
print("Top Results:")
for match in results['matches']:
    score, text = match['score'], match['metadata']['text']
    print(f"  - Score: {score:.4f}\n    Text: {text}\n")

In [None]:
# Remove the demo index so it does not linger in the Pinecone project.
print("\n--- Cleaning up. Deleting the index... ---")
# Only delete when the index is actually there, so re-running this cell
# (or running it after a failed setup) does not raise.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    print(f"Index '{index_name}' has been deleted.")
else:
    print(f"Index '{index_name}' does not exist; nothing to delete.")