In [1]:
import os
import time
from uuid import uuid4
from dotenv import load_dotenv

In [2]:
# 1. Setup
# Call load_dotenv() first so the key can come from a .env file, then stop
# immediately if PINECONE_API_KEY is still unset or empty.
load_dotenv()
if not os.environ.get("PINECONE_API_KEY"):
    raise ValueError("‚ùå PINECONE_API_KEY missing in .env")

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# --- STEP 1: LOAD & SPLIT ---

# Load the speech text file as a list of documents.
loader = TextLoader("./files/speech.txt")
docs = loader.load()

# Splitter configured with chunk_size=500 and chunk_overlap=50.
split = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)

# Split the loaded documents and report how many chunks were produced.
chunks = split.split_documents(docs)
chuncks = chunks  # original (misspelled) name kept as an alias for any cell still using it
print(len(chunks))

10


In [9]:
# --- STEP 2: CLOUD INIT (Pinecone) ---
# Index name used by the cells that follow.
index_name = "rag-assignment-index"
# Pinecone client built with the API key taken from the environment.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

In [None]:
# Create the index only when it is not already listed for this account.
if index_name not in pc.list_indexes().names():
    print(f"   -> Creating new index '{index_name}'...")
    pc.create_index(
        name=index_name,
        dimension=384, # Dimensions for 'all-MiniLM-L6-v2'
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    # Poll the reported index status rather than sleeping a fixed 10 seconds:
    # a fixed pause neither guarantees readiness nor avoids waiting too long.
    # The deadline keeps the cell from hanging forever if the index never
    # becomes ready.
    ready_deadline = time.monotonic() + 120
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() > ready_deadline:
            raise TimeoutError(f"Index '{index_name}' was not ready within 120 seconds")
        time.sleep(1)

In [10]:
# --- STEP 3: EMBED & STORE ---
# Embedding model name; the index-creation cell uses dimension=384 for it.
embedding_model_name = "all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [16]:
import os
import time
from uuid import uuid4
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings # <--- Import this
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 1. SETUP & LOAD
# Call load_dotenv() and stop early when the Pinecone key is unset or empty.
load_dotenv()
if not os.getenv("PINECONE_API_KEY"):
    raise ValueError("‚ùå PINECONE_API_KEY missing")

print("üöÄ Loading & Splitting...")
loader = TextLoader("./files/speech.txt")
docs = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

# 2. DEFINE EMBEDDINGS (The Missing Step)
# We must define this BEFORE using it in PineconeVectorStore
print("üß† Initializing Embeddings...")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 3. INFRASTRUCTURE CHECK (The Fix for 'NotFound')
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index_name = "rag-assignment-index"

# Check if index exists. If not, create it.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    print(f"üèóÔ∏è Index '{index_name}' not found. Creating it now...")
    pc.create_index(
        name=index_name,
        dimension=384, # Must match 'all-MiniLM-L6-v2' (384 dims)
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print("‚è≥ Waiting for cloud resources to spin up...")
    # Poll the reported index status instead of sleeping a fixed 15 seconds;
    # the deadline prevents an endless wait if the index never becomes ready.
    ready_deadline = time.monotonic() + 120
    while not pc.describe_index(index_name).status["ready"]:
        if time.monotonic() > ready_deadline:
            raise TimeoutError(f"Index '{index_name}' was not ready within 120 seconds")
        time.sleep(1)
else:
    print(f"‚úÖ Index '{index_name}' ready.")

# 4. CONNECT & UPLOAD
print("üîå Connecting to Index...")
vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)

print(f"üì§ Uploading {len(chunks)} documents...")
# Imported here so this cell remains runnable on its own.
from uuid import NAMESPACE_URL, uuid5

# Deterministic IDs built from each chunk's source and position. Random uuid4
# IDs would add a fresh copy of every chunk each time this cell is re-run;
# with stable IDs a re-run writes to the same IDs instead.
uuids = [
    str(uuid5(NAMESPACE_URL, f"{chunk.metadata.get('source', '')}#chunk-{position}"))
    for position, chunk in enumerate(chunks)
]
vector_store.add_documents(documents=chunks, ids=uuids)

print("‚úÖ Success! Data uploaded.")

üöÄ Loading & Splitting...
üß† Initializing Embeddings...
‚úÖ Index 'rag-assignment-index' ready.
üîå Connecting to Index...
üì§ Uploading 10 documents...
‚úÖ Success! Data uploaded.


In [18]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore

# 1. SETUP
load_dotenv()
api_key = os.getenv("PINECONE_API_KEY")
# Fail fast on a missing key, matching the check done by the other setup cells.
if not api_key:
    raise ValueError("‚ùå PINECONE_API_KEY missing")
index_name = "rag-assignment-index"
# Dimension the index is expected to report; 384 is the value this notebook
# uses for 'all-MiniLM-L6-v2' when creating the index.
expected_dimension = 384

# 2. VALIDATION PHASE 1: The Infrastructure Audit (Stats)
print("üîç Checking Pinecone Index Stats...")
pc = Pinecone(api_key=api_key)
index = pc.Index(index_name)
stats = index.describe_index_stats()

print(f"   ‚Ä¢ Dimensions: {stats['dimension']}")
print(f"   ‚Ä¢ Total Vectors: {stats['total_vector_count']}")
# Falls back to the label 'Serverless' when the stats carry no 'index_type'.
print(f"   ‚Ä¢ Pod Type: {stats.get('index_type', 'Serverless')}")

# FAIL CONDITION 1: Dimensions wrong
if stats['dimension'] != expected_dimension:
    print("‚ùå CRITICAL ERROR: Index dimension is wrong! You cannot use this with MiniLM.")
else:
    print("‚úÖ Dimensions match Model (384).")

# FAIL CONDITION 2: Empty Index
if stats['total_vector_count'] == 0:
    print("‚ö†Ô∏è WARNING: Index is empty. Did the upload finish?")
else:
    print("‚úÖ Data is present.")

# 3. VALIDATION PHASE 2: The Logic Test (Semantic Search)
print("\nüß† Testing Semantic Retrieval...")
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vector_store = PineconeVectorStore(index_name=index_name, embedding=embeddings)

# Ask for the single closest match (k=1) and show a 100-character preview.
query = "What is the motive of the speech?"
results = vector_store.similarity_search(query, k=1)

if len(results) > 0:
    print(f"‚úÖ Retrieval Success! Found: {len(results)} matches.")
    print(f"   ‚Ä¢ Top Result: \"{results[0].page_content[:100]}...\"")
    print(f"   ‚Ä¢ Source Metadata: {results[0].metadata}")
else:
    print("‚ùå Retrieval Failed: No results returned.")

üîç Checking Pinecone Index Stats...
   ‚Ä¢ Dimensions: 384
   ‚Ä¢ Total Vectors: 10
   ‚Ä¢ Pod Type: Serverless
‚úÖ Dimensions match Model (384).
‚úÖ Data is present.

üß† Testing Semantic Retrieval...
‚úÖ Retrieval Success! Found: 1 matches.
   ‚Ä¢ Top Result: "Just because we fight without rancor and without selfish object, seeking nothing for ourselves but w..."
   ‚Ä¢ Source Metadata: {'source': './files/speech.txt'}
