In [1]:
%pip install wikipedia

Note: you may need to restart the kernel to use updated packages.


In [None]:
import wikipedia
from dotenv import load_dotenv
import os
import json
import requests
from langchain_community.vectorstores import UpstashVectorStore
from langchain_core.documents import Document
import ollama
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
import traceback


In [3]:
load_dotenv()

UPSTASH_VECTOR_REST_URL = os.getenv("UPSTASH_VECTOR_REST_URL")
UPSTASH_VECTOR_REST_TOKEN = os.getenv("UPSTASH_VECTOR_REST_TOKEN")

print(f"URL configured: {UPSTASH_VECTOR_REST_URL is not None}")
print(f"Token configured: {UPSTASH_VECTOR_REST_TOKEN is not None}")
if UPSTASH_VECTOR_REST_URL:
    print(f"URL: {UPSTASH_VECTOR_REST_URL[:50]}...")

URL configured: True
Token configured: True
URL: https://refined-dog-19948-us1-vector.upstash.io...


In [5]:
# Test Upstash connection first
def test_upstash_connection():
    """Test if Upstash API is accessible and returns valid JSON"""
    try:
        headers = {
            'Authorization': f'Bearer {UPSTASH_VECTOR_REST_TOKEN}',
            'Content-Type': 'application/json'
        }
        
        # Test with info endpoint
        response = requests.get(f"{UPSTASH_VECTOR_REST_URL}/info", headers=headers, timeout=10)
        
        print(f"Status Code: {response.status_code}")
        print(f"Response Headers: {dict(response.headers)}")
        print(f"Raw Response: {response.text[:500]}...")
        
        if response.status_code == 200:
            try:
                data = response.json()
                print(f"✅ Upstash connection successful!")
                print(f"Index info: {data}")
                return True, data
            except json.JSONDecodeError as e:
                print(f"❌ JSONDecodeError: {e}")
                print(f"Response is not valid JSON: {response.text}")
                return False, None
        else:
            print(f"❌ HTTP Error {response.status_code}: {response.text}")
            return False, None
            
    except requests.exceptions.RequestException as e:
        print(f"❌ Request failed: {e}")
        return False, None
    except Exception as e:
        print(f"❌ Unexpected error: {e}")
        traceback.print_exc()
        return False, None

# Test the connection
connection_ok, index_info = test_upstash_connection()

Status Code: 200
Response Headers: {'Date': 'Fri, 05 Sep 2025 13:40:39 GMT', 'Content-Type': 'application/json', 'Content-Length': '411', 'Connection': 'keep-alive', 'Strict-Transport-Security': 'max-age=31536000; includeSubDomains'}
Raw Response: {
  "result" : {
    "vectorCount" : 0,
    "pendingVectorCount" : 0,
    "indexSize" : 0,
    "dimension" : 768,
    "similarityFunction" : "COSINE",
    "namespaces" : {
      "" : {
        "vectorCount" : 0,
        "pendingVectorCount" : 0
      }
    },
    "indexType" : "DENSE",
    "denseIndex" : {
      "dimension" : 768,
      "similarityFunction" : "COSINE",
      "embeddingModel" : ""
    }
  }
}...
✅ Upstash connection successful!
Index info: {'result': {'vectorCount': 0, 'pendingVectorCount': 0, 'indexSize': 0, 'dimension': 768, 'similarityFunction': 'COSINE', 'namespaces': {'': {'vectorCount': 0, 'pendingVectorCount': 0}}, 'indexType': 'DENSE', 'denseIndex': {'dimension': 768, 'similarityFunction': 'COSINE', 'embeddingModel': '

In [6]:
# Only proceed if connection is working
if not connection_ok:
    print("\n🚨 UPSTASH CONNECTION FAILED!")
    print("\nPossible solutions:")
    print("1. Check your UPSTASH_VECTOR_REST_URL and UPSTASH_VECTOR_REST_TOKEN in .env")
    print("2. Ensure your Upstash Vector index exists and is active")
    print("3. Create a new index at https://console.upstash.com with:")
    print("   - Dimension: 768 (for nomic-embed-text)")
    print("   - Embedding model enabled")
    print("4. Check if your index region is accessible")
    raise Exception("Cannot proceed without valid Upstash connection")
else:
    print("✅ Upstash connection verified, proceeding...")

✅ Upstash connection verified, proceeding...


In [7]:
# Initialize embeddings
try:
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    
    # Test embedding generation
    test_text = "This is a test embedding"
    test_embedding = embeddings.embed_query(test_text)
    
    print(f"✅ Embeddings working!")
    print(f"Embedding dimension: {len(test_embedding)}")
    print(f"Sample values: {test_embedding[:5]}")
    
    # Check if dimensions match index
    if index_info and 'dimension' in index_info:
        index_dim = index_info['dimension']
        if len(test_embedding) != index_dim:
            print(f"⚠️  DIMENSION MISMATCH!")
            print(f"Embedding dimension: {len(test_embedding)}")
            print(f"Index dimension: {index_dim}")
            print(f"You need to create a new index with dimension {len(test_embedding)}")
        else:
            print(f"✅ Dimensions match: {len(test_embedding)}")
    
except Exception as e:
    print(f"❌ Embedding error: {e}")
    traceback.print_exc()
    raise

✅ Embeddings working!
Embedding dimension: 768
Sample values: [0.03040738, 0.056143694, -0.1795196, -0.080891974, 0.04595056]


In [8]:
# Initialize Upstash Vector Store with error handling
try:
    store = UpstashVectorStore(
        embedding=embeddings,
        index_url=UPSTASH_VECTOR_REST_URL,
        index_token=UPSTASH_VECTOR_REST_TOKEN,
    )
    print("✅ UpstashVectorStore initialized successfully")
    
except Exception as e:
    print(f"❌ Failed to initialize UpstashVectorStore: {e}")
    traceback.print_exc()
    raise

✅ UpstashVectorStore initialized successfully


In [9]:
# Test with a single document first
try:
    print("Testing with a single document...")
    test_doc = Document(
        page_content="This is a test document to verify the Upstash integration works correctly.",
        metadata={"source": "test", "title": "Test Document"}
    )
    
    # Try to add the test document
    result = store.add_documents([test_doc])
    print(f"✅ Test document added successfully! Result: {result}")
    
    # Try to search
    search_results = store.similarity_search("test document", k=1)
    print(f"✅ Search successful! Found {len(search_results)} results")
    
except json.JSONDecodeError as e:
    print(f"❌ JSONDecodeError during test: {e}")
    print("This usually means:")
    print("1. Your index dimension doesn't match embedding dimension (768)")
    print("2. Your index wasn't created with embedding model support")
    print("3. API endpoint is returning HTML error page instead of JSON")
    print("\nSolution: Create a new Upstash Vector index with:")
    print("- Dimension: 768")
    print("- Embedding model enabled")
    raise
    
except Exception as e:
    print(f"❌ Test failed: {e}")
    traceback.print_exc()
    raise

Testing with a single document...
✅ Test document added successfully! Result: ['30d620e8-fc57-4318-9cec-c85be51baf36']
✅ Search successful! Found 0 results


In [10]:
# If test passed, proceed with Wikipedia documents
print("\n🚀 Test passed! Proceeding with Wikipedia documents...")

# Fetch Wikipedia pages
documents = []
cities = ["New York City, New York", "Boise, Idaho"]

for city in cities:
    try:
        print(f"Fetching: {city}")
        wikipedia_page_result = wikipedia.page(title=city)
        doc = Document(
            page_content=wikipedia_page_result.content,
            metadata={
                "source": f"{wikipedia_page_result.url}",
                "title": city,
            }
        )
        documents.append(doc)
        print(f"✅ Loaded: {city} ({len(wikipedia_page_result.content)} chars)")
    except Exception as e:
        print(f"❌ Error loading {city}: {e}")

print(f"\nTotal documents loaded: {len(documents)}")


🚀 Test passed! Proceeding with Wikipedia documents...
Fetching: New York City, New York
✅ Loaded: New York City, New York (93891 chars)
Fetching: Boise, Idaho
✅ Loaded: Boise, Idaho (51096 chars)

Total documents loaded: 2


In [11]:
# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

docs = text_splitter.split_documents(documents)
print(f"Split into {len(docs)} chunks")

# Show sample chunk
if docs:
    print(f"\nSample chunk:")
    print(f"Content: {docs[0].page_content[:200]}...")
    print(f"Metadata: {docs[0].metadata}")

Split into 244 chunks

Sample chunk:
Content: New York, often called New York City (NYC), is the most populous city in the United States. It is located at the southern tip of New York State on one of the world's largest natural harbors. The city ...
Metadata: {'source': 'https://en.wikipedia.org/wiki/New_York_City', 'title': 'New York City, New York'}


In [12]:
# Add documents in small batches with comprehensive error handling
batch_size = 5  # Small batches to avoid timeouts
total_inserted = 0
failed_batches = []

try:
    for i in range(0, len(docs), batch_size):
        batch = docs[i:i+batch_size]
        batch_num = i//batch_size + 1
        total_batches = (len(docs)-1)//batch_size + 1
        
        print(f"\nProcessing batch {batch_num}/{total_batches} ({len(batch)} documents)")
        
        try:
            # Add batch with timeout handling
            inserted_vectors = store.add_documents(batch)
            total_inserted += len(batch)
            print(f"✅ Batch {batch_num} inserted successfully. Total: {total_inserted}/{len(docs)}")
            
        except json.JSONDecodeError as e:
            print(f"❌ JSONDecodeError in batch {batch_num}: {e}")
            failed_batches.append(batch_num)
            print("This indicates a dimension mismatch or index configuration issue.")
            break  # Stop processing if we get JSON errors
            
        except Exception as e:
            print(f"❌ Error in batch {batch_num}: {e}")
            failed_batches.append(batch_num)
            # Continue with next batch
            continue
    
    if failed_batches:
        print(f"\n⚠️  Some batches failed: {failed_batches}")
        print(f"Successfully inserted: {total_inserted}/{len(docs)} documents")
    else:
        print(f"\n🎉 All {total_inserted} documents indexed in Upstash successfully!")
        
except Exception as e:
    print(f"❌ Critical error during batch processing: {e}")
    traceback.print_exc()


Processing batch 1/49 (5 documents)
✅ Batch 1 inserted successfully. Total: 5/244

Processing batch 2/49 (5 documents)
✅ Batch 2 inserted successfully. Total: 10/244

Processing batch 3/49 (5 documents)
✅ Batch 3 inserted successfully. Total: 15/244

Processing batch 4/49 (5 documents)
✅ Batch 4 inserted successfully. Total: 20/244

Processing batch 5/49 (5 documents)
✅ Batch 5 inserted successfully. Total: 25/244

Processing batch 6/49 (5 documents)
✅ Batch 6 inserted successfully. Total: 30/244

Processing batch 7/49 (5 documents)
✅ Batch 7 inserted successfully. Total: 35/244

Processing batch 8/49 (5 documents)
✅ Batch 8 inserted successfully. Total: 40/244

Processing batch 9/49 (5 documents)
✅ Batch 9 inserted successfully. Total: 45/244

Processing batch 10/49 (5 documents)
✅ Batch 10 inserted successfully. Total: 50/244

Processing batch 11/49 (5 documents)
✅ Batch 11 inserted successfully. Total: 55/244

Processing batch 12/49 (5 documents)
✅ Batch 12 inserted successfully. T

In [13]:
# Test similarity search if any documents were inserted
if total_inserted > 0:
    try:
        print("\n🔍 Testing similarity search...")
        query = "What is the population of New York?"
        results = store.similarity_search(query, k=3)
        
        print(f"Query: {query}")
        print(f"Found {len(results)} results:")
        
        for i, result in enumerate(results):
            print(f"\nResult {i+1}:")
            print(f"Source: {result.metadata.get('source', 'Unknown')}")
            print(f"Title: {result.metadata.get('title', 'Unknown')}")
            print(f"Content: {result.page_content[:200]}...")
            
    except Exception as e:
        print(f"❌ Search failed: {e}")
        traceback.print_exc()
else:
    print("\n❌ No documents were successfully inserted, skipping search test.")


🔍 Testing similarity search...
Query: What is the population of New York?
Found 3 results:

Result 1:
Source: https://en.wikipedia.org/wiki/New_York_City
Title: New York City, New York
Content: Based on data from the 2020 census, New York City comprised about 43.6% of the state's population of 20,202,320, and about 39% of the population of the New York metropolitan area. The majority of New ...

Result 2:
Source: https://en.wikipedia.org/wiki/New_York_City
Title: New York City, New York
Content: New York City is the most populous city in the United States, with 8,804,190 residents as of the 2020 census, its highest decennial count ever, incorporating more immigration into the city than outmig...

Result 3:
Source: https://en.wikipedia.org/wiki/New_York_City
Title: New York City, New York
Content: Between 2010 and 2020, New York City’s population grew by 629,000 residents, more than the total growth of the next four largest American cities (Los Angeles, Chicago, Houston, and Phoenix) c

In [15]:
retriever = store.as_retriever(search_type="similarity", search_kwargs={"k": 3})
retriever.invoke("what is the  named after  trees?")

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise, Idaho'}, page_content='=== Etymology ===\n\nThe origin of the name is uncertain. One account credits Capt. B. L. E. Bonneville of the U.S. Army as its source. After trekking for weeks through dry and rough terrain, his exploration party reached an overlook with a view of the Boise River Valley. The place where they stood is called Bonneville Point, located on the Oregon Trail east of the city. According to the story, a French-speaking guide, overwhelmed by the sight of the verdant river, yelled "Les bois! Les bois!" ("The woods! The woods!")—and the name stuck.\nThe name may also derive from earlier mountain men who named the river that flows through the city. In the 1820s, French Canadian fur trappers associated with the British-owned Hudson\'s Bay Company set trap lines in the vicinity. Set in a high-desert area, the tree-lined valley of the Boise River became a distinct landmark, an oasis do

In [16]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [20]:
llm = ChatOllama(model="llama3", temperature=0)

In [21]:

message = """
Answer this question using the provided context only.

{question}

Context:
{context}
"""

In [22]:
prompt = ChatPromptTemplate.from_messages([("human", message)])

In [23]:
parser = StrOutputParser()

chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm | parser


In [25]:
response = chain.invoke("What type of food is best known in New York City?")
print(response)

According to the provided context, bagels are best known in New York City.
