In [57]:
import getpass
import os
import time
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from llama_index.core import StorageContext, VectorStoreIndex, Settings
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.gemini import Gemini
from llama_index.readers.web import BeautifulSoupWebReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

In [58]:
# Set up Gemini API key.
# Prompt only when the key is missing (or empty) in the environment, so that
# re-running this cell does not ask again and a run with the variable already
# exported needs no interactive input.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Gemini API Key: ")

In [77]:
# Only links that start with this prefix are collected and followed by the crawler.
ALLOWED_PREFIX = "https://www.buffalo.edu/international-student-services"
# Page the crawl starts from: the prefix with the ".html" suffix appended.
ROOT_URL = ALLOWED_PREFIX + ".html"

In [78]:
# Step 1: Crawl subpath and print discovered URLs
def get_subpath_links(start_url, allowed_prefix, delay=0.5):
    """Crawl pages reachable from ``start_url`` and collect the in-scope links.

    Each fetched page is parsed for ``<a href=...>`` tags. Every link is
    resolved against the URL of the page it was found on, stripped of its
    fragment and query string, and kept when it starts with
    ``allowed_prefix``. Kept links are queued and crawled in turn. Progress
    and failures are printed as the crawl proceeds.

    Args:
        start_url: URL of the first page to fetch.
        allowed_prefix: Only links starting with this string are collected
            and followed.
        delay: Seconds to pause before every request after the first one.
            ``None`` or a value <= 0 disables the pause.

    Returns:
        list: The unique discovered URLs, in no particular order (they come
        from a set). Links are recorded when discovered, so a URL whose own
        download later fails is still returned. A link identical to
        ``start_url`` is never recorded, because that URL is already marked
        visited by the time links are examined.
    """
    visited = set()
    to_visit = [start_url]
    all_links = set()
    is_first_request = True

    print(f"\n🌐 Starting crawl from: {start_url}")
    print(f"🔒 Only accepting links starting with: {allowed_prefix}\n")

    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        visited.add(url)

        # Honour `delay` between consecutive requests; the first request goes
        # out immediately.
        if is_first_request:
            is_first_request = False
        elif delay and delay > 0:
            time.sleep(delay)

        try:
            print(f"🔍 Crawling: {url}")
            response = requests.get(url, timeout=10)
            # Treat 4xx/5xx responses as failures instead of parsing the
            # error page for links.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            for link_tag in soup.find_all('a', href=True):
                href = link_tag['href']
                full_url = urljoin(url, href)

                # Clean up URL (remove anchors/fragments and query strings)
                full_url = full_url.split("#")[0].split("?")[0]

                if full_url.startswith(allowed_prefix) and full_url not in visited and full_url not in all_links:
                    print(f"   ✅ Found: {full_url}")
                    all_links.add(full_url)
                    to_visit.append(full_url)
        except Exception as e:
            # Deliberately best-effort: one bad page is reported and skipped
            # so the rest of the crawl can continue.
            print(f"   ❌ Failed to crawl {url}: {e}")

    print(f"\n✅ Finished crawling. Total unique pages collected: {len(all_links)}\n")
    return list(all_links)

In [79]:
# Collect the URLs found under the allowed prefix, starting from the root page
urls = get_subpath_links(start_url=ROOT_URL, allowed_prefix=ALLOWED_PREFIX)


🌐 Starting crawl from: https://www.buffalo.edu/international-student-services.html
🔒 Only accepting links starting with: https://www.buffalo.edu/international-student-services

🔍 Crawling: https://www.buffalo.edu/international-student-services.html
   ✅ Found: https://www.buffalo.edu/international-student-services/about-us/contact-us.html
   ✅ Found: https://www.buffalo.edu/international-student-services/for-new-students.html
   ✅ Found: https://www.buffalo.edu/international-student-services/immigration-visa.html
   ✅ Found: https://www.buffalo.edu/international-student-services/social-security-card---income-tax-filing.html
   ✅ Found: https://www.buffalo.edu/international-student-services/life-in-buffalo.html
   ✅ Found: https://www.buffalo.edu/international-student-services/workshops-trips-events.html
   ✅ Found: https://www.buffalo.edu/international-student-services/about-us.html
   ✅ Found: https://www.buffalo.edu/international-student-services/han-nee-test.html
   ✅ Found: https:

In [80]:
# Load the crawled URLs into documents with the BeautifulSoup web reader
loader = BeautifulSoupWebReader()
documents = loader.load_data(urls)

In [81]:
# Instantiate the Gemini LLM with its default arguments.
llm = Gemini()
# Instantiate the Gemini embedding model, selecting "embedding-001" by name.
embed_model = GeminiEmbedding(model_name="embedding-001")

  llm = Gemini()
  embed_model = GeminiEmbedding(model_name="embedding-001")


In [None]:
# Register the LLM, embedding model and chunk size on the global LlamaIndex Settings
Settings.llm = llm
Settings.embed_model = embed_model
Settings.chunk_size = 512

# Setup Pinecone
# The key is never written into the notebook: it is taken from the
# PINECONE_API_KEY environment variable when set, otherwise prompted for
# without echo (same approach as the Gemini key).
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API Key: ")
)
index_name = "gemini-chatbot-2"

In [83]:
# Create the index only when no index with this name is listed yet.
existing_index_names = {idx.name for idx in pc.list_indexes()}
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        # NOTE(review): presumably the vector size produced by the embedding
        # model configured earlier — confirm before changing models.
        dimension=768,
        metric="cosine",
        spec=serverless_spec,
    )

In [84]:
# Handle to the Pinecone index named by `index_name`
pinecone_index = pc.Index(index_name)

# Store and index: wrap the Pinecone index as a vector store and build a
# storage context on top of it
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the vector index from the loaded documents through that storage context
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)

Upserted vectors: 100%|██████████| 923/923 [00:04<00:00, 229.06it/s]


In [99]:
# Query engine over the index, configured with similarity_top_k=5
query_engine = index.as_query_engine(similarity_top_k=5)

# Query the index, send the context to Gemini, and wait for the response
question = "How to I talk with iss?"
gemini_response = query_engine.query(question)

In [100]:
# Print the response object returned by the query above.
print(gemini_response)

International Student Services is located at 210 Talbert Hall, Buffalo, NY 14260-1604. They can be reached by phone at (716) 645-2258 or by email at iss@buffalo.edu.

