# DETI AI Bot (FastAPI + HTML5)
This notebook runs a complete web application.
1. **Crawl** `ua.pt/pt/deti`
2. **Index** into Qdrant.
3. **Serve** an API and a Chat UI on `http://localhost:8000`.

In [15]:
import time
import requests
import qdrant_client
from llama_index.core import VectorStoreIndex, Settings, StorageContext
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.readers.web import BeautifulSoupWebReader
from tqdm.notebook import tqdm

from pprint import pprint

from collections import deque
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# --- 1. SETUP & CRAWL ---
COLLECTION_NAME = "UA Bot"
LLM_MODEL="gemma3:latest"
EMB_MODEL="nomic-embed-text"
START_URL = "https://en.wikipedia.org/wiki/University_of_Aveiro"
MAX_DEPTH = 2        # Requirement 2: Defined Max Depth
MAX_PAGES = 15       # Limit to prevent infinite crawling


Settings.llm = Ollama(model=LLM_MODEL, request_timeout=120.0,temperature=0.1,
system_prompt="You are a strict assistant. Answer ONLY based on the context provided.")
Settings.embed_model = OllamaEmbedding(model_name=EMB_MODEL)
Settings.text_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=50)

# Clean DB
client = qdrant_client.QdrantClient(url="http://localhost:6333")
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)

vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

HEADERS = {
    'User-Agent': 'DETI_Student_Demo_Bot/1.0 (student_project_demo; contact: your_email@example.com)'
}

In [16]:
urls_to_index = []
visited = set()
seen_in_queue = {START_URL}

print(f"üï∑Ô∏è Starting Crawl from {START_URL}...")

queue = deque([(START_URL, 0)])

with tqdm(total=MAX_PAGES, desc="Crawling URLs") as pbar:
    while queue:
        if len(urls_to_index) >= MAX_PAGES:
            break

        url, current_depth = queue.popleft()
        if current_depth > MAX_DEPTH and url in visited:
            continue

        try:
            time.sleep(0.25)
            r = requests.get(url, headers=HEADERS, timeout=5)

            # Skip non-HTML
            if r.status_code != requests.codes.ok:
                continue

            # Add to Index List
            visited.add(url)
            urls_to_index.append(url)
            pbar.update(1)

            if current_depth >= MAX_DEPTH:
                continue

            soup = BeautifulSoup(r.text, 'html.parser')
            for a in soup.find_all('a', href=True):
                full = urljoin(url, a['href']).split('#')[0]

                if ("en.wikipedia.org/wiki/" in full
                    and "Aveiro" in full
                    and ":" not in a['href']
                    and full not in visited
                    and full not in seen_in_queue):

                    seen_in_queue.add(full)
                    queue.append((full, current_depth + 1))
        except Exception as e:
            print(e)

print(f"Found {len(urls_to_index)} pages.")
pprint(urls_to_index)

üï∑Ô∏è Starting Crawl from https://en.wikipedia.org/wiki/University_of_Aveiro...


Crawling URLs:   0%|          | 0/15 [00:00<?, ?it/s]

Found 13 pages.
['https://en.wikipedia.org/wiki/University_of_Aveiro',
 'https://en.wikipedia.org/wiki/Aveiro,_Portugal',
 'https://en.wikipedia.org/wiki/Regi%C3%A3o_de_Aveiro',
 'https://en.wikipedia.org/wiki/Aveiro_District',
 'https://en.wikipedia.org/wiki/Duchy_of_Aveiro',
 'https://en.wikipedia.org/wiki/Diocese_of_Aveiro',
 'https://en.wikipedia.org/wiki/Aveiro_(district)',
 'https://en.wikipedia.org/wiki/Ria_de_Aveiro',
 'https://en.wikipedia.org/wiki/Cathedral_of_Aveiro',
 'https://en.wikipedia.org/wiki/Ovos_Moles_de_Aveiro',
 'https://en.wikipedia.org/wiki/Universidade_de_Aveiro',
 'https://en.wikipedia.org/wiki/Est%C3%A1dio_Municipal_de_Aveiro',
 'https://en.wikipedia.org/wiki/Aveiro_Lagoon']


In [None]:
docs = []

# Ensure we process exactly what the crawler found
for url in tqdm(urls_to_index, desc="Indexing Content"):
    try:
        # 1. Manual Fetch with Headers
        r = requests.get(url, headers=HEADERS, timeout=5)
        if r.status_code != 200:
            continue

        # 2. Extract Text & Metadata
        soup = BeautifulSoup(r.text, 'html.parser')

        # Extract Text (Clean)
        paragraphs = soup.find_all('p')
        clean_text = "\n".join([p.get_text() for p in paragraphs])

        if len(clean_text) < 50:
            continue

        # Extract Metadata Fields
        title = soup.title.string if soup.title else "No Title"
        h1 = soup.find('h1').get_text() if soup.find('h1') else "No Header"
        lang = soup.html.get('lang', 'en')

        # 3. Create Document with Rich Metadata
        d = Document(
            text=clean_text,
            metadata={
                "url": url,
                "title": title.strip(),
                "header": h1.strip(),
                "language": lang,
                "content_length": len(clean_text)
            }
        )
        docs.append(d)

    except Exception as e:
        print(f"Failed to index {url}: {e}")

# Build Index
if docs:
    index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)
    query_engine = index.as_query_engine(similarity_top_k=3)
    print(f"‚úÖ Successfully Indexed {len(docs)} documents with rich metadata.")
else:
    print("‚ùå No documents indexed.")

Indexing Content:   0%|          | 0/13 [00:00<?, ?it/s]

‚úÖ Successfully Indexed 13 documents with rich metadata.


In [18]:
# --- DEBUG CELL ---
question = "When was the university of Aveiro founded?"
print(f"‚ùì Question: {question}")

# Ask the engine
response = query_engine.query(question)
print(f"ü§ñ Answer: {response}\n")

print("--- WHAT THE LLM ACTUALLY SAW (Top Match) ---")
if response.source_nodes:
    # Print the text of the top retrieved chunk
    top_chunk = response.source_nodes[0].node.get_text()
    print(top_chunk[:1000] + "...") # Print first 1000 chars
else:
    print("‚ùå No documents were retrieved! The database might be empty or search failed.")

‚ùì Question: When was the university of Aveiro founded?
ü§ñ Answer: The University of Aveiro was founded in 1973.

--- WHAT THE LLM ACTUALLY SAW (Top Match) ---
The rectory (built by Gon√ßalo Byrne) is located in a white building near the campus centre.

The campus also has an entire Administration and Accounting Institute, which has its own service facilities and parking lots.

The projects developed by UA are developed under 20 research centres, of many different scientific areas:

As a research-led institution, during 2015, 316 research and technology transfer projects were active in UA. 80 of these projects are/were funded by International and European Programmes, of which 27 by the 7th Framework Programme, 13 by the Horizon 2020 and 17 by the ERASMUS +.

40¬∞37‚Ä≤49‚Ä≥N 8¬∞39‚Ä≤27‚Ä≥WÔªø / Ôªø40.6303¬∞N 8.6575¬∞WÔªø / 40.6303; -8.6575...


In [None]:
# --- 3. SERVER LOGIC (FLASK VERSION) ---
from flask import Flask, request, jsonify, make_response
import os
import threading

# Initialize Flask
app = Flask(__name__)

# Route for the UI
@app.route("/")
def home():
    if not os.path.exists("index.html"):
        return "<h1>Error: index.html not found. Please create it next to the notebook.</h1>"

    # Read and serve the HTML file
    with open("index.html", "r", encoding="utf-8") as f:
        content = f.read()
    return content

# Route for the Chat API
@app.route("/api/chat", methods=["POST"])
def chat():
    try:
        data = request.json
        question = data.get("question", "")

        if not question:
            return jsonify({"answer": "Please ask a question.", "source": None})

        # Query the LlamaIndex Engine
        # Note: This runs synchronously, blocking only this request
        response = query_engine.query(question)

        # Extract source URL if available
        source = "Unknown"
        if response.source_nodes:
            source = response.source_nodes[0].node.metadata.get('url', 'Unknown')

        return jsonify({"answer": str(response), "source": source})

    except Exception as e:
        print(f"Error: {e}")
        return jsonify({"answer": "I encountered an error processing your request.", "source": "System"}), 500

# --- 4. RUN SERVER ---
print("üöÄ Server starting on http://localhost:8000")
print("Press 'Stop' in the notebook toolbar to shut it down.")

# We run Flask on port 8000.
# debug=False is important in notebooks to prevent reloading issues.
app.run(host="0.0.0.0", port=8000, debug=False)

üöÄ Server starting on http://localhost:8000
Press 'Stop' in the notebook toolbar to shut it down.
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8000
 * Running on http://192.168.17.88:8000
Press CTRL+C to quit
127.0.0.1 - - [02/Feb/2026 12:57:42] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [02/Feb/2026 12:58:14] "POST /api/chat HTTP/1.1" 200 -
127.0.0.1 - - [02/Feb/2026 12:59:48] "POST /api/chat HTTP/1.1" 200 -
127.0.0.1 - - [02/Feb/2026 13:00:28] "POST /api/chat HTTP/1.1" 200 -
127.0.0.1 - - [02/Feb/2026 13:01:31] "POST /api/chat HTTP/1.1" 200 -
127.0.0.1 - - [02/Feb/2026 13:03:18] "POST /api/chat HTTP/1.1" 200 -
