In [1]:
# 🚀 RAG Pipeline with Real HTML Page as Knowledge Base
# 🌐 Using mentalhealth.org as example source

# Step 1: Install Dependencies
!pip install requests beautifulsoup4 sentence-transformers faiss-cpu transformers --quiet

# Step 2: Import Libraries
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from transformers import pipeline

print("✅ Libraries imported!")

# Step 3: Scrape and Clean Text from mentalhealth.org
url = "https://www.mentalhealth.org.uk/explore-mental-health/publications/understanding-anxiety"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# Extract visible text only
paragraphs = [p.get_text(strip=True) for p in soup.find_all('p') if p.get_text(strip=True)]

# Optional: Preview a few paragraphs
print("\n📝 Sample extracted paragraphs:")
for para in paragraphs[:3]:
    print("-", para)

# Step 4: Prepare Knowledge Base
knowledge_base = paragraphs

# Step 5: Vectorize Knowledge Base
model = SentenceTransformer('all-MiniLM-L6-v2')
kb_embeddings = model.encode(knowledge_base, convert_to_tensor=False)

# Step 6: Create FAISS Index
index = faiss.IndexFlatL2(kb_embeddings[0].shape[0])
index.add(np.array(kb_embeddings))
print(f"✅ FAISS index built with {index.ntotal} documents!")

# Step 7: Simulate User Input
user_input = "I'm struggling with anxiety and racing thoughts. What can I do?"

# Step 8: Retrieve Relevant Info
query_embedding = model.encode([user_input])
distances, indices = index.search(np.array(query_embedding), k=3)

retrieved_docs = [knowledge_base[idx] for idx in indices[0]]

print("\n🔍 Retrieved documents:")
for doc in retrieved_docs:
    print("-", doc)

# Step 9: Generate Answer using Retrieved Context
generator = pipeline("text-generation", model="distilgpt2")

context = " ".join(retrieved_docs)
prompt = f"Context: {context}\nQuestion: What can I do if I'm struggling with anxiety?\nAnswer:"

response = generator(prompt, max_length=100, num_return_sequences=1)

print("\n💡 Chatbot Suggestion:")
print(response[0]['generated_text'])

# Step 10: Export Retrieved Chunks and Answer
with open("rag_html_retrieved.txt", "w") as f:
    f.write("Retrieved Documents:\n")
    for doc in retrieved_docs:
        f.write(f"- {doc}\n")
    f.write("\nGenerated Answer:\n")
    f.write(response[0]['generated_text'])

print("\n✅ RAG pipeline completed! Retrieved documents and answer exported to 'rag_html_retrieved.txt'")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ FAISS index built with 3 documents!

🔍 Retrieved documents:
- Home of Mental Health Awareness Week
- Alternatively, please go to ourHome pageor use the search box to find what you're looking for.
- We're sorry, we can't find that page.


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



💡 Chatbot Suggestion:
Context: Home of Mental Health Awareness Week Alternatively, please go to ourHome pageor use the search box to find what you're looking for. We're sorry, we can't find that page.
Question: What can I do if I'm struggling with anxiety?
Answer: A lot of the time, we talk about it in our talk. When someone is anxious of a new thing to do, we usually want someone to talk about it and they need to think clearly. When making a

✅ RAG pipeline completed! Retrieved documents and answer exported to 'rag_html_retrieved.txt'
