In [24]:
import requests
from bs4 import BeautifulSoup

# List of epilepsy-related websites
urls = [
    "https://epilepsysociety.org.uk/",
    "https://www.epilepsy.com/",
    "https://www.who.int/news-room/fact-sheets/detail/epilepsy"
    "https://epilepsyontario.org/about-epilepsy/frequently-asked-questions/"

]

scraped_data = []

# Custom headers to mimic a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.google.com",
}

def scrape_website(url):
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raises an error for 4xx and 5xx status codes
        
        soup = BeautifulSoup(response.text, "lxml")
        paragraphs = soup.find_all("p")
        page_text = "\n".join([para.get_text() for para in paragraphs])

        return page_text
    except requests.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return ""


# Scrape all URLs
for url in urls:
    scraped_content = scrape_website(url)
    if scraped_content:
        scraped_data.append(scraped_content)

# Save scraped text for later use
with open("epilepsy_data.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(scraped_data))

print("Scraping completed and saved to epilepsy_data.txt")


Error scraping https://www.epilepsy.com/: 403 Client Error: Forbidden for url: https://www.epilepsy.com/
Error scraping https://www.who.int/news-room/fact-sheets/detail/epilepsyhttps://epilepsyontario.org/about-epilepsy/frequently-asked-questions/: 403 Client Error: Forbidden for url: https://www.who.int/news-room/fact-sheets/detail/epilepsyhttps://epilepsyontario.org/about-epilepsy/frequently-asked-questions/
Scraping completed and saved to epilepsy_data.txt


In [25]:
import chromadb
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document

# Load scraped text
with open("epilepsy_data.txt", "r", encoding="utf-8") as f:
    epilepsy_content = f.read()

# Split text into small chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
text_chunks = text_splitter.split_text(epilepsy_content)

# Convert text chunks to LangChain Document objects
documents = [Document(page_content=chunk) for chunk in text_chunks]

# Load Hugging Face embeddings (FREE)
embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize ChromaDB using `from_documents()`
vectorstore = Chroma.from_documents(documents, embedding_function, persist_directory="./chromadb")

print("Data successfully embedded and stored in ChromaDB.")


Data successfully embedded and stored in ChromaDB.
