In [4]:
pip install sentence-transformers faiss-cpu requests beautifulsoup4




In [5]:
from sentence_transformers import SentenceTransformer
import faiss
import requests
from bs4 import BeautifulSoup


In [6]:
def crawl_and_scrape(urls, timeout=10):
    """Download each URL and return the text of its ``<p>`` elements.

    Scraping is best-effort: a URL that fails (network error, HTTP error
    status, parse failure) is reported on stdout and skipped so the
    remaining URLs are still processed.

    Args:
        urls: Iterable of URL strings to fetch.
        timeout: Seconds to wait on each request before giving up.
            Passed straight to ``requests.get``.

    Returns:
        One string containing the paragraph text of every successfully
        scraped page, separated by single spaces. Empty string when no
        page could be scraped.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    content = []
    for url in urls:
        try:
            # requests has no default timeout; without one a single
            # unresponsive host would block the whole crawl forever.
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            text = ' '.join(p.get_text() for p in soup.find_all('p'))
            content.append(text)
        except Exception as e:
            # Deliberately broad: one bad page must not abort the crawl.
            print(f"Error scraping {url}: {e}")
    return ' '.join(content)


In [7]:
def chunk_text(text, chunk_size=100):
    """Split ``text`` into consecutive chunks of whitespace-separated words.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of strings; each holds up to ``chunk_size`` words joined by
        single spaces. The final chunk may be shorter. Text with no words
        yields an empty list.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces


In [8]:
def store_embeddings(chunks, model):
    """Embed the text chunks and load the vectors into a FAISS L2 index.

    Args:
        chunks: Non-empty sequence of text chunks to embed.
        model: Object exposing ``encode(list_of_str)`` that returns a 2-D
            array-like with a ``shape`` attribute (rows = chunks).

    Returns:
        ``(index, chunks)`` — the populated ``faiss.IndexFlatL2`` and the
        same ``chunks`` object, so row ``i`` of the index maps to
        ``chunks[i]``.

    Raises:
        ValueError: If ``chunks`` is empty. There is nothing to embed and
            the embedding dimension cannot be read from an empty result.
    """
    # Fail early with a clear message instead of an opaque IndexError from
    # ``embeddings.shape[1]`` when scraping produced no text.
    if len(chunks) == 0:
        raise ValueError("store_embeddings requires at least one text chunk")

    embeddings = model.encode(chunks)
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index, chunks


In [9]:
def query_vector_database(query, index, model, chunks, top_k=3):
    """Return the stored chunks nearest to ``query`` in embedding space.

    Args:
        query: The search string.
        index: Index exposing ``search(vectors, k)`` that returns
            ``(distances, indices)``.
        model: Object exposing ``encode(list_of_str)``; used to embed the
            query.
        chunks: Sequence of text chunks, positionally aligned with the
            vectors held in ``index``.
        top_k: Maximum number of results to return.

    Returns:
        A list of at most ``top_k`` chunks in the order the index returned
        them. Shorter than ``top_k`` when the index holds fewer vectors.
    """
    query_embedding = model.encode([query])
    _, indices = index.search(query_embedding, top_k)
    # FAISS pads the result with -1 when fewer than top_k neighbours exist.
    # Indexing chunks[-1] would silently return the last chunk as a bogus
    # hit, so drop the padding labels.
    return [chunks[i] for i in indices[0] if i >= 0]


In [10]:
def generate_response(results):
    """Combine the retrieved chunks into one newline-separated string.

    Args:
        results: Iterable of strings (the retrieved text chunks).

    Returns:
        The items of ``results`` joined with a newline between each pair;
        an empty string when ``results`` is empty.
    """
    separator = "\n"
    return separator.join(results)


In [12]:


def main():
    """Run the scrape -> chunk -> embed -> query pipeline interactively.

    Scrapes a fixed list of university home pages, indexes their paragraph
    text, prompts the user for a query on stdin, and prints the closest
    matching chunks. Returns early with a message when no text could be
    scraped, since there would be nothing to embed or search.
    """
    # Step 1: Define URLs
    urls = [
        "https://www.uchicago.edu/",
        "https://www.washington.edu/",
        "https://www.stanford.edu/",
        "https://und.edu/",
    ]

    # Step 2: Initialize embedding model
    print("Loading embedding model...")
    model = SentenceTransformer("all-MiniLM-L6-v2")  # Pre-trained model

    # Step 3: Crawl and scrape websites
    print("Crawling and scraping websites...")
    website_data = crawl_and_scrape(urls)

    # Step 4: Chunk and embed content
    print("Chunking and embedding content...")
    chunks = chunk_text(website_data)
    if not chunks:
        # Every URL failed or yielded no paragraph text; embedding an empty
        # list would crash further down, so stop with a clear message.
        print("No content could be scraped; nothing to search.")
        return
    index, stored_chunks = store_embeddings(chunks, model)

    # Step 5: Handle user query
    query = input("Enter your query: ")
    print("Searching content...")
    results = query_vector_database(query, index, model, stored_chunks)

    # Step 6: Generate and display response
    print("Generating response...")
    response = generate_response(results)
    print(response)

if __name__ == "__main__":
    main()


Loading embedding model...
Crawling and scraping websites...
Chunking and embedding content...
Enter your query: computer science
Searching content...
Generating response...
on-campus and online degrees. Explore the causes and impact of criminal behavior and prepare to play a key role in criminal justice. Immerse yourself in teaching and prepare for a rewarding career educating grades 1-8. Join a UAS ecosystem that puts you on the cutting edge of what’s next in the industry. A high-demand field, you'll drive positive business decisions with meaningful data. Our students embark on a journey to become leaders. Not just in what they say, but what they do. Through hard work and determination, we believe in work worth doing. Student has dream of being like the
arts foundation and deep subject-area expertise Undergraduate Education Unsurpassed opportunities to participate in the advancement of entire fields of knowledge Graduate education Continuing adult education, executive and professiona