In [1]:
pip install beautifulsoup4 requests sentence-transformers faiss-cpu openai transformers


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [3]:
import time

code:


In [9]:
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
import faiss
import openai
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import re

# Step 1: Crawl and Scrape COntent of the website
def scrape_website(url):

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract text from all paragraphs, headers, so on excetra....
    paragraphs = soup.find_all(['p', 'h1', 'h2', 'h3', 'ul', 'li'])
    text = " ".join([para.get_text() for para in paragraphs])

    # Clean up any unnecessary spaces or special characters,using REgular EXPRESSIONS
    text = re.sub(r'\s+', ' ', text)

    return text

# Step 2: Chunking and Embedding Content
def chunk_and_embed(text, model, max_chunk_size=512):

    chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    embeddings = model.encode(chunks)
    return chunks, embeddings

# Step 3: Store Embeddings in a Vector Database (FAISS)
def store_embeddings(embeddings):

    db = faiss.IndexFlatL2(len(embeddings[0]))
    db.add(embeddings)
    return db

# Step 4: Query Handling - Search Similarity in the Vector Database
def query_vector_db(query, model, db, chunks, k=5):

    query_vector = model.encode([query])
    distances, indices = db.search(query_vector, k)
    results = [chunks[i] for i in indices[0]]
    return results

# Step 5: Response Generation Using GPT-2 (or similar LLM)
def generate_response(chunks, query, retries=3, backoff=5):

    # Prepare the prompt
    prompt = f"Based on the following content, answer the question: {query}\n\n"
    for chunk in chunks:
        prompt += f"- {chunk}\n"

    # Load the GPT-2 model and tokenizer
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_name = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
# or
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Set pad_token_id to eos_token_id for GPT-2
    model.config.pad_token_id = model.config.eos_token_id

    # Tokenize the prompt and handle the attention mask
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True, padding=True).to(device)

    # Retry logic for generating response
    for attempt in range(retries):
        try:
            # Generate response
            outputs = model.generate(
                inputs['input_ids'],
                attention_mask=inputs['attention_mask'],  # Attention mask is set
                max_new_tokens=200,  # Set max_new_tokens instead of max_length to avoid conflicts
                do_sample=True,      # Ensure sampling for diversity
                no_repeat_ngram_size=2,
                top_p=0.95,
                top_k=50,
                temperature=0.7,     # Adjust temperature for creativity
            )

            # Decode the generated response
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            return response

        except Exception as e:
            print(f"Error: {e}")
            time.sleep(backoff)  # Retry after a backoff period
            continue

    return "Unable to generate response due to an error."

# Example Usage
if __name__ == "__main__":
    url = "https://www.uchicago.edu/"  # URL of the website to scrape

    # Step 1: Scrape the website content
    scraped_content = scrape_website(url)

    # Step 2: Initialize the embedding model and process the scraped text
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    chunks, embeddings = chunk_and_embed(scraped_content, embedding_model)

    # Step 3: Store embeddings in the FAISS vector database
    faiss_db = store_embeddings(embeddings)

    # Step 4: Query the vector database
    query = "What is the University of Chicago known for?"
    relevant_chunks = query_vector_db(query, embedding_model, faiss_db, chunks)

    # Step 5: Generate a response using GPT-2 (or other LLM)
    response = generate_response(relevant_chunks, query)
    print("Response:", response)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Response: Based on the following content, answer the question: What is the University of Chicago known for?

- 20241217T024330Z-1577f6f44dbgtczfhC1ATL819w0000000h100000000092q4
- 20241217T024330Z-1577f6f44dbgtczfhC1ATL819w0000000h100000000092q4
- 20241217T024330Z-1577f6f44dbgtczfhC1ATL819w0000000h100000000092q4
- 20241217T024330Z-1577f6f44dbgtczfhC1ATL819w0000000h100000000092q4
- 20241217T024330Z-1577f6f44dbgtczfhC1ATL819w0000000h100000000092q4
 and we will find that the university is known as the "National University."
.
 - 20241301Z158850e2f1b8c4b2a838a0a4f8f9c3e1f7e3c1c2c64b1e7f2d65d3a9b5f5b6a7
The University is also known to be the home of the most famous and influential people in the world.
