# SITAFAL_TASK-2

# CHAT WITH WEBSITE USING RAG PIPELINE

# IMPORT THE REQUIRED MODULES

In [None]:
import requests
from bs4 import BeautifulSoup

In [4]:
from sentence_transformers import SentenceTransformer




In [5]:
import faiss
import numpy as np
import os

# FUNCTIONS TO EXTRACT, CHUNK, EMBED AND INDEX THE DATA FROM A WEBSITE

# Scrape website content
def scrape_website(url, timeout=10):
    """Download a web page and return its text content.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout ``requests`` may block indefinitely.

    Returns:
        The text of the parsed HTML document, with the individual text
        fragments separated by newlines.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of indexing an error page as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text(separator="\n")

# Chunk text for embeddings
def chunk_text(text, chunk_size=500, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks share ``overlap`` words, so each chunk starts
    ``chunk_size - overlap`` words after the previous one.

    Args:
        text: The text to split. Whitespace runs are collapsed because the
            text is split into words and re-joined with single spaces.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings, empty when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat the overlap tail of this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks

# Generate embeddings using a pre-trained model
# Loaded once at module level so every call reuses the same model instance.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


def generate_embeddings(chunks):
    """Encode text chunks with the shared sentence-transformer model.

    Args:
        chunks: The texts to embed, passed straight to
            ``embedding_model.encode``.

    Returns:
        Whatever ``embedding_model.encode`` returns for ``chunks``
        (presumably one vector per chunk -- confirm against the
        sentence-transformers documentation).
    """
    vectors = embedding_model.encode(chunks)
    return vectors

# Store embeddings in a vector database
def store_embeddings(chunks, embeddings):
    """Build an in-memory FAISS L2 index over the chunk embeddings.

    Args:
        chunks: The text chunks, in the same order as ``embeddings``.
        embeddings: A 2-D array-like with one row per chunk.

    Returns:
        A tuple ``(index, metadata)`` where ``index`` is a
        ``faiss.IndexFlatL2`` holding the vectors and ``metadata`` maps each
        row position in the index to its chunk text.

    Raises:
        ValueError: If the number of embedding rows differs from the number
            of chunks, which would leave index positions without text.
    """
    # FAISS expects a C-contiguous float32 matrix; coerce defensively so
    # lists or float64 arrays are accepted as well.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.shape[0] != len(chunks):
        raise ValueError(
            f"got {vectors.shape[0]} embeddings for {len(chunks)} chunks"
        )

    vector_dim = vectors.shape[1]
    index = faiss.IndexFlatL2(vector_dim)
    index.add(vectors)
    metadata = dict(enumerate(chunks))
    return index, metadata

In [7]:
def query_database(query, index, metadata, top_k=5):
    """Return the stored chunks closest to ``query`` in the vector index.

    Args:
        query: The user's question as plain text.
        index: The search index; its ``search`` method is called with the
            encoded query and ``top_k``.
        metadata: Mapping from index position to chunk text.
        top_k: Number of neighbours to request from the index.

    Returns:
        A list with the text of every valid hit. When there is no valid hit
        the list holds a single placeholder message instead.
    """
    query_vector = embedding_model.encode([query])
    _, neighbour_ids = index.search(query_vector, top_k)

    # An id of -1 marks an invalid result slot and is skipped.
    matches = [metadata[hit] for hit in neighbour_ids[0] if hit != -1]
    return matches or ["No relevant information found for the query."]

In [8]:
import openai


# API_KEY

In [13]:
# SECURITY: never hard-code API keys in source files or notebooks. The key is
# read from the OPENAI_API_KEY environment variable instead; any key that was
# previously committed here must be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [14]:
def generate_response(query, retrieved_chunks):
    """Ask the chat model to answer ``query`` using the retrieved chunks.

    Args:
        query: The user's question.
        retrieved_chunks: Text chunks to use as context; they are joined
            with newlines and placed in front of the question.

    Returns:
        The model's answer with surrounding whitespace stripped, or a fixed
        message when ``retrieved_chunks`` is empty (no API call is made in
        that case).
    """
    # Without any context there is nothing to ground an answer on.
    if not retrieved_chunks:
        return "No relevant information retrieved to answer the query."

    context = "\n".join(retrieved_chunks)
    prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]

    # Chat completion request; swap the model name for "gpt-4" if desired.
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=200,
        temperature=0.7,
    )

    # The answer text sits in the first choice's message.
    answer = completion['choices'][0]['message']['content']
    return answer.strip()

In [15]:
def rag_pipeline(url, user_query):
    """Run the whole pipeline: scrape, chunk, embed, retrieve, answer.

    Args:
        url: Address of the website to use as the knowledge source.
        user_query: The question to answer from that website's content.

    Returns:
        The generated answer, or a short explanatory message when one of
        the stages produces nothing to work with.
    """
    # Stage 1: fetch the page text.
    page_text = scrape_website(url)
    if not page_text.strip():
        return "No text content could be extracted from the website."

    # Stage 2: split into chunks and index their embeddings.
    pieces = chunk_text(page_text)
    if not pieces:
        return "The website content could not be chunked properly."

    vectors = np.array(generate_embeddings(pieces))
    index, metadata = store_embeddings(pieces, vectors)
    if index.ntotal == 0:
        return "No embeddings were added to the vector database."

    # Stages 3 and 4: retrieve the closest chunks, then generate the answer.
    return generate_response(
        user_query,
        query_database(user_query, index, metadata),
    )

In [None]:
# Example run: answer one question about one website and print the result.
if __name__ == "__main__":
    website_url = "https://www.washington.edu/" # Replace with the website to query
    user_query = "What information does this website provide?"
    # Runs scraping, indexing, retrieval and answer generation in one call.
    result = rag_pipeline(website_url, user_query)
    print("Generated Response:", result)