In [25]:
import os
import time
import textwrap
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from langchain_groq import ChatGroq
from langchain.docstore.document import Document 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import WebBaseLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

In [2]:
# Load environment variables from .env file (instead of Streamlit secrets)
load_dotenv()

# Load API keys from environment variables
groq_api_key = os.getenv("GROQ_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")

In [7]:
# Initialize LLM
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="mixtral-8x7b-32768"
)

In [3]:
# Define the prompt template
prompt = ChatPromptTemplate.from_template(
    """
    You are a Webpage Assistant that helps users to find information in a context.
    Please provide the most accurate response based on the context and inputs.
    Only give information that is in the context, not in general.
    <context>
    {context}
    </context>
    Questions:{input}
    """
)

In [32]:
def extract_links(homepage_url):
    try:
        # Fetch the webpage content
        response = requests.get(homepage_url)
        response.raise_for_status()  # Check if the request was successful

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all anchor tags (links) on the page
        links = soup.find_all('a', href=True)

        # Extract href attribute from each link and join with the homepage URL
        all_links = set()
        homepage_domain = urlparse(homepage_url).netloc  # Extract domain from homepage URL

        for link in links:
            href = link['href']

            # Filter out non-HTTP/HTTPS and non-relative links
            if href.startswith(('http://', 'https://')):
                # Check if the link belongs to the same domain
                link_domain = urlparse(href).netloc
                if link_domain == homepage_domain:
                    full_link = urljoin(homepage_url, href)  # Join with the base URL if needed
                    all_links.add(full_link)

            elif href.startswith('/'):  # Handle relative URLs
                full_link = urljoin(homepage_url, href)
                all_links.add(full_link)

        return all_links

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return set()

# Function to process the entire website, including the homepage URL
def vector_embedding(webpage_link):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    all_links = extract_links(webpage_link)
    
    if webpage_link not in all_links:
        all_links.add(webpage_link)
    
    # Initialize the list of final_documents
    final_documents = []
    
    # Iterate over all links to load and process the documents
    for link in all_links:
        loader = WebBaseLoader(link)  # Load webpage
        docs = loader.load()  # Document loading
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # Chunk creation

        # Split the documents into chunks and append each as a Document
        for doc in text_splitter.split_documents(docs):  # Splitting
            # Ensure 'doc' is a string before passing it to Document
            doc_str = str(doc)  # Convert to string if necessary
            document = Document(page_content=doc_str)  # Wrap each chunk into a Document object
            final_documents.append(document)

    # Now that all documents are wrapped with 'page_content', create vectors
    vectors = FAISS.from_documents(final_documents, embeddings)  # Vector embeddings
    return vectors
    

# Input for webpage link
webpage_link = "https://nijhoom.com/"

# Process the webpage to create embeddings
vectors = vector_embedding(webpage_link)
print("Vector Store DB is ready.")

Vector Store DB is ready.


In [34]:
# Input for the question
prompt1 = "what are the holiday packages?"

if prompt1:
    document_chain = create_stuff_documents_chain(llm, prompt)
    retriever = vectors.as_retriever()
    retrieval_chain = create_retrieval_chain(retriever, document_chain)
    start = time.process_time()
    response = retrieval_chain.invoke({'input': prompt1})
    print(f"Response time: {time.process_time() - start} seconds")
    wrapped_text = textwrap.fill(response['answer'], width=160)
    print(wrapped_text)

else:
    print("Please enter a valid question.")

Response time: 0.02803059300000399 seconds
Based on the provided context, the Bangladesh Holiday Packages refer to the arrival to departure Bangladesh Holiday & Vacation Packages listed on the page.
These packages include accommodations suitable for Western travelers, with complimentary breakfast and airport transfers. They are available to book anytime
outside the schedule on a private basis for a group of two travelers. Hotel upgrades in Dhaka and other available places can be provided. The price for the last
night's accommodation will be deducted if you depart on an evening flight. Off-season discounts are available from April to September, and an early-bird
discount is offered for the first two travelers to book any scheduled tours six months in advance. Free cancellation up to 45 days before the tour and Covid-19
protection are also provided for these holiday packages in Bangladesh. For meals, dinner is not included, but breakfast is provided at the hotel or a local
restaurant. Al