# Summarise websites
1. Website scraper
2. Data embedding
3. Database storage
4. Data feeding to the LLM
5. Output generation

In [10]:
import requests
from bs4 import BeautifulSoup
from langchain_community.embeddings import OllamaEmbeddings
from dotenv import load_dotenv
import os
from pinecone import Pinecone as PineconeClient
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain_community.chat_models import ChatOllama
from langchain.chains import RetrievalQA
from langchain_core.callbacks import StdOutCallbackHandler
import pandas as pd
import requests



# Load environment variables from the local `.env` file into the process
# environment. PINECONE_API_KEY is read later through os.getenv() when the
# vector database client is constructed.
load_dotenv(dotenv_path='.env')


True

In [11]:
# Extract text from website
class TextExtractor:
    """Turn raw HTML into plain text with BeautifulSoup."""

    @staticmethod
    def extract_text(html_content):
        """Return the visible text of an HTML document as one string.

        ``<script>`` and ``<style>`` elements are removed before the text is
        collected. The remaining text fragments are stripped of surrounding
        whitespace and joined with single spaces.

        Args:
            html_content: HTML markup to parse.

        Returns:
            The joined text, or ``None`` if anything goes wrong while parsing
            (the error is printed rather than raised).
        """
        try:
            parsed = BeautifulSoup(html_content, 'html.parser')

            # Drop elements whose contents are code or styling, not page text.
            non_text_tags = parsed.find_all(["script", "style"])
            for tag in non_text_tags:
                tag.decompose()

            fragments = parsed.stripped_strings
            return ' '.join(fragments)
        except Exception as e:
            print(f"Error parsing the website content: {e}")
            return None

In [12]:
# Download website contents
class WebsiteDownloader:
    """Download web pages with ``requests`` and collect their extracted text."""

    # Browser-like headers sent with every request. Presumably these make the
    # request look like a regular desktop Chrome visit -- confirm whether the
    # target sites actually require all of them.
    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-GB,en;q=0.6',
        'Sec-Ch-Ua': '"Google Chrome";v="128", "Chromium";v="128", ";Not A Brand";v="99"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"macOS"',
        'Sec-Ch-Ua-Arch': '"x86"',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Upgrade-Insecure-Requests': '1',
    }

    @staticmethod
    def download_website(url, timeout=30):
        """Fetch the HTML content of a website.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server before giving up.
                ``requests`` applies no timeout by default, so without this a
                stalled server would block the whole run indefinitely.

        Returns:
            The response body as text, or ``None`` if the request failed or
            the server answered with an HTTP error status (the error is
            printed rather than raised).
        """
        try:
            response = requests.get(
                url,
                headers=WebsiteDownloader._REQUEST_HEADERS,
                timeout=timeout,
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Error downloading the website: {e}")
            return None
        return response.text

    @staticmethod
    def extract_contents_from_links(csv_file, skip=11):
        """Read links from a CSV file and extract the text of each page.

        The CSV is read without a header row and links are taken from its
        first column.

        Args:
            csv_file: Path of the CSV file holding one link per row.
            skip: Number of leading links to ignore. Defaults to 11, the value
                that used to be hard-coded here; negative values are treated
                as 0.

        Returns:
            A dict mapping each successfully processed link to its extracted
            text. Links that could not be downloaded, or whose text could not
            be extracted, are left out. An empty dict is returned if the CSV
            cannot be read.
        """
        try:
            links_df = pd.read_csv(csv_file, header=None)  # The file has no header row
            links = links_df[0].tolist()[max(skip, 0):]
        except Exception as e:
            print(f"Error reading the CSV file: {e}")
            return {}

        all_contents = {}

        for link in links:
            print(f"Downloading content from: {link}")
            html_content = WebsiteDownloader.download_website(link)
            if not html_content:
                continue

            text_content = TextExtractor.extract_text(html_content)
            if text_content is None:
                # Extraction failed; storing None would only break consumers
                # that expect text for every key.
                continue
            all_contents[link] = text_content
            # NOTE: requests are issued back-to-back; a randomised delay
            # between downloads was sketched here previously but never enabled.

        return all_contents

In [13]:
# Setup and add vectorized content to a Vector Database
class VectorDatabase:
    """Pinecone-backed vector store that embeds text with Ollama embeddings."""

    def __init__(self):
        """Connect to the ``chatbot`` Pinecone index using PINECONE_API_KEY."""
        self.embedding_model = OllamaEmbeddings()
        api_key = os.getenv("PINECONE_API_KEY")
        self.pinecone_client = PineconeClient(api_key=api_key)
        index = self.pinecone_client.Index("chatbot")
        self.vector_db = PineconeVectorStore(index=index, embedding=self.embedding_model)

    def add_text(self, text, source_url):
        """Split ``text`` into overlapping chunks and store them in the index.

        Each chunk is stored as a ``Document`` whose metadata records
        ``source_url`` under the ``"source"`` key.

        Args:
            text: Full text to split and store.
            source_url: URL the text was taken from.

        Any exception raised while splitting or storing is printed and
        swallowed, so the method always returns ``None``.
        """
        try:
            # 500-character chunks, each sharing 100 characters with its neighbour.
            splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

            documents = []
            for piece in splitter.split_text(text):
                documents.append(
                    Document(page_content=piece, metadata={"source": source_url})
                )

            self.vector_db.add_documents(documents)
        except Exception as e:
            print(f"Error adding text to vector database: {e}")

    def query_text(self, key):
        """Retrieve text embeddings from the database.

        Placeholder: not implemented yet, always returns ``None``.
        """
        return None

def create_qa_chain(vector_db):
    """Build a RetrievalQA chain backed by the given vector database.

    Args:
        vector_db: Wrapper object whose ``vector_db`` attribute is the vector
            store to retrieve from.

    Returns:
        The configured chain, or ``None`` if any step of the setup raised
        (the error is printed rather than propagated).
    """
    try:
        # Chat model with default settings; another LLM could be swapped in here.
        llm = ChatOllama()
        retriever = vector_db.vector_db.as_retriever()
        log_to_stdout = [StdOutCallbackHandler()]  # Optional: logs chain start/finish

        return RetrievalQA.from_chain_type(
            llm=llm,
            retriever=retriever,
            callbacks=log_to_stdout,
        )
    except Exception as e:
        print(f"Error creating QA chain: {e}")
        return None

def create_qa_chain(vector_db):
    """Create a RetrievalQA chain for querying."""
    try:
        # Initialize the LLM (Chat model)
        chat_model = ChatOllama()  # Or another LLM model

        # Initialize the RetrievalQA chain
        chain = RetrievalQA.from_chain_type(
            llm=chat_model,
            retriever=vector_db.vector_db.as_retriever(),
            callbacks=[StdOutCallbackHandler()]  # Optional: Callback for logging
        )
        
        return chain
    except Exception as e:
        print(f"Error creating QA chain: {e}")
        return None


In [14]:
if __name__ == "__main__":
    # Connect to the vector store first; the query cells further down reuse `vecdb`.
    vecdb = VectorDatabase()

    # CSV listing the pages to scrape. Keep it next to this file or give a full path.
    csv_file_path = "links.csv"

    # Download every listed page and reduce it to plain text, keyed by URL.
    contents = WebsiteDownloader.extract_contents_from_links(csv_file_path)

    # Walk the scraped pages and report progress. An empty result simply
    # produces no iterations, so no separate emptiness check is needed.
    count = 0
    for url, text_content in contents.items():
        print(f"ADDED WEBSITE TO The DATABASE {count}")
        count = count + 1
        # NOTE: ingestion is currently switched off. Call
        # vecdb.add_text(text_content, url) here to actually store the page.

Downloading content from: https://www.newzealand.com/nz/feature/canterbury-stargazing-and-lakes-itinerary/
Downloading content from: https://www.newzealand.com/nz/feature/south-island-tranzalpine-itinerary/
Downloading content from: https://www.newzealand.com/nz/feature/south-island-golf-highlights-itinerary/
Downloading content from: https://www.newzealand.com/nz/feature/south-island-southern-scenic-itinerary/
Downloading content from: https://www.newzealand.com/nz/feature/the-great-alpine-highway/
Downloading content from: https://www.newzealand.com/nz/feature/south-island-mountain-biking-itinerary/
Downloading content from: https://www.newzealand.com/nz/feature/south-island-heritage-journey/
Downloading content from: https://www.newzealand.com/nz/feature/south-island-heritage-itinerary/
Downloading content from: https://www.newzealand.com/nz/feature/nature-and-nightlife-in-lower-south-island/
Downloading content from: https://www.newzealand.com/nz/feature/christchurch-to-wellington-

In [15]:

# Create the QA chain on top of the vector database built in the previous cell.
qa_chain = create_qa_chain(vecdb)

# Example query to the QA chain. create_qa_chain() returns None when setup
# fails, so only query when a chain was actually created.
if qa_chain:
    response = qa_chain.invoke("tell me about Christchurch to Nelson Loop?")
    print(f"Response: {response}")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Response: {'query': 'tell me about Christchurch to Nelson Loop?', 'result': "Based on the information provided, here is what I can tell you about the Christchurch to Nelson Loop:\n\nThe Christchurch to Nelson Loop is a scenic drive that takes you through some of New Zealand's most beautiful landscapes. The drive is approximately 300 km long and takes around 4-5 hours to complete, depending on the stops you make along the way.\n\nHere are some of the highlights of the loop:\n\n1. Christchurch: Start your journey in Christchurch, a vibrant city with a rich history and culture. Visit the Christchurch Cathedral, take a stroll along the Avon River, and explore the city's many gardens and parks.\n2. Lyttelton: Located just south of Christchurch, Lyttelton is a picturesque port town with a rich history. Take a scenic drive through the town's historic quarter, visit the Lyttelton Museum, and enjoy the views of the harbor 

In [16]:
# Second example query, reusing the chain created earlier. The topic looks
# deliberately unrelated to the scraped travel pages -- presumably to see how
# the chain answers when the index holds nothing relevant.
if qa_chain:
    response = qa_chain.invoke("tell me about Yoda?")
    print(f"Response: {response}")



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Response: {'query': 'tell me about Yoda?', 'result': 'I\'m just an AI, I don\'t have access to information about a person named "Yoda" as there is no such person in the context of New Zealand travel or any other topic. Yoda is a fictional character from the Star Wars franchise, so I cannot provide any information about him. My apologies!'}
