In [13]:
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import ollama
import time

# Request headers sent with every scrape; a desktop-browser User-Agent is
# supplied in place of the HTTP library's default one.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

# Class to fetch webpage content
class Website:
    """Download a web page and expose its title, visible text, and links.

    Attributes set by ``__init__``:
        url: The URL that was requested.
        body: Raw bytes of the HTTP response.
        soup: ``BeautifulSoup`` parse tree of ``body``.
        title: Text of the ``<title>`` tag, or ``"No title found"``.
        text: Newline-separated text of ``<body>`` with script, style, img
            and input elements removed; ``""`` when there is no ``<body>``.
        links: Absolute URLs for every ``<a>`` tag that has a non-empty href.
    """

    def __init__(self, url, timeout=10):
        """Fetch and parse ``url``.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up.
                Without a timeout ``requests`` can block indefinitely.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        self.soup = BeautifulSoup(self.body, 'html.parser')

        # ``.string`` is None when <title> is empty or has nested children,
        # so fall back in that case too, not only when the tag is missing.
        title_tag = self.soup.title
        title_text = title_tag.string if title_tag else None
        self.title = title_text if title_text else "No title found"

        # Clean up text by removing elements that carry no readable content
        if self.soup.body:
            for irrelevant in self.soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = self.soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Resolve relative hrefs against the page URL; skip anchors without one.
        hrefs = [anchor.get('href') for anchor in self.soup.find_all('a')]
        self.links = [urljoin(self.url, href) for href in hrefs if href]

# Function to filter relevant links using Ollama
def filter_relevant_links(links, *, max_retries=3, retry_delay=5):
    """Ask the local Ollama model which of ``links`` belong in a brochure.

    Args:
        links: List of URL strings scraped from the company's site.
        max_retries: Maximum number of attempts to call the model before the
            last error is re-raised (values below 1 are treated as 1).
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The URLs found in the model's reply, one per reply line, with
        surrounding whitespace and leading list markers removed. An empty
        list is returned without calling the model when ``links`` is empty.

    Raises:
        Exception: Whatever ``ollama.chat`` raised on the final attempt.
    """
    # Nothing to classify: skip the model call entirely.
    if not links:
        return []

    links_text = "\n".join(links)
    prompt = f"""
    Here is a list of URLs extracted from a company's website:

    {links_text}

    Your task:
    - Identify URLs that would be most relevant to include in a brochure about the company, such as About page, Careers/Job page, Services.
    - Ignore links to terms of service, privacy, login pages, or external sites.
    - Return only relevant URLs without any additional text or explanations.

    Respond with the filtered list.
    """

    # Retry a bounded number of times so a permanently failing backend
    # surfaces as an error instead of looping forever.
    attempts = max(1, max_retries)
    response = None
    for attempt in range(1, attempts + 1):
        try:
            response = ollama.chat(model="llama3.2", messages=[{"role": "user", "content": prompt}])
            break
        except Exception as e:
            if attempt == attempts:
                raise
            print(f"Error encountered: {e}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    relevant_links = []
    for line in response['message']['content'].split("\n"):
        # Normalise before testing the scheme: replies are often indented
        # or formatted as "- url" / "* url" bullet lists.
        candidate = line.strip().lstrip("-* \t").strip()
        if candidate.startswith(("http://", "https://")):
            relevant_links.append(candidate)
    return relevant_links

# Function to generate the brochure
def generate_brochure_from_contents(scraped_text, *, max_retries=3, retry_delay=5):
    """Generate brochure text from scraped page content via the Ollama model.

    Args:
        scraped_text: Concatenated text of the pages to summarise.
        max_retries: Maximum number of attempts to call the model before the
            last error is re-raised (values below 1 are treated as 1).
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        The ``content`` string of the model's reply message.

    Raises:
        Exception: Whatever ``ollama.chat`` raised on the final attempt.
    """
    prompt = f"""
    Create a brochure based on the following website content:

    {scraped_text}

    Your task is to generate a professional brochure without any picture that highlights key information about the company, such as:
    - Services
    - Products
    - Values
    - Mission
    - Impact
    - Webpage
    - Contact e-mail

    The brochure should be concise and informative. Do not include placeholders like '[Cover Page]', '[Insert Twitter Handle]', or similar.
    Ensure all content is strictly from the provided context.
    """

    system_prompt = """You are a helpful assistant tasked with generating a professional brochure from a company's web content. Follow these rules:
    1. Focus only on the company's relevant information (services, products, values, mission).
    2. Ignore external links, placeholders, or unrelated content.
    3. Do not add unnecessary formatting (e.g., page numbers, cover page, or picture placeholders).
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # Retry a bounded number of times so a permanently failing backend
    # surfaces as an error instead of looping forever.
    attempts = max(1, max_retries)
    response = None
    for attempt in range(1, attempts + 1):
        try:
            response = ollama.chat(model="llama3.2", messages=messages)
            break
        except Exception as e:
            if attempt == attempts:
                raise
            print(f"Error encountered: {e}. Retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)

    return response['message']['content']

# Convert to markdown format
def convert_to_markdown(content):
    """Normalise model output into brochure-style Markdown.

    Each input line is handled independently:
      * a line that is entirely one bold span (``**Title**``) becomes a
        level-2 heading (``## Title``);
      * a line starting with ``"* "`` becomes a ``"- "`` bullet;
      * every other line is passed through unchanged, so inline emphasis
        such as ``*word*`` or ``**Label:** text`` is not mangled.

    Args:
        content: Text to convert; lines are separated by ``"\\n"``.

    Returns:
        The converted text, with every line (including the last) terminated
        by a newline.
    """
    converted = []
    for line in content.split('\n'):
        # Ignore trailing whitespace when deciding whether the line is
        # wrapped in ``**``.
        trimmed = line.rstrip()
        inner = trimmed[2:-2]
        is_bold_only = (
            len(trimmed) > 4
            and trimmed.startswith("**")
            and trimmed.endswith("**")
            and "**" not in inner          # exactly one bold span
            and inner.strip("* ") != ""    # not just a run of asterisks
        )
        if is_bold_only:
            converted.append(f"## {inner.strip()}")
        elif line.startswith("* "):
            converted.append(f"- {line[2:]}")
        else:
            converted.append(line)

    return "".join(f"{line}\n" for line in converted)

# Define the function to be used in Gradio
def process_url(url):
    """Build a Markdown brochure for the company website at ``url``.

    The landing page is fetched, its links are filtered by the model, each
    relevant page is scraped, and the combined text is turned into a
    brochure.

    Args:
        url: Website address as typed by the user; ``https://`` is prepended
            when no ``http://`` or ``https://`` scheme is present.

    Returns:
        The brochure as Markdown, or a short human-readable message when the
        input is blank, the site cannot be fetched, or no usable pages are
        found.
    """
    url = (url or "").strip()
    if not url:
        return "Please enter a website URL."

    # Test for a real scheme: a bare "http" prefix would wrongly accept
    # hosts such as "httpbin.org".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # Fetch the website
    try:
        web = Website(url)
    except requests.RequestException as exc:
        return f"Could not fetch {url}: {exc}"

    # Step 1: Get filtered links
    filter_links = filter_relevant_links(web.links)

    if not filter_links:  # If no relevant links are found, return an error message
        return "No relevant links found. Try a different website."

    # Step 2: Extract text from each relevant link
    web_contents = []
    for link in filter_links:
        try:
            web_page = Website(link)
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole brochure.
            print(f"Skipping {link}: {exc}")
            continue

        # Keep only lines with more than 3 words (drops menus, buttons, etc.)
        filtered_text = "\n".join(
            line for line in web_page.text.split("\n") if len(line.split()) > 3
        )

        # Store extracted content
        web_contents.append((web_page.title, link, filtered_text))

    if not web_contents:
        return "None of the relevant pages could be fetched. Try a different website."

    # Step 3: Prepare text for Ollama
    scraped_text = "\n\n".join(
        f"Title: {title}\nLink: {link}\nContent: {text}"
        for title, link, text in web_contents
    )

    # Step 4: Generate brochure and convert it to Markdown
    brochure = generate_brochure_from_contents(scraped_text)
    return convert_to_markdown(brochure)

# Gradio UI: one text box for the site URL, rendered Markdown for the result.
url_input = gr.Textbox(label="Enter Website URL")
brochure_output = gr.Markdown(label="Generated Brochure")

iface = gr.Interface(
    process_url,
    url_input,
    brochure_output,
    title="Company Brochure Generator",
    description="Enter a company website URL to generate a markdown-formatted brochure.",
)

# Start the local Gradio server
iface.launch()


* Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.


