# In [None]:
# ai_brochure_generator

import json
import os
from typing import List
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Read settings from the .env file; override=True is passed so that values
# from the file replace variables already present in the environment.
load_dotenv(override=True)
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast when the key is absent or does not carry the expected
# project-key prefix, instead of failing later on the first API call.
if not (api_key and api_key.startswith("sk-proj-")):
    raise ValueError("⚠️ Invalid or missing OpenAI API key. Check your .env file.")

# Chat model used for both link selection and brochure writing.
MODEL = "gpt-4o-mini"

# Shared API client. No key is passed explicitly here, so the SDK is
# expected to pick it up from the environment loaded above.
openai = OpenAI()

# Website Scraper Class
class WebScraper:
    """Load a web page in headless Chrome and expose its title, text and links.

    All work happens in the constructor: the page is fetched through
    Selenium, parsed with BeautifulSoup, and the results are stored as
    attributes.

    Attributes:
        url: The address that was requested.
        html: Page source reported by the browser after loading ``url``.
        soup: Parsed document. ``script``, ``style``, ``img`` and ``input``
            tags inside ``<body>`` are removed from it during text extraction.
        title: Stripped ``<title>`` text, or ``"No title found"``.
        text: Body text with fragments stripped and joined by newlines;
            empty string when the document has no ``<body>``.
        links: ``href`` values of all anchors, exactly as written in the page
            (they may be relative), excluding ``mailto:`` links.
    """

    def __init__(self, url: str):
        self.url = url
        self.html = self._fetch_html()
        self.soup = BeautifulSoup(self.html, "html.parser")
        self.title = self._extract_title()
        # Text extraction prunes tags from the shared soup; links are
        # collected afterwards, so they never come from a pruned subtree.
        self.text = self._extract_text()
        self.links = self._extract_links()

    def _fetch_html(self) -> str:
        """Return the page source of ``self.url`` rendered by headless Chrome.

        The browser is always shut down, even when navigation fails, so a
        failed fetch does not leave a Chrome process behind.
        """
        options = Options()
        for argument in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "window-size=1920,1080",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/124.0.0.0 Safari/537.36",
        ):
            options.add_argument(argument)

        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        try:
            driver.get(self.url)
            return driver.page_source
        finally:
            driver.quit()

    def _extract_title(self) -> str:
        """Return the stripped ``<title>`` text, or ``"No title found"``.

        The placeholder is also used when the tag exists but its ``.string``
        is ``None`` (an empty title, or one that wraps nested markup), which
        previously raised ``AttributeError``.
        """
        title_tag = self.soup.title
        if title_tag is None or title_tag.string is None:
            return "No title found"
        return title_tag.string.strip()

    def _extract_text(self) -> str:
        """Return the body text, one stripped fragment per line.

        Non-textual tags are removed from ``self.soup`` in place first.
        Returns an empty string when the document has no ``<body>``.
        """
        body = self.soup.body
        if body is None:
            return ""
        for tag in body.find_all(["script", "style", "img", "input"]):
            tag.decompose()
        return body.get_text(separator="\n", strip=True)

    def _extract_links(self) -> List[str]:
        """Return every non-empty anchor ``href`` that is not a ``mailto:`` link."""
        links = []
        for anchor in self.soup.find_all("a"):
            href = anchor.get("href")
            if href and not href.startswith("mailto:"):
                links.append(href)
        return links

    def formatted(self) -> str:
        """Return the title and text as a labelled block for use in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n"

# System prompt for the link-selection call: tells the model which kinds of
# pages to keep for the brochure, which to leave out, and shows the JSON shape
# it must reply with (an object whose "links" list holds {"type", "url"} items).
link_filter_prompt = """
You are provided with a list of links from a company's website. Select those most relevant for a professional brochure.
Include pages such as: About Us, Mission, Vision, Innovation, Sustainability, Products, Careers, and Brand.
Exclude links like: Contact, Terms, Cookies, Email, Store, Support, Investor Relations, and Social Media.

Respond in JSON format:
{
  "links": [
    {"type": "about page", "url": "https://example.com/about"},
    {"type": "careers page", "url": "https://example.com/jobs"}
  ]
}
"""

def generate_link_prompt(scraper: WebScraper) -> str:
    """Build the user message that lists the links found on a scraped page.

    The message is a header line naming ``scraper.url`` followed by the
    entries of ``scraper.links``, one per line.
    """
    link_lines = "\n".join(scraper.links)
    return f"Here is a list of links from the website {scraper.url}:\n{link_lines}"

def extract_links(url: str) -> List[dict]:
    """Ask the model which links on the page at *url* belong in a brochure.

    The page is scraped, its links are sent to the model together with
    ``link_filter_prompt``, and the model's JSON reply is parsed.

    Args:
        url: Address of the page whose links should be filtered.

    Returns:
        The list stored under the ``"links"`` key of the reply. An empty
        list is returned when the reply has no content or no usable
        ``"links"`` entry, instead of raising ``TypeError``/``KeyError``.

    Raises:
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    scraper = WebScraper(url)
    chat = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_filter_prompt},
            {"role": "user", "content": generate_link_prompt(scraper)},
        ],
        # Request a JSON object so the reply can be parsed directly.
        response_format={"type": "json_object"},
    )
    # The message content may be missing; treat that as an empty object.
    reply = json.loads(chat.choices[0].message.content or "{}")
    # Covers both an absent key and an explicit null value.
    return reply.get("links") or []

def collect_full_content(url: str) -> str:
    """Scrape the landing page and every model-selected sub-page into one text.

    Args:
        url: Address of the company's landing page.

    Returns:
        The formatted landing page followed by one ``## <Type>`` section per
        selected link, in the order the links were returned.
    """
    main_scraper = WebScraper(url)
    sections = [f"Landing Page:\n{main_scraper.formatted()}"]
    for item in extract_links(url):
        # Hrefs are collected from the page verbatim, so the model may hand
        # back a relative link such as "/about". The browser needs an
        # absolute address; urljoin leaves absolute URLs untouched.
        page_url = urljoin(url, item["url"])
        print(f"Fetching: {page_url}")
        page = WebScraper(page_url)
        sections.append(f"\n\n## {item['type'].capitalize()}\n{page.formatted()}")
    # Join once at the end rather than growing a string inside the loop.
    return "".join(sections)

# System prompt for the brochure-writing call. The Polish text instructs the
# model to act as a creative copywriter producing an extensive company brochure
# from scraped website content: write in Polish, at least 4–6 sentences per
# section, use markdown, emoji, lists and bold text, cover the sections
# Why us / Products / Clients / Process / Values / Contact (extra sections are
# allowed), and make the result look like an attractive PDF document.
brochure_prompt = """
Jesteś kreatywnym copywriterem, który tworzy rozbudowaną broszurę firmową na podstawie treści pobranej ze strony internetowej.
Piszesz po polsku. W każdej sekcji minimum 4–6 zdań. Używasz markdown, emoji, list, pogrubień i atrakcyjnego języka.
Sekcje: 🎯 Dlaczego my, 🛠️ Produkty, 🤝 Klienci, 🚀 Proces, 🌍 Wartości, 📩 Kontakt. Możesz dodać własne.
Broszura ma wyglądać jak atrakcyjny dokument PDF.
"""

def make_brochure_prompt(company: str, url: str) -> str:
    """Build the user message for the brochure request.

    The message names the company and appends the scraped site content,
    cut to at most 100,000 characters.
    """
    header = f"Firma: {company}\n\nTreść ze strony:\n"
    site_content = collect_full_content(url)
    return header + site_content[:100_000]

def display_brochure(company: str, url: str) -> None:
    """Generate a brochure for *company* and render it as Markdown.

    The reply is streamed and the displayed Markdown is refreshed as text
    arrives. If streaming fails, the same request is retried once without
    streaming and the complete reply is displayed.

    Args:
        company: Company name placed in the prompt.
        url: Landing page whose content feeds the brochure.
    """
    # Build the messages once, outside the try block. Scraping is the
    # expensive step, so the fallback reuses this prompt instead of
    # scraping the whole site a second time, and a scraping error is
    # raised directly rather than being reported as a streaming failure.
    messages = [
        {"role": "system", "content": brochure_prompt},
        {"role": "user", "content": make_brochure_prompt(company, url)},
    ]
    try:
        stream = openai.chat.completions.create(
            model=MODEL,
            messages=messages,
            stream=True,
        )
        response = ""
        display_handle = display(Markdown(""), display_id=True)
        for chunk in stream:
            # Some chunks carry no choices at all; nothing to render then.
            if not chunk.choices:
                continue
            # delta.content can be None (e.g. on the closing chunk). The
            # previous hasattr() check let None through, so "+=" raised
            # TypeError and every run ended in fallback mode.
            piece = chunk.choices[0].delta.content
            if not piece:
                continue
            response += piece
            update_display(Markdown(response), display_id=display_handle.display_id)
    except Exception as error:
        print("Stream failed. Fallback mode activated.")
        # Surface the cause instead of discarding it.
        print(f"Reason: {error!r}")
        full = openai.chat.completions.create(
            model=MODEL,
            messages=messages,
        )
        display(Markdown(full.choices[0].message.content))

# Example usage: when this file runs as the main module, build and display a
# brochure for the company name "OpenAI" from the site at https://openai.com/.
if __name__ == "__main__":
    display_brochure("OpenAI", "https://openai.com/")

