In [1]:
import json
import os
import time
from typing import List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Settings for the local Ollama server.
# MODEL is the model name passed to the chat-completion calls in later cells.
MODEL = "llama3.2"

# Native Ollama chat endpoint and a JSON content-type header.
# NOTE(review): neither constant is referenced in the cells shown here, which
# talk to Ollama through the OpenAI client instead -- confirm before removing.
HEADERS = {"Content-Type": "application/json"}
OLLAMA_API = "http://localhost:11434/api/chat"

In [3]:
# HTTP headers sent with every page fetch made by Website.
# The User-Agent mimics a desktop Chrome browser -- presumably so that sites
# which reject the default client User-Agent still respond (TODO confirm).
header = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """A fetched web page reduced to its title, visible text and links.

    Attributes set by ``__init__``:
        url:   the address that was requested.
        body:  the raw response bytes.
        title: the text of ``<title>``, or "No title found".
        text:  text of ``<body>`` with script/style/img/input tags removed,
               one fragment per line ("" when the page has no body).
        links: every non-empty ``href`` found on an ``<a>`` tag, as written in
               the page (relative links are NOT resolved here).
    """

    def __init__(self, url, timeout=10):
        """Download ``url`` and parse it.

        Args:
            url: page address to fetch.
            timeout: seconds to wait for the server before giving up.

        Raises:
            Whatever ``requests.get`` raises on connection failure or timeout.
            HTTP error statuses are not raised; their body is parsed as-is.
        """
        self.url = url
        # Always bound the request: without a timeout a stalled server would
        # block the whole notebook cell indefinitely.
        response = requests.get(url, headers=header, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # soup.title.string is None when <title> is empty or has nested tags;
        # fall back to the placeholder instead of storing None.
        page_title = soup.title.string if soup.title else None
        self.title = page_title if page_title else "No title found"

        if soup.body:
            # Drop tags that carry no readable text before extracting it.
            for irrelevant in soup.body(['script', 'style', 'img', 'input']):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator='\n', strip=True)
        else:
            self.text = ""

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as a block for an LLM prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
# System prompt for the link-selection request: asks the model to pick the
# brochure-worthy links and shows the JSON shape expected in the reply.
system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)

system_prompt += 'You should respond in JSON as in this example:'
# The example has to be valid JSON itself (comma between the "type" and "url"
# members); a malformed example invites malformed replies.
system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [5]:
def get_user_prompt(website):
    """Build the user message that asks the model to pick brochure links.

    Args:
        website: object exposing ``url`` (str) and ``links`` (iterable of str),
            e.g. a ``Website`` instance.

    Returns:
        The prompt text: instructions, then a "Links" header, then one link
        per line.
    """
    # NOTE(review): the prompt asks for "JSON format" and also for "python
    # list form"; the wording is kept as-is because downstream parsing was
    # written against the replies it produces -- confirm which is intended.
    instructions = (
        f'Here is the list of links on the website of {website.url} - '
        "please decide which of these are relevant web links  for a brochure about the company, respond with the full https URL in JSON format. "
        "do not include Terms of Service, Privacy, email links.\n"
        "Give the output strictly in python list form\n"
    )
    # The header is the last line before the list, and each link sits on its
    # own line, so the first link is never glued onto the instruction text.
    link_lines = "\n".join(website.links)
    return f"{instructions}Links (some might be relative links):\n{link_lines}"

In [6]:
def get_links(url, link_system_prompt=None):
    """Ask the local model which links on ``url`` belong in a brochure.

    Args:
        url: page whose links should be classified.
        link_system_prompt: system prompt for the request. When omitted, the
            module-level ``system_prompt`` is read at call time.

    Returns:
        The model's reply parsed from JSON. ``{"links": []}`` is returned when
        the reply is empty or is not valid JSON, so callers can proceed as if
        no links had been selected.
    """
    # NOTE(review): a later cell rebinds the global `system_prompt` to the
    # brochure-writing prompt, so once that cell has run the fallback below is
    # no longer the link-selection prompt. Pass `link_system_prompt`
    # explicitly (or give the two prompts distinct names) to avoid this.
    if link_system_prompt is None:
        link_system_prompt = system_prompt

    website = Website(url)
    ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')
    response = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_user_prompt(website)},
        ],
        response_format={"type": "json_object"},
    )

    result = response.choices[0].message.content
    if not result:
        # The message content can be None/empty; json.loads would raise.
        return {"links": []}
    try:
        return json.loads(result)
    except json.JSONDecodeError as e:
        print(f"Could not parse link selection for {url}: {e}")
        return {"links": []}
    

In [7]:
import time

def retry(url, retries=5, delay=2):
    """Fetch ``url`` and return its formatted contents, retrying on failure.

    Args:
        url: page address to fetch.
        retries: maximum number of attempts.
        delay: seconds to wait between attempts.

    Returns:
        ``Website(url).get_contents()`` from the first successful attempt, or
        a "Failed to retrieve contents" line once every attempt has failed.
    """
    for attempt in range(retries):
        try:
            return Website(url).get_contents()
        # Deliberately broad: any fetch or parse failure counts as a failed
        # attempt so one bad page cannot abort the whole brochure.
        except Exception as e:
            print(f"Error occurred for {url}: {e}. Attempt {attempt + 1} of {retries}.")
            # No point waiting after the final attempt.
            if attempt + 1 < retries:
                time.sleep(delay)
    print(f"Skipping {url} after {retries} failed attempts.")
    return f"Failed to retrieve contents for {url}\n"

def _link_urls(links, base_url):
    """Extract absolute http(s) URLs from the model's link selection."""
    if not isinstance(links, dict):
        return []
    entries = links.get("links") or links.get("relevant_links") or []
    if not isinstance(entries, list):
        return []

    urls = []
    for entry in entries:
        # Entries are either bare URL strings or {"type": ..., "url": ...}.
        candidate = entry.get("url") if isinstance(entry, dict) else entry
        if not isinstance(candidate, str) or not candidate.strip():
            continue
        # Resolve relative links against the landing page.
        absolute = urljoin(base_url, candidate.strip())
        # Skip schemes that cannot be fetched as a page (mailto:, tel:, ...).
        if absolute.startswith(("http://", "https://")):
            urls.append(absolute)
    return urls


def get_details(url):
    """Collect the landing page plus every model-selected page as one text.

    Args:
        url: landing page of the site.

    Returns:
        "Landing page:" followed by its contents, then for each selected link
        the link URL and that page's contents. When no usable link was
        selected, "No relevant links found." is appended instead.
    """
    result = "Landing page:\n"
    result += retry(url)
    links = get_links(url)
    print("Found links:", links)

    link_urls = _link_urls(links, url)
    if not link_urls:
        return result + "\nNo relevant links found.\n"

    for link_url in link_urls:
        result += f"\n\n{link_url}\n"
        result += retry(link_url)
    return result


In [13]:
# System prompt for the brochure-writing requests (read by create_brochure and
# stream_brochure).
# NOTE(review): this rebinds the module-level name `system_prompt`, which an
# earlier cell assigns to the link-selection prompt. Every function that looks
# the name up at call time sees this brochure prompt once this cell has run.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a website "
    "( it can be company website, college club website, someone's portfolio) so be prepared "
    "and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. "
    # Explicit trailing space: the two sentences must not fuse together.
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [14]:
def brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure request.

    Args:
        company_name: name to present the site under.
        url: landing page; its contents and those of the selected links are
            gathered through ``get_details``.
        max_chars: upper bound on the prompt length in characters; pass
            ``None`` to disable truncation.

    Returns:
        The prompt text, cut to at most ``max_chars`` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a professional brief brochure of the website in markdown.At footer always give the contacts link of their social media handles (except phone number) if available\n"
    user_prompt += get_details(url)
    # Truncate so the scraped text cannot overflow the model's context.
    return user_prompt[:max_chars]

In [15]:
def create_brochure(company_name, url):
    """Generate a brochure in a single completion and render it as Markdown.

    Args:
        company_name: name to present the site under.
        url: landing page of the site.

    The result is shown with ``display``; nothing is returned.
    """
    client = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": brochure_user_prompt(company_name, url)},
    ]
    completion = client.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [16]:
def stream_brochure(company_name, url):
    """Generate a brochure and render it incrementally as tokens arrive.

    Args:
        company_name: name to present the site under.
        url: landing page of the site.

    The brochure is shown through a display handle that is refreshed on every
    chunk; nothing is returned.
    """
    ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')
    stream = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": brochure_user_prompt(company_name, url)},
        ],
        stream=True,
    )

    raw = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        if not chunk.choices:
            # Some chunks carry no choices; nothing to render.
            continue
        raw += chunk.choices[0].delta.content or ''
        # Strip only the code-fence wrapper. The raw text is kept unmodified so
        # a fence split across chunks ("```" then "markdown") is still matched,
        # and the word "markdown" inside the brochure itself is left alone.
        cleaned = raw.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)

In [17]:
# NOTE(review): the company name "GitHub" does not match the URL (google.com).
# The model is told it is describing GitHub while being fed Google's pages --
# confirm which of the two was intended and align them.
stream_brochure(company_name="GitHub", url="https://google.com/")

Found links: {'links': ['https://about.google/?fg=1&utm_source=google-IN&utm_medium=referral&utm_campaign=hp-header', 'https://store.google.com/IN?utm_source=hp_header&utm_medium/google_ooo&utm_campaign=GS100042&hl=en-IN', 'https://www.google.co.in/intl/en/about/products', 'https://mail.google.com/', 'https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=GAZAmgQ']}


**Welcome to GitHub: The World's Largest Community of Coders**

[Image: A group of diverse developers collaborating on a laptop]

At GitHub, we're passionate about building software and making the world a better place. Our mission is simple:

- Give everyone a chance to participate in the world of open source.
- Help people develop great things.
- Offer the best platform for free and open development.

Our Vision: To bring the world closer together by empowering individuals worldwide with technology.



**Who We Are**

GitHub was founded in 2008 by Tom Preston-Werner, Chris Wanstrath, and PJ Hyett. Today, we're a community of over 100 million developers from around the world.



**Our Culture**

At GitHub, we value our team members' well-being above all else. This includes maintaining healthy boundaries between work and life, promoting diversity, equity, inclusion, and belonging (DEIB), as well as fostering psychological safety at every stage of growth to bring out the diverse talents of an individual.

We strive create a place where everyone feels needed, valued and can pursue their passions on coding.



**The GitHub Community**

The GitHub community is home to 100 million registered developers across more than 190 countries. This community is comprised of:
- Open-source contributors.
- Innovators and inventors.
- Professional coders.

It's an inclusive space where developers collaborate with ease, showcase their work, and learn from one another.



**What We Offer**

GitHub helps people develop great things by offering various tools and resources to its developers.



*   **Hosting**: Create private websites for teams, projects
*   **Open Source Fostering**: An incredibly diverse collection of code repositories.
*   **APIs and Webhooks**: Tools for integrating GitHub services into user applications.
*   **Integration with Git**: Services such as Pull Requests and Code Review.

Our mission is to ensure that every person has access to a world in which the internet isn't a barrier to participating. We believe our platform, tools or any of the services we create can bring more people, and communities closer together.



**Careers & Opportunities**

Want to be part of this vibrant community? GitHub offers many diverse career opportunities across a range of roles!

Discover all your available job opening on <https://github.com/careers>



**Connect with Us**

Stay up-to-date with the latest product updates from our team, read about what we're doing in our blog, and ask questions on our forums: 
<a href="https://blog.github.com/">https://blog.github.com/#/follow</a>

<a href='https://github.com/community# '>  GitHub Community - Join our community!</a>