<a href="https://colab.research.google.com/github/mayukvtypetone/mayukvtypetone/blob/main/Scrape_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai
!pip install beautifulsoup4
!pip install httpx
!pip install lxml
!pip install markdownify

Collecting openai
  Downloading openai-1.6.1-py3-none-any.whl (225 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.4/225.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting typing-extensions<5,>=4.7 (from openai)
  Downloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.2-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.9/76.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0

In [None]:
from openai import OpenAI
from bs4 import BeautifulSoup
import httpx
from urllib.parse import urlparse
import json
from timeit import default_timer as timer
from datetime import timedelta
import markdownify
from pprint import pprint

In [None]:
# Pull the OpenAI key from Colab's `userdata` store instead of hardcoding it in
# the notebook, then build the single client shared by every model call below.
from google.colab import userdata

api_key = userdata.get("OPENAI_API_KEY")
openai_client = OpenAI(api_key=api_key)

In [None]:
def get_sitemap_locs(url):
  """Fetch the sitemap at ``url`` and return the text of every ``<loc>`` entry.

  Args:
    url: Address of a sitemap (or sitemap index) XML document.

  Returns:
    A list of strings, one per non-empty ``<loc>`` element in document order,
    with surrounding whitespace removed.
  """
  raw_xml = httpx.get(url, follow_redirects=True)
  soup = BeautifulSoup(raw_xml.content, "xml")
  # Pretty-printed sitemaps wrap the URL in newlines/indentation. Strip it so
  # callers can compare URLs and test their suffix reliably, and drop entries
  # that turn out to be empty.
  stripped = (loc.text.strip() for loc in soup.find_all("loc"))
  return [loc_url for loc_url in stripped if loc_url]

def get_sitemap_index(url):
  """Return the ``href`` of the page's ``<link rel="sitemap">`` tag.

  Args:
    url: Address of an HTML page to inspect.

  Returns:
    The ``href`` attribute exactly as written in the page; it may be a
    relative or an absolute URL.

  Raises:
    ValueError: If the page has no ``<link rel="sitemap">`` tag, or the tag
      has no (or an empty) ``href`` attribute.
  """
  source = httpx.get(url, follow_redirects=True)
  soup = BeautifulSoup(source.content, "html.parser")
  sitemap_link = soup.find("link", rel="sitemap")
  # `find` returns None when nothing matches; report that explicitly instead of
  # failing later with "'NoneType' object is not subscriptable".
  href = sitemap_link.get("href") if sitemap_link is not None else None
  if not href:
    raise ValueError(f'No <link rel="sitemap"> with an href found at {url}')
  return href

def crawl_sitemaps(url, sitemaps_used=None, urls=None):
  """Recursively collect every non-sitemap URL reachable from a sitemap.

  Args:
    url: Either a sitemap URL ending in ".xml", or (for the initial call) a
      page whose ``<link rel="sitemap">`` tag points at the sitemap.
    sitemaps_used: Sitemap URLs already visited; used to avoid fetching the
      same sitemap twice. A fresh list is created when omitted.
    urls: Accumulator for the discovered page URLs. A fresh list is created
      when omitted.

  Returns:
    The ``urls`` list, holding each discovered page URL once, in the order it
    was first seen.
  """
  from urllib.parse import urljoin

  # Create the accumulators per call: mutable default arguments would be shared
  # by every top-level call, so a second crawl would skip sitemap discovery and
  # inherit the previous crawl's URLs.
  if sitemaps_used is None:
    sitemaps_used = []
  if urls is None:
    urls = []

  if url in sitemaps_used:
    return urls

  sitemap_url = url
  if not sitemaps_used and not url.endswith(".xml"):
    # Initial call on a regular page: resolve the advertised sitemap href
    # against the page URL so relative, root-relative and absolute hrefs all
    # produce a valid address.
    sitemap_url = urljoin(url, get_sitemap_index(url))
  sitemaps_used.append(sitemap_url)

  for loc_url in get_sitemap_locs(sitemap_url):
    if loc_url.endswith(".xml"):
      # Nested sitemap: descend, sharing both accumulators explicitly.
      crawl_sitemaps(loc_url, sitemaps_used, urls)
    elif loc_url not in urls:
      urls.append(loc_url)

  return urls


In [None]:
# Collect the page URLs listed in the site's sitemaps. The crawl starts from the
# homepage, whose <link rel="sitemap"> tag is used to locate the first sitemap.
all_links = crawl_sitemaps("https://www.typeface.ai/")

In [None]:
# Name of the single function the model is forced to call when extracting an article.
function_name = "get_css_selector_of_full_article"

# Function-calling schema: the model answers with CSS selectors for the title,
# the body and (optionally, it may be null) the thumbnail of the article.
tools = [
    {
        "type": "function",
        "function": {
            "name": function_name,
            "description": "Extract the CSS selectors for your blogpost",
            "parameters": {
                "type": "object",
                "properties": {
                    "css_selector_of_article_title": {
                        "type": "string",
                        "description": "The css selector for bs4 for to extract the article title",
                    },
                    "css_selector_of_full_article_body": {
                        "type": "string",
                        "description": "The css selector of the HTML element that contains the entire article body"
                    },
                    "css_selector_of_article_thumbnail": {
                        "anyOf": [
                            {
                                "type": "string",
                                "description": "The css selector for the article thumbnail used. This is an image inside the main container that has the article in it or just outside it. NOT A LOGO. It's usually a big image with a specific dimension like 16:9 or 4:3 or 1:1 and is usually quite big like 1024px."
                            },
                            {
                                "type": "null",
                                "description": "In case there is not article thumbnail to be found."
                            }
                        ]
                    }
                },
                # Every entry must name a key declared under "properties"; the
                # thumbnail key previously did not match its property name.
                "required": ["css_selector_of_article_title", "css_selector_of_full_article_body", "css_selector_of_article_thumbnail"],
            },
        }
    },
]

In [None]:
def classify_article_page(html):
  """Ask the chat model whether ``html`` contains a full blog article.

  Args:
    html: HTML markup of the page, as a string, to classify.

  Returns:
    True if the model's reply contains "YES" (case-insensitive), else False.
    A reply without any text content counts as False.
  """
  response = openai_client.chat.completions.create(
      model="gpt-4-1106-preview",
      temperature=0.0,
      messages=[
          {"role": "system", "content": "You exist to classify HTML pages as pages that contain full blog articles or do not contain it\n\n- You may only answer with \"YES\" or \"NO\""},
          {"role": "user", "content": f"{html}\n\nDoes this HTML page contain a full article or not?"}
      ],
      max_tokens=50
  )
  # The message content is optional in the API response; treat a missing reply
  # as "NO" instead of failing on None.upper().
  answer = response.choices[0].message.content or ""
  return "YES" in answer.upper()

def _first_image_src(element):
  """Return the ``src`` of ``element`` if it is an <img>, otherwise of its first nested <img>; None when there is none."""
  image = element if element.name == "img" else element.find("img")
  return image.get("src") if image is not None else None


def get_article(url):
  """Download ``url`` and extract its article using model-chosen CSS selectors.

  The page body is sent to the chat model, which is forced to call the
  function described by ``tools`` and thereby returns CSS selectors for the
  title, the body and (optionally) the thumbnail. Those selectors are then
  applied to the parsed page.

  Args:
    url: Address of the page to extract.

  Returns:
    A dict with the keys:
      "url": the ``url`` argument.
      "thumbnail_url": ``src`` of the thumbnail image, or None when no
        selector was returned, it matched nothing, or no <img> was found.
      "body_html": the matched body elements converted to Markdown and
        concatenated (despite the key name, this is Markdown, not HTML).
      "title": text of the first element matched by the title selector.

  Raises:
    ValueError: If the title selector matches no element.
  """
  # Follow redirects like the rest of the notebook does, and hand BeautifulSoup
  # the raw bytes so it detects the encoding instead of assuming UTF-8.
  article_page = httpx.get(url, follow_redirects=True)
  parsed = BeautifulSoup(article_page.content, "html.parser")
  body = parsed.find("body")
  soup = body if body is not None else parsed  # tolerate documents without <body>

  response = openai_client.chat.completions.create(
    model="gpt-4-1106-preview",
    tools=tools,
    tool_choice={"type": "function", "function": {"name": function_name}},
    temperature=0.0,
    messages=[
        {"role": "system", "content": "You are a perfect blog scraper"},
        {"role": "user", "content": f"{soup.decode_contents()}"}
    ],
    max_tokens=800
  )
  css_selectors = json.loads(response.choices[0].message.tool_calls[0].function.arguments)
  title_selector = css_selectors["css_selector_of_article_title"]
  body_selector = css_selectors["css_selector_of_full_article_body"]
  # The schema allows the thumbnail selector to be null, and the model may omit it.
  thumbnail_selector = css_selectors.get("css_selector_of_article_thumbnail")

  title_matches = soup.select(title_selector)
  if not title_matches:
    raise ValueError(f"Title selector {title_selector!r} matched nothing on {url}")

  thumbnail_url = None
  if thumbnail_selector:
    thumbnail_matches = soup.select(thumbnail_selector)
    if thumbnail_matches:
      thumbnail_url = _first_image_src(thumbnail_matches[0])

  body_markdown = "".join(
    markdownify.markdownify(match.decode_contents() or "", heading_style="ATX")
    for match in soup.select(body_selector)
  )
  return {
    "url": url,
    "thumbnail_url": thumbnail_url,
    "body_html": body_markdown,
    "title": title_matches[0].text
  }

In [None]:
# Stop once this many articles have been collected; each one costs GPT-4 calls.
MAX_ARTICLES = 50

start = timer()
articles = []
for url in all_links:
  try:
    r = httpx.get(url, follow_redirects=True)
  except Exception as e:
    print(e)
    continue

  # httpx header lookup is already case-insensitive, so the keys need no
  # manual lower-casing. Pages without a content type are still attempted.
  content_type = r.headers.get("content-type")
  if content_type is not None and "text/html" not in content_type.lower():
    continue

  print(url)
  print(r.status_code)
  if not r.is_success:
    # Error pages cannot contain an article; do not spend model calls on them.
    continue

  soup = BeautifulSoup(r.content, "html.parser").find("body")
  if soup is None:
    # No <body> to classify (e.g. an empty or malformed document).
    continue

  # Scripts and styles only inflate the prompt; drop them before classification.
  for tag in soup.find_all(["script", "style"]):
    tag.decompose()

  try:
    if classify_article_page(soup.decode_contents()):
      articles.append(get_article(url))
      if len(articles) >= MAX_ARTICLES:
        break
  except Exception as e:
    # Best effort: report the failing page and carry on with the next URL.
    print(f"{url}: {e}")

end = timer()
print(timedelta(seconds=end-start))

https://www.typeface.ai/blog
200
<body><div id="___gatsby"><div id="gatsby-focus-wrapper" style="outline:none" tabindex="-1"><script type="text/javascript">
    (function() {
      document.body.dataset.cloak = 'true';
    })();
  </script><header class="sc-p93b2l-0 eGiwoN" data-md-cloak="true" id="header"><div class="sc-p93b2l-24 hzdkYK" data-headlessui-state=""><div class="sc-p93b2l-1 igWHFx"><div class="sc-1mgs8b2-0 dToKCk"><nav aria-label="Global" class="sc-p93b2l-3 hlqOIr"><div class="sc-p93b2l-5 cbYrpZ"><a data-partially-current="true" href="/"><span class="sr-only">Typeface logo</span><svg class="svg-header-logo-130" data-testid="svg-icon" height="34" viewbox="0 0 130 34" width="130"><path d="M46.0676 11.8347v12.5272h2.9415V11.8347h4.6077V9.44861H41.4805v2.38609h4.5871Zm7.3506 16.1269h1.1931c2.7564 0 3.3323-.4936 4.6282-3.9288l4.0523-10.7375H60.412l-2.5712 7.9811-2.5096-7.9811h-3.1061l4.2374 11.0254-.1234.3086c-.4731 1.1725-.905 1.152-1.8512 1.152h-1.0697v2.1803Zm11.1507 0h2.838

KeyboardInterrupt: 

In [None]:
# Sanity check: how many articles the scraping loop collected.
print(len(articles))

0


In [None]:
from IPython.display import display, Markdown, Latex
import random

# Render one randomly chosen article as a spot check of the extraction quality.
if articles:
  a = random.choice(articles)
  print(a["url"])
  print(a["thumbnail_url"])
  # Only emit the image line when a thumbnail exists, otherwise the preview
  # would contain a broken "![Thumbnail](None)" image.
  thumbnail_md = f'![Thumbnail]({a["thumbnail_url"]})\n' if a["thumbnail_url"] else ""
  display(Markdown(f'# {a["title"]}\n\n{thumbnail_md}{a["body_html"]}'))
else:
  # random.randint(0, -1) raises ValueError on an empty list; say so plainly instead.
  print("No articles were scraped, so there is nothing to preview.")

ValueError: empty range for randrange() (0, 0, 0)