In [1]:
# Install dependencies (if needed in Colab)
!pip -q install requests beautifulsoup4 lxml pandas

In [2]:
import requests
from pathlib import Path

URL = "https://github.com/topics"

# Use a desktop-like User-Agent to reduce chances of basic blocking
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
}

# 1) Fetch the HTML content
resp = requests.get(URL, headers=headers, timeout=30)

# 2) Print status code (should be 200)
print("Status code:", resp.status_code)

# 3) Ensure correct text encoding before saving
if not resp.encoding:
    resp.encoding = resp.apparent_encoding  # fallback if server doesn't specify encoding

html_text = resp.text

# 4) Print first 100 characters to verify
print("\nFirst 100 chars of HTML:")
print(html_text[:100])

# 5) Save to 'webpage.html'
out_path = Path("webpage.html")
out_path.write_text(html_text, encoding=resp.encoding or "utf-8")
print(f"\nSaved to: {out_path.resolve()}")


Status code: 200

First 100 chars of HTML:


<!DOCTYPE html>
<html
  lang="en"
  
  data-color-mode="auto" data-light-theme="light" data-dark-t

Saved to: /content/webpage.html


In [3]:
from bs4 import BeautifulSoup
import re

# Read saved HTML
with open("webpage.html", "r", encoding=resp.encoding or "utf-8") as f:
    saved_html = f.read()

# Create BeautifulSoup object
soup = BeautifulSoup(saved_html, "lxml")  # or "html.parser"

# ---- Selectors (robust with fallbacks) ----
# Topic titles on the page typically appear as prominent links/text within cards.
# We'll try a few known selectors and fall back if GitHub tweaks classes.

def extract_topic_titles(soup):
    candidates = []

    # Common selector for topic title links
    candidates.extend([a.get_text(strip=True) for a in soup.select("a.Link--primary.f3.lh-condensed")])

    # Fallbacks: try headings or links with similar classes
    if not candidates:
        candidates.extend([el.get_text(strip=True) for el in soup.select("p.f3.lh-condensed a.Link--primary")])

    if not candidates:
        candidates.extend([el.get_text(strip=True) for el in soup.select("a[data-view-component='true'].Link--primary")])

    # Deduplicate while preserving order
    seen = set()
    titles = []
    for t in candidates:
        if t and t not in seen:
            seen.add(t)
            titles.append(t)
    return titles

def extract_topic_descriptions(soup):
    candidates = []

    # Typical selector for description under topic title
    candidates.extend([p.get_text(strip=True) for p in soup.select("p.color-fg-muted.f5")])

    # Another frequent muted description style
    if not candidates:
        candidates.extend([p.get_text(strip=True) for p in soup.select("p.color-fg-muted")])

    # Fallback: any small paragraphs near topic cards
    if not candidates:
        candidates.extend([p.get_text(strip=True) for p in soup.select("article p")])

    # Clean and deduplicate
    clean = []
    seen = set()
    for txt in candidates:
        txt = re.sub(r"\s+", " ", txt).strip()
        if txt and txt not in seen:
            seen.add(txt)
            clean.append(txt)
    return clean

titles = extract_topic_titles(soup)
descriptions = extract_topic_descriptions(soup)

print("Number of titles:", len(titles))
print("Titles preview:", titles[:10])
print("\nNumber of descriptions:", len(descriptions))
print("Descriptions preview:", descriptions[:10])


Number of titles: 0
Titles preview: []

Number of descriptions: 16
Descriptions preview: ['React is an open source JavaScript library used for designing user interfaces.', 'TypeScript is a typed superset of JavaScript that compiles to plain JavaScript.', 'React Native is a JavaScript mobile framework developed by Facebook.', 'An awesome list is a list of awesome things curated by the community.', 'Chrome is a web browser from the tech company Google.', 'Automate your code review with style, quality, security, and test‑coverage checks when you need them.', 'Compilers are software that translate higher-level programming languages to lower-level languages (e.g. machine code).', 'Cascading Style Sheets (CSS) is a language used most often to style and improve upon the appearance of views.', 'A database is a structured set of data held in a computer, usually a server.', 'Front end is the programming and layout that people see and interact with.']


In [4]:
import pandas as pd

# Align lengths: take min length so titles/descriptions match row-by-row
n = min(len(titles), len(descriptions))
data = {
    "title": titles[:n],
    "description": descriptions[:n]
}

df = pd.DataFrame(data)
print("DataFrame shape:", df.shape)
df


DataFrame shape: (0, 2)


Unnamed: 0,title,description
