In [1]:
from bs4 import BeautifulSoup
import re, csv, glob, os
from pathlib import Path

In [2]:
# Glob pattern matching the saved category-page HTML files that main() parses.
PAGES_GLOB = "email_marketing_companies/page_*.html"
# Path of the CSV file that main() writes the parsed product rows to.
OUT_CSV = "email_marketing_products.csv"

In [3]:
def clean(s: str) -> str:
    """Normalize whitespace in *s*.

    Every run of whitespace is collapsed to a single space and the result is
    stripped at both ends. Falsy input (``None`` or ``""``) yields ``""``.
    """
    text = s if s else ""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

In [4]:
def ensure_out_dir(path: str):
    """Create the directory that will contain *path* if it does not exist.

    Missing intermediate directories are created as well; an already
    existing directory is left untouched. The file itself is not created.
    """
    target_dir = Path(path).parent
    target_dir.mkdir(parents=True, exist_ok=True)

In [5]:
def extract_entry_level_price(card):
    """Return the entry-level price text shown on a product card.

    The price is read from the first element inside *card* matching
    ``.product-card__price-pill``; its text is whitespace-normalized with
    ``clean``. Returns ``None`` when the card has no such element.
    """
    price_pill = card.select_one(".product-card__price-pill")
    if not price_pill:
        return None
    return clean(price_pill.get_text(" ", strip=True))

In [6]:
def find_section_list(card, title: str):
    """
    Extract the list shown under an overview section inside the card,
    e.g. "Users" / "Industries" / "Market Segment".

    Args:
        card: Parsed element for one product card.
        title: Section heading text; must equal the heading's cleaned text
            exactly.

    Returns:
        list[str]: Cleaned text of every <li> in the section's <ul>, with
        empty items and "No information available" placeholders removed.
        An empty list is returned when the heading or its list is missing.
    """
    # Find title element exactly matching e.g. "Users"
    title_el = card.find(
        lambda t: t and getattr(t, "name", None) in ("div", "span", "h3", "h4", "h5")
        and clean(t.get_text(" ", strip=True)) == title
    )
    if not title_el:
        return []

    def is_inside(node, ancestor):
        # Identity comparison: tags with equal markup are still distinct nodes.
        return any(p is ancestor for p in node.parents)

    parent = title_el.parent
    following = title_el.find_next("ul")

    ul = None
    if following is not None and parent is not None and is_inside(following, parent):
        # Prefer the list that follows the heading within the same parent, so
        # sibling sections sharing one parent do not all resolve to the
        # parent's first <ul>.
        ul = following
    elif parent is not None:
        # Otherwise accept any list in the heading's parent.
        ul = parent.find("ul")

    if ul is None and following is not None and is_inside(following, card):
        # Last resort: the next list in document order, but only while it is
        # still part of this card; never borrow a list from elsewhere.
        ul = following

    if ul is None:
        return []

    items = [clean(li.get_text(" ", strip=True)) for li in ul.find_all("li")]
    return [i for i in items if i and "No information available" not in i]

In [7]:
def extract_rating_and_reviews(card_text: str):
    """
    Pull the star rating and the review count out of a card's text.

    rating: "4.3 out of 5" -> 4.3 (whole numbers such as "5 out of 5" and
            longer decimals such as "4.25 out of 5" are accepted too)
    reviews: "(12,859)" -> 12859 (first parenthesized number in the text)

    Args:
        card_text: Flattened text of a product card; ``None`` or ``""`` is
            treated as "nothing found".

    Returns:
        tuple: ``(rating, reviews)`` where rating is a float or ``None`` and
        reviews is an int or ``None``.
    """
    text = card_text or ""

    rating = None
    # (?!\d) keeps "out of 50" from being read as a rating on a 5-point scale.
    m = re.search(r"(\d(?:\.\d+)?)\s+out of\s+5(?!\d)", text)
    if m:
        rating = float(m.group(1))

    reviews = None
    # Require a leading digit so "(,)" cannot match and crash int("").
    m = re.search(r"\((\d[\d,]*)\)", text)
    if m:
        reviews = int(m.group(1).replace(",", ""))

    return rating, reviews

In [8]:
def extract_pros_cons(card):
    """
    Collect the Pros and Cons theme rows of a product card.

    Returns:
      pros_themes, pros_counts, cons_themes, cons_counts
      (four lists of strings; themes and counts are index-aligned)
    Notes:
      This uses a practical heuristic: for each theme row, look at the text of
      the row and of its ancestors (up to four levels, never above the card)
      until a "Pros" or "Cons" label is seen. Labels are matched as whole
      words so themes such as "Consistency" are not mistaken for a label.
      When the same ancestor holds both labels, the label that most closely
      precedes the theme's first occurrence in that text wins. Rows for which
      no label is found are skipped.
    """
    label_re = re.compile(r"\b(Pros|Cons)\b")

    def classify(context_text, theme):
        # Decide which section a theme belongs to from its surrounding text.
        labels = [(m.start(), m.group(1)) for m in label_re.finditer(context_text)]
        if not labels:
            return None
        if len({kind for _, kind in labels}) == 1:
            return labels[0][1]
        # Both labels present: take the nearest label before the theme.
        theme_pos = context_text.find(theme)
        preceding = [kind for pos, kind in labels if pos < theme_pos]
        # With no label before the theme, keep the historical Pros-first choice.
        return preceding[-1] if preceding else "Pros"

    pros_themes, pros_counts = [], []
    cons_themes, cons_counts = [], []

    for block in card.select(".text-small-normal"):
        theme_el = block.select_one(".ellipsis")
        count_el = block.select_one(".text-right")
        if not theme_el or not count_el:
            continue

        theme = clean(theme_el.get_text(" ", strip=True))
        count = clean(count_el.get_text(" ", strip=True))

        # Reset per row so a previous row's result can never leak into this one.
        section = None

        # Walk up a bit to capture context. Sometimes Pros/Cons label is higher in DOM.
        context = block
        for _ in range(4):
            if context is None:
                break
            section = classify(clean(context.get_text(" ", strip=True)), theme)
            # Stop at the card boundary: labels outside it belong to other content.
            if section is not None or context is card:
                break
            context = context.parent

        if section == "Pros":
            pros_themes.append(theme)
            pros_counts.append(count)
        elif section == "Cons":
            cons_themes.append(theme)
            cons_counts.append(count)

    return pros_themes, pros_counts, cons_themes, cons_counts

In [9]:
def parse_category_page(path: str):
    """Parse one saved category page into a list of product rows.

    Every ``.product-card`` element that links to a product page produces one
    dict with the keys name, product_url, rating, reviews, entry_level_price,
    users, industries, market_segment, pros_themes, pros_counts, cons_themes,
    cons_counts and source_file. Multi-valued fields are joined with "; ";
    empty ones are stored as ``None``. Cards without a product link are
    skipped.
    """
    with open(path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, "html.parser")

    def joined(values):
        # Empty lists become None so the CSV cell stays blank.
        return "; ".join(values) if values else None

    link_selector = 'a[href^="/products/"], a[href^="https://www.g2.com/products/"]'
    source_name = os.path.basename(path)

    records = []
    for card in soup.select(".product-card"):
        link = card.select_one(link_selector)
        if not link:
            continue

        href = link.get("href") or ""
        if not href:
            continue
        # Site-relative links are made absolute.
        product_url = "https://www.g2.com" + href if href.startswith("/") else href

        name_el = card.select_one('[itemprop="name"]') or card.select_one(".product-card__product-name")
        if name_el:
            name = clean(name_el.get_text(" ", strip=True))
        else:
            name = None

        rating, reviews = extract_rating_and_reviews(clean(card.get_text(" ", strip=True)))
        entry_price = extract_entry_level_price(card)
        users = find_section_list(card, "Users")
        industries = find_section_list(card, "Industries")
        market = find_section_list(card, "Market Segment")
        pros_themes, pros_counts, cons_themes, cons_counts = extract_pros_cons(card)

        record = {
            "name": name,
            "product_url": product_url,
            "rating": rating,
            "reviews": reviews,
            "entry_level_price": entry_price,
            "users": joined(users),
            "industries": joined(industries),
            "market_segment": joined(market),
            "pros_themes": joined(pros_themes),
            "pros_counts": joined(pros_counts),
            "cons_themes": joined(cons_themes),
            "cons_counts": joined(cons_counts),
            "source_file": source_name,
        }
        records.append(record)

    return records

In [10]:
def main():
    """Parse every page matching PAGES_GLOB and write the products to OUT_CSV.

    Rows are de-duplicated by product_url (the last occurrence wins, in the
    position of the first). Exits via SystemExit when no input files are
    found or when nothing could be parsed.
    """
    ensure_out_dir(OUT_CSV)

    page_paths = sorted(glob.glob(PAGES_GLOB))
    print("Found HTML pages:", len(page_paths))
    if len(page_paths) == 0:
        raise SystemExit(f"No HTML files found for pattern: {PAGES_GLOB}")

    collected = []
    for page_path in page_paths:
        parsed = parse_category_page(page_path)
        print(f"{os.path.basename(page_path)} → {len(parsed)} products")
        collected += parsed

    # Deduplicate by product_url (best key); rows without a URL are dropped.
    by_url = {row["product_url"]: row for row in collected if row.get("product_url")}

    rows = list(by_url.values())
    print("Total unique products:", len(rows))

    if len(rows) == 0:
        raise SystemExit("Parsed 0 rows. Check that your HTML files contain .product-card elements.")

    # Column order follows the key order of the first row.
    columns = list(rows[0])
    with open(OUT_CSV, "w", newline="", encoding="utf-8") as out_file:
        writer = csv.DictWriter(out_file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(rows)

    print("✅ Saved:", OUT_CSV)

# Run the scrape only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Found HTML pages: 15
page_01.html → 15 products
page_02.html → 15 products
page_03.html → 15 products
page_04.html → 15 products
page_05.html → 15 products
page_06.html → 15 products
page_07.html → 15 products
page_08.html → 15 products
page_09.html → 15 products
page_10.html → 15 products
page_11.html → 15 products
page_12.html → 15 products
page_13.html → 15 products
page_14.html → 15 products
page_15.html → 15 products
Total unique products: 225
✅ Saved: email_marketing_products.csv
