In [1]:
# Core scraping stack
!pip -q install selenium beautifulsoup4 lxml pandas chromedriver-autoinstaller

# Headless Chromium for Colab
!apt -q update
!apt -q install -y chromium-browser


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hGet:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,923 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:10 https://ppa.launchpadcontent.net/graphi

In [2]:
import os
import time
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Ensure chromedriver is installed matching the Chromium version
chromedriver_autoinstaller.install()

# Configure headless Chrome (Chromium in Colab)
chrome_options = Options()
chrome_options.add_argument("--headless=new")     # headless mode
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--window-size=1920,1080")
# Point to Chromium binary installed via apt
chrome_options.binary_location = "/usr/bin/chromium-browser"

driver = webdriver.Chrome(options=chrome_options)
driver


<selenium.webdriver.chrome.webdriver.WebDriver (session="24d6d120104ed00eec951f2e9704e93c")>

In [3]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Choose a dynamic product page (hosting plans)
URL = "https://www.inmotionhosting.com/web-hosting/"

driver.get(URL)

# Wait for any plan-like cards to be present.
# We use several alternative locators because sites update classes frequently.
locators = [
    (By.CSS_SELECTOR, "section, div"),  # broad container (we'll filter later in HTML)
]

# Simple wait to ensure JS-rendered content appears
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located(locators[0])
)

# Give a brief extra pause for lazy-loaded bits
time.sleep(2)

page_html = driver.page_source
len(page_html)


988728

In [4]:
import re
from bs4 import BeautifulSoup
from bs4.element import Tag

soup = BeautifulSoup(page_html, "lxml")

# Heuristics to identify "plan cards":
# - container (article/div/section) that contains:
#   * a heading (h2/h3) as plan name
#   * at least one price looking like $X or $X.XX or $X/mo
#   * a list of features (ul/li)
price_pattern = re.compile(r"\$\s*\d+(?:\.\d{1,2})?(?:\s*/\s*mo)?", re.I)

def find_card_candidates(soup):
    candidates = []
    for container in soup.select("article, section, div"):
        # Require a title-like element
        title_el = container.find(["h2", "h3"])
        if not title_el:
            continue
        title_txt = title_el.get_text(" ", strip=True)
        if not title_txt or len(title_txt) < 3:
            continue

        # Require a price somewhere inside
        text = container.get_text(" ", strip=True)
        if not price_pattern.search(text):
            continue

        # Prefer containers that also have features list
        features_list = container.find("ul")
        if not features_list:
            continue

        # Discard overly generic wrappers by requiring a "buy/select" CTA or unique title text
        if not (container.find(string=re.compile(r"buy|select|order|get\s+started|add\s+to\s+cart", re.I)) or len(features_list.find_all("li")) >= 3):
            continue

        candidates.append(container)
    return candidates

cards = find_card_candidates(soup)
print("Cards found:", len(cards))


Cards found: 16


In [5]:
def extract_plan_info(card: Tag):
    # Plan name: try h2 > h3 > strong order
    title_el = card.find(["h2", "h3"])
    title = title_el.get_text(" ", strip=True) if title_el else "N/A"

    # Price: first $.. match inside the card
    text = card.get_text(" ", strip=True)
    m = price_pattern.search(text)
    price = m.group(0) if m else "N/A"

    # Features: list items (limit to first 8 for cleanliness)
    features = []
    ul = card.find("ul")
    if ul:
        for li in ul.find_all("li"):
            li_text = li.get_text(" ", strip=True)
            if li_text:
                features.append(li_text)
    # Deduplicate and keep order
    seen = set()
    deduped = []
    for f in features:
        if f not in seen:
            seen.add(f)
            deduped.append(f)
    # Trim very long items
    deduped = [re.sub(r"\s+", " ", f).strip() for f in deduped][:8]

    return {
        "plan_name": title,
        "price": price,
        "features": deduped
    }

plans = [extract_plan_info(c) for c in cards]

print("Extracted plans:", len(plans))
for i, p in enumerate(plans, 1):
    print(f"{i}. {p['plan_name']} | {p['price']} | {len(p['features'])} features")


Extracted plans: 16
1. High-Performance Hosting Services for Every Need | $4.99 /mo | 8 features
2. High-Performance Hosting Services for Every Need | $4.99 /mo | 8 features
3. High-Performance Hosting Services for Every Need | $4.99 /mo | 8 features
4. High-Performance Hosting Services for Every Need | $4.99 /mo | 8 features
5. High-Performance Hosting Services for Every Need | $4.99 /mo | 8 features
6. High-Performance Hosting Services for Every Need | $4.99 /mo | 8 features
7. Popular Web Hosting Plans | $4.99 /mo | 8 features
8. Popular Web Hosting Plans | $4.99 /mo | 8 features
9. Power | $4.99 /mo | 8 features
10. Power | $4.99 /mo | 8 features
11. Power | $4.99 /mo | 8 features
12. Power | $4.99 /mo | 8 features
13. Power | $4.99 /mo | 8 features
14. Power | $4.99 /mo | 8 features
15. VPS 4 vCPU | $9.99 /mo | 8 features
16. Aspire | $35.00 /mo | 8 features


In [6]:
import pandas as pd

# Flatten features to a short string for the table (keep also the list)
rows = []
for p in plans:
    rows.append({
        "plan_name": p["plan_name"],
        "price": p["price"],
        "features_list": p["features"],
        "features": " • ".join(p["features"])
    })

df = pd.DataFrame(rows, columns=["plan_name", "price", "features", "features_list"])
df


Unnamed: 0,plan_name,price,features,features_list
0,High-Performance Hosting Services for Every Need,$4.99 /mo,English Español Nederlands Deutsch Italiano Fr...,[English Español Nederlands Deutsch Italiano F...
1,High-Performance Hosting Services for Every Need,$4.99 /mo,10 Websites • 200GB NVMe Storage • Unmetered B...,"[10 Websites, 200GB NVMe Storage, Unmetered Ba..."
2,High-Performance Hosting Services for Every Need,$4.99 /mo,10 Websites • 200GB NVMe Storage • Unmetered B...,"[10 Websites, 200GB NVMe Storage, Unmetered Ba..."
3,High-Performance Hosting Services for Every Need,$4.99 /mo,10 Websites • 200GB NVMe Storage • Unmetered B...,"[10 Websites, 200GB NVMe Storage, Unmetered Ba..."
4,High-Performance Hosting Services for Every Need,$4.99 /mo,10 Websites • 200GB NVMe Storage • Unmetered B...,"[10 Websites, 200GB NVMe Storage, Unmetered Ba..."
5,High-Performance Hosting Services for Every Need,$4.99 /mo,10 Websites • 200GB NVMe Storage • Unmetered B...,"[10 Websites, 200GB NVMe Storage, Unmetered Ba..."
6,Popular Web Hosting Plans,$4.99 /mo,10 Websites • 200GB NVMe Storage • Unmetered B...,"[10 Websites, 200GB NVMe Storage, Unmetered Ba..."
7,Popular Web Hosting Plans,$4.99 /mo,10 Websites • 200GB NVMe Storage • Unmetered B...,"[10 Websites, 200GB NVMe Storage, Unmetered Ba..."
8,Power,$4.99 /mo,10 Websites • 200GB NVMe Storage • Unmetered B...,"[10 Websites, 200GB NVMe Storage, Unmetered Ba..."
9,Power,$4.99 /mo,10 Websites • 200GB NVMe Storage • Unmetered B...,"[10 Websites, 200GB NVMe Storage, Unmetered Ba..."


In [7]:
df.to_csv("inmotion_plans.csv", index=False, encoding="utf-8-sig")
df.to_json("inmotion_plans.json", orient="records", force_ascii=False)
print("Saved: inmotion_plans.csv, inmotion_plans.json")


Saved: inmotion_plans.csv, inmotion_plans.json


In [8]:
driver.quit()
print("Selenium WebDriver closed.")


Selenium WebDriver closed.
