<a href="https://colab.research.google.com/github/mr-adonis-jimenez/web-scraper-project/blob/main/web_scraper_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip -q install requests beautifulsoup4 lxml pandas

In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, UTC

def fetch(url: str, timeout: float = 30) -> str:
    """Download ``url`` with a browser-like User-Agent and return the body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait on the server before giving up (default 30).

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures and timeouts.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # When the server declares no charset, requests assumes ISO-8859-1 for
    # text/* responses, which garbles UTF-8 pages. In that case fall back to
    # the encoding detected from the body itself.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding or response.encoding
    return response.text

def parse_example(html: str, url: str) -> pd.DataFrame:
    """Summarise one page as a single-row DataFrame.

    The row holds the page ``url``, the stripped text of its ``<title>``
    element (empty string when the page has none) and ``scraped_at``, the
    current UTC time as an ISO-8601 string.
    """
    document = BeautifulSoup(html, "lxml")
    if document.title:
        page_title = document.title.get_text(strip=True)
    else:
        page_title = ""
    # Replace this with real selectors for your target site
    record = {
        "url": url,
        "title": page_title,
        "scraped_at": datetime.now(UTC).isoformat(),
    }
    return pd.DataFrame([record])

# Smoke test: download one page and parse it into a single-row DataFrame.
url = "https://example.com"
html = fetch(url)
df = parse_example(html, url)
# Bare last expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,url,title,scraped_at
0,https://example.com,Example Domain,2025-12-25T11:32:54.849704+00:00


In [21]:
# Output file names; being relative, they resolve against the current working directory.
csv_path = "results.csv"
json_path = "results.json"

# index=False leaves the DataFrame's row index out of the CSV.
df.to_csv(csv_path, index=False)
# orient="records" writes a JSON array with one object per row.
df.to_json(json_path, orient="records", indent=2)

# Echo both paths as the cell output.
csv_path, json_path


('results.csv', 'results.json')

In [22]:
# Pages to scrape in one batch.
urls = [
    "https://example.com",
    "https://quotes.toscrape.com/",
    "https://books.toscrape.com/",
    "https://scrapethissite.com/pages"
]

# One single-row frame per URL. The comprehension keeps its loop variable
# local, so this cell no longer overwrites the notebook-level `html` or leaves
# throwaway names such as `tmp` behind in the kernel namespace.
all_rows = [parse_example(fetch(page_url), page_url) for page_url in urls]

# Stack the per-page frames and renumber the index 0..n-1.
df = pd.concat(all_rows, ignore_index=True)
df

Unnamed: 0,url,title,scraped_at
0,https://example.com,Example Domain,2025-12-25T11:32:54.924705+00:00
1,https://quotes.toscrape.com/,Quotes to Scrape,2025-12-25T11:32:55.610245+00:00
2,https://books.toscrape.com/,All products | Books to Scrape - Sandbox,2025-12-25T11:32:56.527650+00:00
3,https://scrapethissite.com/pages,Learn Web Scraping | Scrape This Site | A publ...,2025-12-25T11:32:57.540611+00:00


In [23]:
!pip -q install playwright
!playwright install chromium
!apt-get install -y libatk1.0-0 libatk-bridge2.0-0 libgtk-3-0

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libatk-bridge2.0-0 is already the newest version (2.38.0-3).
libatk1.0-0 is already the newest version (2.36.0-3build1).
libgtk-3-0 is already the newest version (3.24.33-1ubuntu2.2).
0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.


In [24]:
import asyncio
from playwright.async_api import async_playwright

async def get_rendered_html_async(
    url: str, wait_until: str = "networkidle", timeout_ms: float = 60000
) -> str:
    """Load ``url`` in headless Chromium and return the rendered page HTML.

    Args:
        url: Page to open.
        wait_until: Navigation event passed to ``page.goto`` (default
            ``"networkidle"``).
        timeout_ms: Navigation timeout in milliseconds passed to ``page.goto``
            (default 60000).

    Returns:
        The value of ``page.content()`` once navigation has completed.

    Raises:
        Whatever ``page.goto`` / ``page.content`` raise (for example on a
        navigation timeout); the browser is closed before the error propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url, wait_until=wait_until, timeout=timeout_ms)
            return await page.content()
        finally:
            # Close the browser even when navigation fails or times out.
            await browser.close()

url = "https://example.com"
# Top-level `await` relies on the notebook kernel supporting it; in a plain
# script this call would need to go through asyncio.run(...).
html = await get_rendered_html_async(url)
# Preview only the first 500 characters of the rendered markup.
print(html[:500])

<!DOCTYPE html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style></head><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.</p><p><a href="https://iana.org/domains/example">Learn more<


In [25]:
%%writefile scraper.py
import requests
from bs4 import BeautifulSoup

def fetch(url: str, timeout: float = 30) -> str:
    """Download ``url`` and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait on the server before giving up (default 30).

    Returns:
        The response body decoded to ``str``.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures and timeouts.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # When the server declares no charset, requests assumes ISO-8859-1 for
    # text/* responses, which garbles UTF-8 pages. In that case fall back to
    # the encoding detected from the body itself.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding or response.encoding
    return response.text

def parse(html: str) -> str:
    """Return the stripped text of the document's <title>, or "" when it has none."""
    title_tag = BeautifulSoup(html, "lxml").title
    if not title_tag:
        return ""
    return title_tag.get_text(strip=True)


Overwriting scraper.py


In [26]:
# Note: this import rebinds the notebook-level name `fetch` to the version
# written to scraper.py, replacing the `fetch` defined in the earlier cell.
from scraper import fetch, parse
print(parse(fetch("https://example.com")))


Example Domain
