# ✅ Web Scraping Exercises with BeautifulSoup
Optimized for Google Colab
Last Updated: July 2025

## 🌟 Exercise 1 : Parsing HTML with BeautifulSoup

In [None]:

from bs4 import BeautifulSoup

# Static sample page used as parser input for this exercise.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
</head>
<body>
    <p>Your one-stop destination for the latest sports news and videos.</p>
    <a href="#football">Football</a>
    <a href="#basketball">Basketball</a>
    <a href="#tennis">Tennis</a>
</body>
</html>
"""

# Build the parse tree with Python's built-in HTML parser backend.
soup = BeautifulSoup(html, "html.parser")

# Collect the three pieces of information first, then report them.
page_title = soup.title.string
paragraph_texts = [paragraph.text for paragraph in soup.find_all("p")]
# href=True restricts the search to anchors that actually carry an href.
link_targets = [anchor["href"] for anchor in soup.find_all("a", href=True)]

print("Page Title:", page_title)
print("Paragraphs:", paragraph_texts)
print("Links:", link_targets)


## 🌟 Exercise 2 : Scraping robots.txt from Wikipedia

In [None]:

from urllib.request import urlopen

# Download Wikipedia's robots.txt. The context manager closes the HTTP
# response even if reading or decoding fails, instead of leaving the
# connection open until garbage collection.
with urlopen("https://en.wikipedia.org/robots.txt") as response:
    robots = response.read().decode("utf-8")
print(robots[:500])  # print only the first 500 characters for brevity


## 🌟 Exercise 3 : Extracting Headers from Wikipedia’s Main Page

In [None]:

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

def get_soup(url):
    """Download *url* and return its content parsed as a BeautifulSoup tree.

    The request is sent with a "Mozilla/5.0" User-Agent header (presumably
    because some sites reject urllib's default agent -- confirm per site).

    Args:
        url: Address of the page to fetch.

    Returns:
        A BeautifulSoup object built with the "html.parser" backend.

    Raises:
        Any exception raised by ``urlopen`` (for example on HTTP or
        connection errors) propagates to the caller unchanged.
    """
    req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
    # BeautifulSoup reads the whole response while building the tree, so the
    # connection can be released as soon as the constructor returns.
    with urlopen(req) as response:
        return BeautifulSoup(response, "html.parser")

# Fetch the main page and gather the stripped text of every heading (h1-h6).
soup = get_soup("https://en.wikipedia.org/wiki/Main_Page")
heading_names = [f"h{level}" for level in range(1, 7)]
headers = []
for heading in soup.find_all(heading_names):
    headers.append(heading.text.strip())
print(headers)


## 🌟 Exercise 4 : Checking for Page Title

In [None]:

soup = get_soup("https://en.wikipedia.org/wiki/Main_Page")

# .string is None when <title> is empty or contains nested tags, so it must
# be checked before calling .strip() on it.
title_tag = soup.title
title_string = title_tag.string if title_tag is not None else None
has_title = title_string is not None and bool(title_string.strip())

print("Page has a title:", title_string if has_title else "No title found")


## 🌟 Exercise 5 : Analyzing US-CERT Security Alerts

In [None]:

from datetime import datetime

# Count advisory links on the page whose text mentions the current year and
# contains one of the advisory markers "CSA" / "ICSA".
soup = get_soup("https://www.cisa.gov/news-events/cybersecurity-advisories")
current_year = str(datetime.now().year)

alerts = []
for anchor in soup.find_all("a"):
    label = anchor.text
    if current_year not in label:
        continue
    if "CSA" in label or "ICSA" in label:
        alerts.append(label.strip())

print(f"Security alerts in {current_year}: {len(alerts)}")


## 🌟 Exercise 6 : Scraping Movie Details from IMDB (Optimized)

In [None]:

import random

# Load the IMDB top chart page and select the cells the title link and the
# year span are read from.
soup = get_soup("https://www.imdb.com/chart/top/")
movies = soup.select("td.titleColumn")

if not movies:
    # Nothing matched the selector (e.g. the page layout changed); report it
    # rather than letting random.sample raise ValueError below.
    print("No movies found on the chart page")

# random.sample raises ValueError when asked for more items than exist, so
# never request more than the number of movies actually found.
sample_size = min(10, len(movies))

for cell in random.sample(movies, sample_size):
    # Take the link from the same cell as the year so the two can never get
    # out of step, which separate page-wide selections could not guarantee.
    link = cell.a
    if link is None or not link.get("href"):
        continue
    name = link.text
    year_tag = cell.span
    year = year_tag.text.strip("()") if year_tag is not None else "n/a"

    # Follow the (site-relative) link to the movie's own page for its plot.
    movie_url = "https://www.imdb.com" + link["href"]
    movie_soup = get_soup(movie_url)
    # Several selectors are tried because the plot element is not marked up
    # the same way on every page variant.
    summary_tag = movie_soup.select_one('[data-testid="plot-xl"], [data-testid="plot-l"], .sc-16ede01-2')
    summary = summary_tag.text.strip() if summary_tag else "No summary"
    print(f"{name} ({year}): {summary}")
