# Web Scraping Exercises with BeautifulSoup
Last Updated: September 26th, 2024

## 🌟 Exercise 1 : Parsing HTML with BeautifulSoup

In [None]:

from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse a small inline HTML document and report its title, the text of each
# paragraph, and the target of each link.
# Example HTML (replace with urlopen if needed)
html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
</head>
<body>
    <p>Your one-stop destination for the latest sports news and videos.</p>
    <a href="#football">Football</a>
    <a href="#basketball">Basketball</a>
    <a href="#tennis">Tennis</a>
</body>
</html>
"""

soup = BeautifulSoup(html, "html.parser")

# Title: the single string held by the <title> element.
page_title = soup.title.string
print("Page Title:", page_title)

# Paragraphs: collect the text of every <p> element in document order.
paragraphs = []
for paragraph_tag in soup.find_all("p"):
    paragraphs.append(paragraph_tag.text)
print("Paragraphs:", paragraphs)

# Links: href=True restricts the search to anchors that carry an href
# attribute, so the subscript below cannot raise KeyError.
links = []
for anchor_tag in soup.find_all("a", href=True):
    links.append(anchor_tag["href"])
print("Links:", links)


## 🌟 Exercise 2 : Scraping robots.txt from Wikipedia

In [None]:

from urllib.request import Request, urlopen

# Download Wikipedia's robots.txt and print it as text.

# Identify this client explicitly instead of sending the default
# "Python-urllib/x.y" agent; Wikimedia's User-Agent policy asks for a
# descriptive value. Replace with your own project name / contact details.
USER_AGENT = "WebScrapingExercises/1.0 (educational notebook)"

url = "https://en.wikipedia.org/robots.txt"
request = Request(url, headers={"User-Agent": USER_AGENT})

# The context manager closes the HTTP response even if reading or decoding
# fails; the timeout keeps the cell from hanging on a stalled connection.
with urlopen(request, timeout=30) as response:
    robots = response.read().decode("utf-8")

print(robots)


## 🌟 Exercise 3 : Extracting Headers from Wikipedia’s Main Page

In [None]:

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

# Fetch Wikipedia's main page and list the text of every heading (h1-h6).

# Identify this client explicitly instead of sending the default
# "Python-urllib/x.y" agent; Wikimedia's User-Agent policy asks for a
# descriptive value. Replace with your own project name / contact details.
USER_AGENT = "WebScrapingExercises/1.0 (educational notebook)"
HEADING_TAGS = ["h1", "h2", "h3", "h4", "h5", "h6"]

url = "https://en.wikipedia.org/wiki/Main_Page"
request = Request(url, headers={"User-Agent": USER_AGENT})

# Read the body inside the context manager so the HTTP response is closed as
# soon as the page has been downloaded; the timeout avoids hanging forever.
with urlopen(request, timeout=30) as response:
    soup = BeautifulSoup(response.read(), "html.parser")

# find_all accepts a list of tag names and returns matches in document order.
headers = [tag.text.strip() for tag in soup.find_all(HEADING_TAGS)]
print(headers)


## 🌟 Exercise 4 : Checking for Page Title

In [None]:

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

# Report whether Wikipedia's main page declares a non-empty <title>.

# Identify this client explicitly instead of sending the default
# "Python-urllib/x.y" agent; Wikimedia's User-Agent policy asks for a
# descriptive value. Replace with your own project name / contact details.
USER_AGENT = "WebScrapingExercises/1.0 (educational notebook)"

url = "https://en.wikipedia.org/wiki/Main_Page"
request = Request(url, headers={"User-Agent": USER_AGENT})

# Read the body inside the context manager so the HTTP response is closed as
# soon as the page has been downloaded; the timeout avoids hanging forever.
with urlopen(request, timeout=30) as response:
    soup = BeautifulSoup(response.read(), "html.parser")

# soup.title is None when the document has no <title>. The text is read with
# get_text() rather than .string because .string is None for an empty
# <title> (or one containing nested markup), and calling .strip() on None
# would raise AttributeError instead of reporting "No title found".
title_tag = soup.title
title_text = title_tag.get_text() if title_tag is not None else ""

if title_text.strip():
    print("Page has a title:", title_text)
else:
    print("No title found")


## 🌟 Exercise 5 : Analyzing US-CERT (CISA) Security Alerts

In [None]:

from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from datetime import datetime

# Count the links on CISA's cybersecurity-advisories page whose text mentions
# the current year.

# Send a descriptive User-Agent rather than the default "Python-urllib/x.y";
# some sites refuse requests from generic library agents.
USER_AGENT = "WebScrapingExercises/1.0 (educational notebook)"

url = "https://www.cisa.gov/news-events/cybersecurity-advisories"
request = Request(url, headers={"User-Agent": USER_AGENT})

# Read the body inside the context manager so the HTTP response is closed as
# soon as the page has been downloaded; the timeout avoids hanging forever.
with urlopen(request, timeout=30) as response:
    soup = BeautifulSoup(response.read(), "html.parser")

current_year = str(datetime.now().year)

# Heuristic: treat every <a> whose text contains the year as an alert.
# NOTE(review): this only sees the fetched page and relies on the year
# appearing in the link text itself - confirm against the live markup.
alerts = [
    link.text.strip()
    for link in soup.find_all("a")
    if current_year in link.text
]

print(f"Security alerts in {current_year}: {len(alerts)}")


## 🌟 Exercise 6 : Scraping Movie Details from IMDb

In [None]:

from urllib.parse import urljoin
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
import random

# Pick up to ten random movies from the IMDb Top chart and print each one's
# name, year and plot summary (fetched from the movie's own page).

BASE_URL = "https://www.imdb.com"
SAMPLE_SIZE = 10
# Send a descriptive User-Agent rather than the default "Python-urllib/x.y";
# some sites refuse requests from generic library agents.
REQUEST_HEADERS = {"User-Agent": "WebScrapingExercises/1.0 (educational notebook)"}


def _fetch_soup(page_url):
    """Download *page_url* and return it parsed with html.parser.

    The response is read inside a context manager so the connection is
    closed once the body has been downloaded.

    Raises:
        OSError: on network failures, HTTP error statuses, or a timeout
            (urllib's URLError/HTTPError are OSError subclasses).
    """
    request = Request(page_url, headers=REQUEST_HEADERS)
    with urlopen(request, timeout=30) as response:
        return BeautifulSoup(response.read(), "html.parser")


url = "https://www.imdb.com/chart/top/"
soup = _fetch_soup(url)

# NOTE(review): these selectors assume the chart is rendered as table cells
# with class "titleColumn" - verify against the live page markup.
# Keep only cells that contain a link, and derive `links` from those same
# cells so the two lists can never fall out of step.
movies = [cell for cell in soup.select("td.titleColumn") if cell.a is not None]
links = [cell.a for cell in movies]

if not movies:
    print("No movies found on the chart page")

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample at the number of movies actually found.
for cell in random.sample(movies, min(SAMPLE_SIZE, len(movies))):
    link = cell.a
    name = link.text

    # The year is expected in the cell's first <span>, wrapped in parentheses.
    year_tag = cell.span
    year = year_tag.text.strip("()") if year_tag is not None else "unknown year"

    # urljoin resolves the relative href against the site root.
    movie_url = urljoin(BASE_URL, link["href"])
    try:
        movie_soup = _fetch_soup(movie_url)
    except OSError as error:
        # One unreachable detail page should not abort the remaining movies.
        print(f"{name} ({year}): could not fetch details ({error})")
        continue

    summary_tag = movie_soup.find("span", {"data-testid": "plot-l"})
    summary = summary_tag.text if summary_tag is not None else "No summary"
    print(f"{name} ({year}): {summary}")
