In [5]:
#Exercise 1
!pip -q install beautifulsoup4 lxml

In [3]:
html_text = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
    <style>
        body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
    </style>
</head>
<body>

    <header>
        <h1>Welcome to Sports World</h1>
        <p>Your one-stop destination for the latest sports news and videos.</p>
    </header>

    <nav>
        <a href="#football">Football</a>
        <a href="#basketball">Basketball</a>
        <a href="#tennis">Tennis</a>
    </nav>

    <section id="football">
        <h2>Football</h2>
        <article>
            <h3>Latest Football News</h3>
            <p>Read about the latest football matches and player news.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/football-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="basketball">
        <h2>Basketball</h2>
        <article>
            <h3>NBA Highlights</h3>
            <p>Watch highlights from the latest NBA games.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/basketball-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="tennis">
        <h2>Tennis</h2>
        <article>
            <h3>Grand Slam Updates</h3>
            <p>Get the latest updates from the world of Grand Slam tennis.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/tennis-video-id" frameborder="0" allowfullscreen></iframe>
            </div>
        </article>
    </section>

    <footer>
        <form action="mailto:contact@sportsworld.com" method="post" enctype="text/plain">
            <label for="name">Name:</label><br>
            <input type="text" id="name" name="name"><br>
            <label for="email">Email:</label><br>
            <input type="email" id="email" name="email"><br>
            <label for="message">Message:</label><br>
            <textarea id="message" name="message" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Send">
        </form>
    </footer>

</body>
</html>
"""

from pathlib import Path

html_path = Path("/content/sports_world.html")
html_path.write_text(html_text, encoding="utf-8")

html_uri = html_path.as_uri()  # e.g., 'file:///content/sports_world.html'
html_uri

'file:///content/sports_world.html'

In [4]:
# Use urlopen() to fetch the HTML, then parse with BeautifulSoup
from urllib.request import urlopen
from bs4 import BeautifulSoup

# 1) Read the HTML content of the page (via file:// URI)
with urlopen(html_uri) as resp:
    html_bytes = resp.read()

# 2) Create a BeautifulSoup object to parse this HTML
soup = BeautifulSoup(html_bytes, "lxml")  # or "html.parser"

# 3) Find the title of the webpage (<title> tag content)
page_title = soup.title.string if soup.title else None

# 4) Extract all paragraphs (<p> tags) from the page
paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]

# 5) Retrieve all links from <a href="..."> tags
links = [a["href"] for a in soup.find_all("a", href=True)]

print("Page title:", page_title)
print("\nParagraphs:")
for i, t in enumerate(paragraphs, 1):
    print(f"{i}. {t}")

print("\nLinks (<a href=...>):")
for i, u in enumerate(links, 1):
    print(f"{i}. {u}")

Page title: Sports World

Paragraphs:
1. Your one-stop destination for the latest sports news and videos.
2. Read about the latest football matches and player news.
3. Watch highlights from the latest NBA games.
4. Get the latest updates from the world of Grand Slam tennis.

Links (<a href=...>):
1. #football
2. #basketball
3. #tennis


In [6]:
# Bonus: collect other useful URLs (iframes, forms)
iframes = [i.get("src") for i in soup.find_all("iframe", src=True)]
form_actions = [f.get("action") for f in soup.find_all("form", action=True)]

print("Iframe SRCs:")
for i, u in enumerate(iframes, 1):
    print(f"{i}. {u}")

print("\nForm actions:")
for i, u in enumerate(form_actions, 1):
    print(f"{i}. {u}")

Iframe SRCs:
1. https://www.youtube.com/embed/football-video-id
2. https://www.youtube.com/embed/basketball-video-id
3. https://www.youtube.com/embed/tennis-video-id

Form actions:
1. mailto:contact@sportsworld.com


In [7]:
#Exercise 2
from urllib.request import urlopen

# Wikipedia main page URL
base_url = "https://en.wikipedia.org/wiki/Main_Page"

# robots.txt is always located at the root of the domain
robots_url = "https://en.wikipedia.org/robots.txt"

# Open the URL and read the content
with urlopen(robots_url) as resp:
    robots_content = resp.read().decode("utf-8")

# Display the content
print(robots_content)

﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: 

In [8]:
# Save robots.txt to a local file
with open("wikipedia_robots.txt", "w", encoding="utf-8") as f:
    f.write(robots_content)

print("Saved to wikipedia_robots.txt")

Saved to wikipedia_robots.txt


In [9]:
#Exercise 3
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Target URL
url = "https://en.wikipedia.org/wiki/Main_Page"

# 1) Open and read the HTML
with urlopen(url) as resp:
    html_bytes = resp.read()

# 2) Create BeautifulSoup object
soup = BeautifulSoup(html_bytes, "lxml")  # or "html.parser"

In [10]:
# Find all header tags: h1 to h6
headers = []
for level in range(1, 7):
    tag = f"h{level}"
    found = [h.get_text(strip=True) for h in soup.find_all(tag)]
    headers.extend([(tag, text) for text in found])

# Display results
for tag, text in headers:
    print(f"{tag}: {text}")

h1: Main Page
h1: Welcome toWikipedia
h2: From today's featured article
h2: Did you know ...
h2: In the news
h2: On this day
h2: Today's featured picture
h2: Other areas of Wikipedia
h2: Wikipedia's sister projects
h2: Wikipedia languages


In [11]:
import pandas as pd

df_headers = pd.DataFrame(headers, columns=["tag", "text"])
df_headers

Unnamed: 0,tag,text
0,h1,Main Page
1,h1,Welcome toWikipedia
2,h2,From today's featured article
3,h2,Did you know ...
4,h2,In the news
5,h2,On this day
6,h2,Today's featured picture
7,h2,Other areas of Wikipedia
8,h2,Wikipedia's sister projects
9,h2,Wikipedia languages


In [12]:
#Exercise 4
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Example URL (you can replace with any page)
url = "https://en.wikipedia.org/wiki/Main_Page"

# 1) Open and read the HTML
with urlopen(url) as resp:
    html_bytes = resp.read()

# 2) Create BeautifulSoup object
soup = BeautifulSoup(html_bytes, "lxml")

In [13]:
# Check if title exists
if soup.title and soup.title.string:
    print("Page contains a title.")
    print("Title text:", soup.title.string.strip())
else:
    print("Page does NOT contain a title.")

Page contains a title.
Title text: Wikipedia, the free encyclopedia


In [14]:
#Exercise 5
# Install dependencies (BeautifulSoup + lxml for fast parsing)
!pip -q install requests beautifulsoup4 lxml

In [15]:
import math
import time
import datetime as dt
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup

# Base listing URL for CISA "Cybersecurity Alerts & Advisories"
BASE_URL = "https://www.cisa.gov/news-events/cybersecurity-advisories"

# Build filtered query for Alerts of a given year
def build_url(year: int, page: int = 0, per_page: int = 50) -> str:
    """
    Construct a URL that filters by:
      - advisory_type:93  (Alert)
      - release_date_year:YYYY
      - items_per_page (to reduce number of pages to crawl)
      - page (0-based on CISA site)
    """
    # The "f[...]" parameters are how cisa.gov encodes filters
    query = {
        "f[0]": "advisory_type:93",              # Alert
        "f[1]": f"release_date_year:{year}",     # Year filter
        "items_per_page": str(per_page),
        "page": str(page),
        "sort_by": "field_release_date",         # sort by date (descending by default)
    }
    return f"{BASE_URL}?{urlencode(query)}"

def count_alert_cards(html: str) -> int:
    """
    Parse a result page and count the number of Alert cards.
    We rely on link pattern '/news-events/alerts/' present in Alert cards.
    """
    soup = BeautifulSoup(html, "lxml")

    # Primary: CISA uses USA.gov design system cards; titles in '.usa-card__heading a'
    cards = soup.select(".usa-card__heading a")
    if not cards:
        # Fallback: look for any h3/a that points to '/news-events/alerts/'
        cards = soup.select("h3 a")

    # Keep only links that clearly belong to "alerts" section
    alert_links = [a for a in cards if a.has_attr("href") and "/news-events/alerts/" in a["href"]]

    # It's possible that ICS advisories appear if filters fail; our href filter protects the count.
    return len(alert_links)

def get_alerts_count_current_year(polite_delay_sec: float = 0.5, per_page: int = 50) -> int:
    """
    Iterate paginated listing and sum all Alert cards for the current year.
    Stops when a page returns zero items.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
        )
    }
    session = requests.Session()
    session.headers.update(headers)
    session.timeout = 25

    year = dt.datetime.utcnow().year  # use current UTC year for stability
    total = 0
    page = 0

    while True:
        url = build_url(year=year, page=page, per_page=per_page)
        try:
            r = session.get(url, timeout=25)
            r.raise_for_status()
        except Exception as e:
            print(f"[warn] Failed to fetch page {page}: {e}")
            break

        n = count_alert_cards(r.text)
        # Debug print (optional)
        print(f"Year {year} | page {page} | alerts on page: {n} | url: {url}")

        if n == 0:
            # No more results (or filters changed)
            break

        total += n
        page += 1
        time.sleep(polite_delay_sec)  # be polite

    return year, total

year, total_alerts = get_alerts_count_current_year()
print(f"\nTotal CISA Alerts in {year}: {total_alerts}")

Year 2025 | page 0 | alerts on page: 10 | url: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2025&items_per_page=50&page=0&sort_by=field_release_date
Year 2025 | page 1 | alerts on page: 10 | url: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2025&items_per_page=50&page=1&sort_by=field_release_date
Year 2025 | page 2 | alerts on page: 10 | url: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2025&items_per_page=50&page=2&sort_by=field_release_date
Year 2025 | page 3 | alerts on page: 10 | url: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2025&items_per_page=50&page=3&sort_by=field_release_date
Year 2025 | page 4 | alerts on page: 10 | url: https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_t

In [16]:
from urllib.parse import urljoin

def collect_alert_links_for_year(year: int, max_pages: int = 20, per_page: int = 50):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
        )
    }
    session = requests.Session()
    session.headers.update(headers)
    found = []

    for page in range(max_pages):
        url = build_url(year=year, page=page, per_page=per_page)
        r = session.get(url, timeout=25)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "lxml")

        anchors = soup.select(".usa-card__heading a")
        if not anchors:
            anchors = soup.select("h3 a")

        # Keep only "alerts" links
        page_links = [urljoin(BASE_URL, a["href"]) for a in anchors if a.has_attr("href") and "/news-events/alerts/" in a["href"]]
        if not page_links:  # stop when page is empty
            break
        found.extend(page_links)

    return found

# Example (uncomment to preview 10 links)
# links = collect_alert_links_for_year(year)
# links[:10], len(links)


In [17]:
#Exercise 6
!pip -q install requests beautifulsoup4 lxml

In [18]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

LIST_URL = "https://www.imdb.com/list/ls091294718/"

headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
    )
}

resp = requests.get(LIST_URL, headers=headers)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "lxml")

# Селектор для элементов списка: каждый фильм — '.lister-item-header a'
movie_anchors = soup.select(".lister-item-header a")[:10]
movie_urls = [urljoin("https://www.imdb.com", a["href"]) for a in movie_anchors]

print("First 10 movie URLs:")
for url in movie_urls:
    print(url)


First 10 movie URLs:


In [19]:
import time

results = []

for idx, url in enumerate(movie_urls, start=1):
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    msoup = BeautifulSoup(r.text, "lxml")

    # Заголовок фильма
    title_tag = msoup.find("h1")
    title_text = title_tag.get_text(strip=True) if title_tag else "N/A"

    # Год релиза
    year_tag = msoup.find("span", id="titleYear")
    year_text = year_tag.get_text(strip=True).strip("()") if year_tag else "N/A"

    # Краткое описание
    summary_block = msoup.find("div", class_="summary_text")
    summary_text = summary_block.get_text(strip=True) if summary_block else None

    if not summary_text:
        fallback = msoup.find("div", class_="inline canwrap")
        if fallback:
            summary_text = fallback.get_text(strip=True)

    results.append({
        "title": title_text,
        "year": year_text,
        "summary": summary_text or "N/A",
        "url": url
    })

    print(f"{idx}. {title_text} ({year_text})")
    time.sleep(0.5)  # пауза, чтобы не перегружать сервер


In [20]:
import pandas as pd

df = pd.DataFrame(results, columns=["title", "year", "summary", "url"])
df


Unnamed: 0,title,year,summary,url
