# Solutions: Web Data Collection and Parsing Tasks
This notebook provides example solutions for the 10 practical tasks.

## Task 1: Fetch and Parse HTML Title

In [1]:
import requests
from bs4 import BeautifulSoup

url = "https://example.com"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
title = soup.title.string
print("Page Title:", title)

Page Title: Example Domain


## Task 2: Scrape Quotes with BeautifulSoup

In [2]:
import requests
from bs4 import BeautifulSoup

url = "http://quotes.toscrape.com"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
quotes = []
for quote_block in soup.select("div.quote"):
    text = quote_block.find("span", class_="text").get_text(strip=True)
    author = quote_block.find("small", class_="author").get_text(strip=True)
    quotes.append({"quote": text, "author": author})
print(quotes)

[{'quote': '“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'author': 'Albert Einstein'}, {'quote': '“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'author': 'J.K. Rowling'}, {'quote': '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'author': 'Albert Einstein'}, {'quote': '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”', 'author': 'Jane Austen'}, {'quote': "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”", 'author': 'Marilyn Monroe'}, {'quote': '“Try not to become a man of success. Rather become a man of value.”', 'author': 'Albert Einstein'}, {'quote': '“It is better to be hated for what you are than to be loved for what you are not.”', 'author': 'André Gide'}, {'quote':

## Task 3: Read and Filter CSV Data

In [5]:
import csv

with open("products.csv", newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    filtered = [row for row in reader if float(row["price"]) > 20.0]
    for product in filtered:
        print(product)

{'id': '1', 'name': 'Wireless Mouse', 'price': '25.99', 'category': 'Electronics'}
{'id': '7', 'name': 'Bluetooth Speaker', 'price': '45.0', 'category': 'Electronics'}
{'id': '8', 'name': 'Desk Lamp', 'price': '29.95', 'category': 'Home'}
{'id': '9', 'name': 'Backpack', 'price': '49.99', 'category': 'Accessories'}


## Task 4: Convert CSV to JSON

In [6]:
import csv
import json

products = []
with open("products.csv", newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        row["price"] = float(row["price"])
        products.append(row)

with open("products.json", "w", encoding='utf-8') as jsonfile:
    json.dump(products, jsonfile, indent=2)

## Task 5: Parse JSON and Extract Nested Fields

In [7]:
import json

with open("data.json", encoding='utf-8') as f:
    data = json.load(f)

emails = []
for item in data:
    email = item.get("user", {}).get("profile", {}).get("email")
    if email:
        emails.append(email)

print(emails)

['alice.johnson@example.com', 'bob.smith@example.com', 'carol.davis@example.com', 'david.lee@example.com', 'eva.green@example.com']


## Task 6: Read and Convert XML to Dictionary

In [8]:
import xml.etree.ElementTree as ET
from collections import defaultdict

def xml_to_dict(elem):
    d = {}
    for child in elem:
        if list(child):
            d[child.tag] = xml_to_dict(child)
        else:
            d[child.tag] = child.text
    return d

tree = ET.parse("records.xml")
root = tree.getroot()
result = {root.tag: xml_to_dict(root)}
print(result)

{'records': {'record': {'id': '5', 'name': 'Eva Green', 'email': 'eva.green@example.com'}}}


## Task 7: Extract Emails with Regular Expressions

In [9]:
import re

pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
with open("emails.txt", encoding='utf-8') as f:
    text = f.read()

emails = re.findall(pattern, text)
print(emails)

['john.doe@example.com', 'jane_smith123@work-mail.org', 'admin@company.com.', 'contact@support.company.net', 'foo.bar@sub.domain.co.uk']


## Task 8: Validate Phone Numbers with Regex

In [10]:
import re

pattern = re.compile(r"^(?:\+1-\d{3}-\d{3}-\d{4}|\(\d{3}\) \d{3}-\d{4})$")

def is_valid_phone(number):
    return bool(pattern.match(number))

# Examples
print(is_valid_phone("+1-800-555-1234"))  # True
print(is_valid_phone("(123) 456-7890"))   # True
print(is_valid_phone("123-456-7890"))     # False

True
True
False


## Task 9: Scrape HTML Table and Save to CSV

In [14]:
import requests
from bs4 import BeautifulSoup
import csv

url = "https://www.iana.org/domains/reserved"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

table = soup.find("table")
headers = [th.get_text(strip=True) for th in table.find_all("th")]

rows = []
for tr in table.find_all("tr")[1:]:
    cols = [td.get_text(strip=True) for td in tr.find_all("td")]
    if cols:
        rows.append(cols)

with open("table_data.csv", "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)
    writer.writerows(rows)