# Домашнее задание 3. Парсинг, Git и тестирование на Python

**Цели задания:**

* Освоить базовые подходы к web-scraping с библиотеками `requests` и `BeautisulSoup`: навигация по страницам, извлечение HTML-элементов, парсинг.
* Научиться автоматизировать задачи с использованием библиотеки `schedule`.
* Попрактиковаться в использовании Git и оформлении проектов на GitHub.
* Написать и запустить простые юнит-тесты с использованием `pytest`.


В этом домашнем задании вы разработаете систему для автоматического сбора данных о книгах с сайта [Books to Scrape](http://books.toscrape.com). Нужно реализовать функции для парсинга всех страниц сайта, извлечения информации о книгах, автоматического ежедневного запуска задачи и сохранения результата.

Важной частью задания станет оформление проекта: вы создадите репозиторий на GitHub, оформите `README.md`, добавите артефакты и напишете базовые тесты на `pytest`.


In [2]:
import requests
from bs4 import BeautifulSoup

def get_book_data(url):
    """
    парсит страницу книги и возвращает словарь с инфой
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    title = soup.find("h1").text.strip()
    price = soup.find("p", class_="price_color").text.strip()
    avail = soup.find("p", class_="instock availability").text.strip()
    rating_tag = soup.find("p", class_="star-rating")
    rating = rating_tag["class"][1] if rating_tag else "No rating"

    return {
        "title": title,
        "price": price,
        "availability": avail,
        "rating": rating
    }

# проверка
book_url = "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
print(get_book_data(book_url))

{'title': 'A Light in the Attic', 'price': 'Â£51.77', 'availability': 'In stock (22 available)', 'rating': 'Three'}


In [3]:
import time

def scrape_books(pages=3, is_save=True):
    """
    собирает книги с сайта books.toscrape.com
    """
    base = "http://books.toscrape.com/catalogue/page-{}.html"
    all_books = []

    for p in range(1, pages + 1):
        print("страница:", p)
        r = requests.get(base.format(p))
        soup = BeautifulSoup(r.text, "html.parser")

        links = []
        for a in soup.select("h3 a"):
            href = a.get("href").replace("../../", "")
            links.append("http://books.toscrape.com/catalogue/" + href)

        for link in links:
            try:
                data = get_book_data(link)
                all_books.append(data)
            except Exception as e:
                print("ошибка:", e)

        time.sleep(1)

    if is_save:
        with open("books_data.txt", "w", encoding="utf-8") as f:
            for b in all_books:
                f.write(str(b) + "\n")

    print("всего книг собрано:", len(all_books))
    return all_books


# тест
res = scrape_books(pages=2, is_save=False)
print(res[:3])

страница: 1
страница: 2
всего книг собрано: 40
[{'title': 'A Light in the Attic', 'price': 'Â£51.77', 'availability': 'In stock (22 available)', 'rating': 'Three'}, {'title': 'Tipping the Velvet', 'price': 'Â£53.74', 'availability': 'In stock (20 available)', 'rating': 'One'}, {'title': 'Soumission', 'price': 'Â£50.10', 'availability': 'In stock (20 available)', 'rating': 'One'}]


In [4]:
import schedule
import time

def job():
    print("запуск парсинга...")
    scrape_books(pages=3, is_save=True)

# пример проверки (можно временно поставить 1 минуту)
schedule.every().day.at("19:00").do(job)

print("ожидание запуска...")
while True:
    schedule.run_pending()
    time.sleep(60)

ModuleNotFoundError: No module named 'schedule'

In [5]:
!pip install schedule 

zsh:1: command not found: pip


In [6]:
!python3 -m pip install schedule

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [7]:
from scraper import get_book_data, scrape_books

def test_get_book_data():
    url = "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html"
    data = get_book_data(url)
    assert "title" in data
    assert "price" in data
    assert isinstance(data, dict)

def test_scrape_books():
    res = scrape_books(pages=1, is_save=False)
    assert isinstance(res, list)
    assert len(res) > 0

ModuleNotFoundError: No module named 'scraper'

In [8]:
from scraper import get_book_data, scrape_books

ModuleNotFoundError: No module named 'scraper'

In [10]:
!ls

HW_03_python_ds_2025.ipynb


In [11]:
!ls

HW_03_python_ds_2025.ipynb


In [12]:
!ls

HW_03_python_ds_2025.ipynb scraper.py


In [13]:
from scraper import get_book_data, scrape_books

In [14]:
res = scrape_books(pages=1, is_save=False)
print(res[:3])

страница: 1
ошибка при обработке: HTTPConnectionPool(host='books.xn--toscrape-6eh.com', port=80): Max retries exceeded with url: /catalogue/a-light-in-the-attic_1000/index.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10917b790>: Failed to resolve 'books.xn--toscrape-6eh.com' ([Errno 8] nodename nor servname provided, or not known)"))
ошибка при обработке: HTTPConnectionPool(host='books.xn--toscrape-6eh.com', port=80): Max retries exceeded with url: /catalogue/tipping-the-velvet_999/index.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10917bac0>: Failed to resolve 'books.xn--toscrape-6eh.com' ([Errno 8] nodename nor servname provided, or not known)"))
ошибка при обработке: HTTPConnectionPool(host='books.xn--toscrape-6eh.com', port=80): Max retries exceeded with url: /catalogue/soumission_998/index.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10917bf00>: Failed to resolve 'b

In [15]:
res = scrape_books(pages=1, is_save=False)
print(res[:3])

страница: 1
ошибка при обработке: HTTPConnectionPool(host='books.xn--toscrape-6eh.com', port=80): Max retries exceeded with url: /catalogue/a-light-in-the-attic_1000/index.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10917abe0>: Failed to resolve 'books.xn--toscrape-6eh.com' ([Errno 8] nodename nor servname provided, or not known)"))
ошибка при обработке: HTTPConnectionPool(host='books.xn--toscrape-6eh.com', port=80): Max retries exceeded with url: /catalogue/tipping-the-velvet_999/index.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10917b8a0>: Failed to resolve 'books.xn--toscrape-6eh.com' ([Errno 8] nodename nor servname provided, or not known)"))
ошибка при обработке: HTTPConnectionPool(host='books.xn--toscrape-6eh.com', port=80): Max retries exceeded with url: /catalogue/soumission_998/index.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10917bac0>: Failed to resolve 'b

In [16]:
from scraper import scrape_books
res = scrape_books(pages=1, is_save=False)
print(res[:3])

страница: 1
ошибка при обработке: HTTPConnectionPool(host='books.xn--toscrape-6eh.com', port=80): Max retries exceeded with url: /catalogue/a-light-in-the-attic_1000/index.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10917abe0>: Failed to resolve 'books.xn--toscrape-6eh.com' ([Errno 8] nodename nor servname provided, or not known)"))
ошибка при обработке: HTTPConnectionPool(host='books.xn--toscrape-6eh.com', port=80): Max retries exceeded with url: /catalogue/tipping-the-velvet_999/index.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x10917bce0>: Failed to resolve 'books.xn--toscrape-6eh.com' ([Errno 8] nodename nor servname provided, or not known)"))
ошибка при обработке: HTTPConnectionPool(host='books.xn--toscrape-6eh.com', port=80): Max retries exceeded with url: /catalogue/soumission_998/index.html (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x109e006b0>: Failed to resolve 'b

In [17]:
!ls


HW_03_python_ds_2025.ipynb scraper.py
[34m__pycache__[m[m


In [1]:
from scraper import scrape_books

res = scrape_books(pages=1, is_save=False)
print(res[:3])

Страница: 1
Не удалось загрузить страницу: HTTPSConnectionPool(host='books.toscrape.glitch.me', port=443): Max retries exceeded with url: /catalogue/page-1.html (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'books.toscrape.glitch.me'. (_ssl.c:1077)")))
Всего книг собрано: 0
[]


In [2]:
import urllib3
urllib3.disable_warnings()

In [3]:
from scraper import scrape_books

res = scrape_books(pages=1, is_save=False)
print(res[:3])

Страница: 1
Не удалось загрузить страницу: HTTPSConnectionPool(host='books.toscrape.glitch.me', port=443): Max retries exceeded with url: /catalogue/page-1.html (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'books.toscrape.glitch.me'. (_ssl.c:1077)")))
Всего книг собрано: 0
[]


In [4]:
from scraper import scrape_books

res = scrape_books(pages=1, is_save=False)
print(res[:3])

Страница: 1
Не удалось загрузить страницу: HTTPSConnectionPool(host='books.toscrape.glitch.me', port=443): Max retries exceeded with url: /catalogue/page-1.html (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'books.toscrape.glitch.me'. (_ssl.c:1077)")))
Всего книг собрано: 0
[]


In [5]:
from scraper import scrape_books
res = scrape_books(pages=1, is_save=False)
print(res[:3])

Страница: 1
Не удалось загрузить страницу: HTTPSConnectionPool(host='books.toscrape.glitch.me', port=443): Max retries exceeded with url: /catalogue/page-1.html (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'books.toscrape.glitch.me'. (_ssl.c:1077)")))
Всего книг собрано: 0
[]


In [6]:
from scraper import scrape_books
res = scrape_books(pages=1, is_save=False)
print(res[:3])

Страница: 1
Не удалось загрузить страницу: HTTPSConnectionPool(host='books.toscrape.glitch.me', port=443): Max retries exceeded with url: /catalogue/page-1.html (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'books.toscrape.glitch.me'. (_ssl.c:1077)")))
Всего книг собрано: 0
[]


In [7]:
from scraper import scrape_books
res = scrape_books(pages=1, is_save=False)
print(res[:3])

Страница: 1
Не удалось загрузить страницу: HTTPSConnectionPool(host='books.toscrape.glitch.me', port=443): Max retries exceeded with url: /catalogue/page-1.html (Caused by SSLError(SSLCertVerificationError(1, "[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: Hostname mismatch, certificate is not valid for 'books.toscrape.glitch.me'. (_ssl.c:1077)")))
Всего книг собрано: 0
[]


In [8]:
import importlib, scraper
importlib.reload(scraper)

<module 'scraper' from '/Users/david/Downloads/notebooks/scraper.py'>

In [9]:
from scraper import scrape_books

res = scrape_books(pages=1, is_save=False)
print(res[:3])

Страница: 1
Всего книг собрано: 20
[{'title': 'A Light in the Attic', 'price': 'Â£51.77', 'availability': 'In stock (22 available)', 'rating': 'Three'}, {'title': 'Tipping the Velvet', 'price': 'Â£53.74', 'availability': 'In stock (20 available)', 'rating': 'One'}, {'title': 'Soumission', 'price': 'Â£50.10', 'availability': 'In stock (20 available)', 'rating': 'One'}]
