In [1]:
from typing import List
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from fake_useragent import UserAgent




### 1. Сбор ссылок на книги

In [4]:
def get_page_soup(url_link, timeout=10):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url_link: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with ``html.parser`` when the
        response status is OK. On a non-OK status an empty list is returned
        (kept for backward compatibility), so callers must check the result
        before calling soup methods on it.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            timeout expires.
    """
    response = requests.get(
        url=url_link,
        headers={'User-Agent': UserAgent().chrome},
        timeout=timeout,
    )

    if not response.ok:
        # Failure sentinel preserved from the original implementation.
        return []

    return BeautifulSoup(response.content, 'html.parser')

In [5]:
# Base URL of the catalogue; page URLs are built as main_url + 'page-<n>.html'.
main_url = 'http://books.toscrape.com/catalogue/'
page_number = 5

# Smoke-test the download helper on a single catalogue page.
soup = get_page_soup(main_url + f'page-{page_number}.html')
# Cell output: text of the first <a> element found on the page.
soup.find_all('a')[0].text

'Books to Scrape'

In [6]:
def get_book_links(page_soup) -> List[str]:
    """Collect the book hrefs listed on one catalogue page.

    Each link is read from the ``<a>`` nested in an ``<h3>`` heading.

    Args:
        page_soup: Parsed catalogue page. A falsy value (for example the
            empty list ``get_page_soup`` returns for a failed request) is
            accepted and yields no links.

    Returns:
        The ``href`` values in document order, exactly as written in the
        page. Headings without a link or without an ``href`` are skipped.
    """
    if not page_soup:
        # Nothing was downloaded for this page; report "no links" instead of
        # failing with AttributeError on a non-soup object.
        return []

    links = []
    for heading in page_soup.find_all('h3'):
        anchor = heading.a
        if anchor is None:
            continue
        href = anchor.attrs.get('href')
        if href:
            links.append(href)

    return links

In [7]:
main_url = 'http://books.toscrape.com/catalogue/'
page_number = 10

# Accumulates the hrefs gathered from catalogue pages 1..page_number.
book_links = []

for page in range(1, page_number + 1):
    page_soup = get_page_soup(main_url + f'page-{page}.html')
    page_book_links = get_book_links(page_soup)
    book_links.extend(page_book_links)


### 2. Сбор информации о книгах 

In [36]:
def getBookName(soup):
    """Return the book title from a product page.

    The title is the text of the ``<h1>`` inside the
    ``div.col-sm-6.product_main`` container.

    Args:
        soup: Parsed product page.

    Returns:
        The title text, or an empty string when either the container or its
        ``<h1>`` is missing.
    """
    product_main = soup.find('div', attrs={'class': {'col-sm-6 product_main'}})

    # Check the container before touching `.h1`; previously a missing
    # container raised AttributeError before the empty-string fallback ran.
    if product_main is None or product_main.h1 is None:
        return ""

    return product_main.h1.text

def getBookRating(soup):
    """Return the star rating of a book as an integer from 1 to 5.

    The rating is encoded as a CSS class word ("One" .. "Five") on the
    ``<p class="star-rating ...">`` element.

    Args:
        soup: Parsed product page.

    Returns:
        The numeric rating, or ``None`` when the rating element is missing or
        carries no recognised rating word.
    """
    rating_dict_convert = {
        "One": 1,
        "Two": 2,
        "Three": 3,
        "Four": 4,
        "Five": 5
    }

    rating_tag = soup.find('p', attrs={'class': {'star-rating'}})
    if rating_tag is None:
        # Previously this path indexed an empty string and raised IndexError.
        return None

    css_classes = rating_tag.attrs.get('class', [])
    if isinstance(css_classes, str):
        # Some parsers keep the class attribute as one space-separated string.
        css_classes = css_classes.split()

    # Look at every class instead of assuming the rating word is the last one.
    for css_class in css_classes:
        if css_class in rating_dict_convert:
            return rating_dict_convert[css_class]

    return None

def getBookDescription(soup):
    """Return the product description text, or "" when the page has none.

    The description is the text of the first ``<p>`` that follows the
    ``div#product_description`` marker element.
    """
    description_marker = soup.find('div', attrs={'id': {'product_description'}})

    if not description_marker:
        return ""

    return description_marker.find_next('p').text

def getBookInformation(soup):
    """Extract the "Product Information" table values from a product page.

    The first ``<table class="table">`` is parsed with ``pandas.read_html``
    and its second column (the values) is read by row position.

    Args:
        soup: Parsed product page.

    Returns:
        A tuple ``(upc, type, price_with_tax, price_without_tax, tax,
        availability, num_reviews)``. The three monetary values have their
        first character (the currency symbol) removed.

    Raises:
        ValueError: if the page has no matching table or the table has fewer
            than seven rows.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from io import StringIO

    tables = soup.find_all('table', attrs={'class': 'table'})
    if not tables:
        raise ValueError("product information table not found on the page")

    # Parse only the first table, wrapped in StringIO: passing literal HTML
    # to read_html is deprecated, and str() of the whole result list adds
    # list punctuation around the markup.
    values = pd.read_html(StringIO(str(tables[0])))[0][1]

    if len(values) < 7:
        raise ValueError(
            f"product information table has {len(values)} rows, expected at least 7"
        )

    book_upc = values[0]
    book_type = values[1]
    book_price_without_tax = values[2][1:]  # drop the leading currency symbol
    book_price_with_tax = values[3][1:]
    book_tax = values[4][1:]
    book_availability = values[5]
    book_num_reviews = values[6]

    return book_upc, book_type, book_price_with_tax, book_price_without_tax, book_tax, book_availability, book_num_reviews


In [30]:
def getBookData(book_page, timeout=10):
    """Download one product page and collect its fields into a dict.

    Args:
        book_page: Absolute URL of the product page.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        On success, a dict with the keys ``name``, ``rating``,
        ``description``, ``upc``, ``type``, ``price_no_tax``, ``price``,
        ``tax``, ``availability`` and ``reviews_count``. On a non-OK response
        the integer HTTP status code is returned instead, so callers must
        check the result type before treating it as a row.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            timeout expires.
    """
    response = requests.get(
        book_page,
        headers={'User-Agent': UserAgent().chrome},
        timeout=timeout,
    )

    if not response.ok:
        # Failure contract preserved from the original: status code, not a dict.
        return response.status_code

    soup = BeautifulSoup(response.content, 'html.parser')

    book_name = getBookName(soup)
    book_rating = getBookRating(soup)
    book_description = getBookDescription(soup)

    (book_upc, book_type, book_price_with_tax, book_price_without_tax,
     book_tax, book_availability, book_num_reviews) = getBookInformation(soup)

    data_row = {"name": book_name, "rating": book_rating,
                "description": book_description, "upc": book_upc,
                "type": book_type, "price_no_tax": book_price_without_tax,
                "price": book_price_with_tax, "tax": book_tax,
                "availability": book_availability, "reviews_count": book_num_reviews}

    return data_row

In [None]:
# Manual check of the field extractors against one known product page.
test_url = 'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'
soup = get_page_soup(test_url)

book_name = getBookName(soup)
book_rating = getBookRating(soup)
book_description = getBookDescription(soup)

# One value per line; the description is cut to its first 50 characters.
print(book_name, book_rating, book_description[:50] + "...", sep="\n")

In [37]:
main_url = 'http://books.toscrape.com/catalogue/'
page_number = 2

# Step 1: collect the book hrefs from catalogue pages 1..page_number.
book_links = []

for page in range(1, page_number + 1):
    page_soup = get_page_soup(main_url + f'page-{page}.html')

    if not page_soup:
        # get_page_soup returns an empty list for a failed request; skip the
        # page instead of crashing while extracting links from it.
        continue

    page_book_links = get_book_links(page_soup)
    book_links += page_book_links


# Step 2: download every product page and keep the successfully parsed rows.
book_info = []

for book_link in book_links:
    data_row = getBookData(main_url + book_link)

    # getBookData returns an integer HTTP status code on failure; mixing those
    # with dict rows would break the DataFrame built from book_info.
    if isinstance(data_row, dict):
        book_info.append(data_row)


### 3. Создание DataFrame из полученной информации

In [None]:
# Build the frame from the rows collected in book_info.
book_info_df = pd.DataFrame(book_info)
# Print (rows, columns) as a quick size check.
print(book_info_df.shape)
# Cell output: the first ten rows.
book_info_df.head(10) 

In [66]:
import re

In [67]:
# Keep only the first run of digits from the availability text. The vectorised
# `.str.extract` with a raw-string pattern avoids the invalid "\d" escape of a
# plain string literal and yields NaN (instead of raising) when a value
# contains no digits.
book_info_df['availability'] = book_info_df['availability'].str.extract(r'(\d+)', expand=False)

In [68]:
# Cell output: three randomly chosen rows. No random_state is passed, so the
# selection differs between runs.
book_info_df.sample(3)

Unnamed: 0,name,rating,description,upc,type,price_no_tax,price,tax,availability,reviews_count
24,Black Dust,5,"No matter how busy he keeps himself, successfu...",00bfed9e18bb36f3,Books,34.53,34.53,0.0,19,0
5,The Requiem Red,1,Patient Twenty-nine.A monster roams the halls ...,f77dbf2323deb740,Books,22.65,22.65,0.0,19,0
21,How Music Works,2,How Music Works is David Byrne’s remarkable an...,327f68a59745c102,Books,37.32,37.32,0.0,19,0


In [74]:
# Columns scraped as text that hold numbers; cast them all to float64 at once.
numeric_columns = ['price_no_tax', 'price', 'tax', 'availability', 'reviews_count']
book_info_df[numeric_columns] = book_info_df[numeric_columns].astype('float64')

In [None]:
# Print the frame summary (column dtypes and non-null counts), e.g. to check
# the column types after the conversion.
book_info_df.info()

In [77]:
# Number of books with an empty description.
ans1 = book_info_df[book_info_df['description'] == ""].shape[0]
# Number of books with a non-zero tax.
ans2 = book_info_df[book_info_df['tax'] > 0].shape[0]
# Number of five-star books. The stray line-continuation backslash that
# followed this statement (a SyntaxError) has been removed.
ans3 = book_info_df[book_info_df['rating'] == 5].shape[0]
# Mean price excluding tax.
ans4 = book_info_df['price_no_tax'].mean()

0