In [119]:
import pandas as pd
from glob import glob ## to search and list downloaded files
from selenium import webdriver ## to get links and ROI pages
import numpy as np

from pandasql import sqldf ## make possible to run SQL statements over pandas dataframe equally a PostgreSQL or equivalent environment

import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

## Function to scrape data from Internet giving a URL source -> Save a txt file with all book links from URL

In [227]:
def getLinksPages(): # create chrome driver
    
    print('Scrapping from https://books.toscrape.com/ ...')
    
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("headless")

    path_to_chromedriver = "D:/tmp/chromedriver"

    driver = webdriver.Chrome(executable_path=path_to_chromedriver,options=chrome_options)

    # go to website
    links = open("./links.txt","w")
    
    driver.get(f"https://books.toscrape.com/index.html")
    sizeItems  = int(driver.find_elements_by_xpath(f"//body[@id='default']/div/div/div/div/form/strong[1]")[0].text) ### get number of total itens in bookstore
    pagNumber  = int(driver.find_elements_by_xpath(f"//body[@id='default']/div/div/div/div/form/strong[3]")[0].text) ### number of elements per page
    
    for i in range(1,int(sizeItems/pagNumber)+1):
        driver.get(f"https://books.toscrape.com/catalogue/page-{i}.html")

        size_of_list  = driver.find_elements_by_xpath(f"//body[@id='default']/div/div/div/div/section/div[2]/ol/li")
        
        for li in size_of_list[1:len(size_of_list)]:
            article = li.find_elements(By.TAG_NAME, "article")[0]
            h3 = article.find_element_by_xpath('.//preceding::h3[1]')
            a = h3.find_elements(By.TAG_NAME, "a")
            links.write(str(a[0].get_attribute('href'))+'\n')
           

    links.close()
    driver.close()
    
    return links.name

## Data manipulation and ETL: Functions to map attributes from html files -> Return a list with all extract books information

In [126]:
def mapAttributesFromScraping(linksPath):
    
    print('Data manipulation and ETL in progress...')

    links = open(linksPath,"r")
    
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("headless")

    path_to_chromedriver = "D:/tmp/chromedriver"

    driver = webdriver.Chrome(executable_path=path_to_chromedriver,options=chrome_options)
    
    books = []

    # go to website
    for link in links:
        
        driver.get(link)
        
        title = driver.find_elements_by_xpath("//div[@id='content_inner']/article/div/div[2]/h1")[0].text       

        price = float(driver.find_elements_by_xpath("//div[@id='content_inner']/article/div/div[2]/p")[0].text.split('£')[1])

        stock = int(driver.find_elements_by_xpath("//div[@id='content_inner']/article/div/div[2]/p")[1].text.split('In stock (')[1].split(' available)')[0])

        rating = driver.find_elements_by_xpath("//div[@id='content_inner']/article/div/div[2]/p")[2].get_property('attributes')[0]['textContent'].split('star-rating ')[1]
        
        if  (rating == "One"):   rating = 1
        elif(rating == "Two"):   rating = 2
        elif(rating == "Three"): rating = 3
        elif(rating == "Four"):  rating = 4
        elif(rating == "Five"):  rating = 5

        
        book = [title, price, rating, stock]

        if book:
            books.append(book)
            
    driver.close()
    return books

## Execute scraping and ETL functions -> Return a dataframe with Title, Price, Rating and Availability of each book

In [228]:
books = mapAttributesFromScraping(getLinksPages())

booksDF = pd.DataFrame(data=books, columns=["BookTitle", "Price", "Rating", "Availability"])

Unnamed: 0,BookTitle,Price,Rating,Availability
0,A Light in the Attic,51.77,3,22
1,Tipping the Velvet,53.74,1,20
2,Soumission,50.10,1,20
3,Sharp Objects,47.82,4,20
4,Sapiens: A Brief History of Humankind,54.23,5,20
...,...,...,...,...
945,Beyond Good and Evil,43.38,1,1
946,Alice in Wonderland (Alice's Adventures in Won...,55.53,1,1
947,"Ajin: Demi-Human, Volume 1 (Ajin: Demi-Human #1)",57.06,4,1
948,A Spy's Devotion (The Regency Spies of London #1),16.97,5,1


## What is the average price of books by rating (in stars)?

In [229]:
def avgPrinceByRating(booksTable):
    
    result = sqldf("""SELECT Rating, ROUND(AVG(price),2) as "AVG Price (£)" 
                          FROM booksTable 
                          GROUP BY Rating;""")
    
    return result

## Call function given the dataframe 
avgPrinceByRating(booksDF)

Unnamed: 0,Rating,AVG Price (£)
0,1,34.43
1,2,34.48
2,3,34.4
3,4,36.1
4,5,35.38


## How many books have 2 or less copies on a specific day?

In [236]:
def amountCopiesBooks(booksTable):
    result = sqldf("""SELECT  DATE() AS "Currency Date", COUNT(*) as "Books have 2 or less copies"
                        FROM booksTable 
                        WHERE Availability <=2 
                        """)
    
    return result

## Call function given the dataframe 
amountCopiesBooks(booksDF)

Unnamed: 0,Currency Date,Books have 2 or less copies
0,2022-05-17,106
