# Scrap Proj

## Goal

Extract the following book information from the online library:

-  title
-  star rating
-  price
-  availability

Organize data in a structured format and save it in a CSV file.

## Intro

This project is implemented in two independent, parallel ways: with BeautifulSoup, a library for scraping static websites, or with Selenium, a library for driving dynamic websites.

The two approaches produce different output.<br/>
In the 'availability' field, the Selenium output shows the number of items currently in stock.

## CODE

In [None]:
import os, zipfile
import re
import pandas as pd
import requests
from time import sleep

In [None]:
# Absolute path of the notebook file, resolved against the current working directory.
# NOTE(review): not referenced by any other cell visible here - confirm it is still needed.
notebook_path = os.path.abspath("scrap_proj.ipynb")

In [None]:
def literal_to_number(n):
    """Convert an English number word between 'zero' and 'five' to an integer.

    The lookup ignores letter case and surrounding whitespace, so callers do
    not need to lower-case the word first (``'Three'`` and ``'three'`` both
    give ``3``).

    Args:
        n: The number word to convert.

    Returns:
        The matching integer in the range 0-5.

    Raises:
        KeyError: If ``n`` is not one of the supported number words.
    """
    numbers = {
        'zero': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
    }
    # Only strings are normalised; any other value goes straight to the
    # lookup so an unsupported input still surfaces as a KeyError.
    key = n.strip().lower() if isinstance(n, str) else n
    return numbers[key]

In [None]:
# Quick check of the helper: the mapping gives 0 for 'zero'.
literal_to_number('zero')

## Just Soup

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Scrape catalogue pages 1-50 with requests + BeautifulSoup and collect one
# dict per book (title, rating, price, availability) in `books`.
headers = {'User-Agent': 'Mozilla/5.0'}
books = []

for x in range(1, 51):
    urlx = f'https://books.toscrape.com/catalogue/page-{x}.html'

    try:
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(urlx, headers=headers, timeout=10)
    except requests.RequestException as e:
        # A network failure on one page should not abort the whole run.
        print(f'ERROR: Something wrong in page {x}. REQUEST FAILED: {e}')
        continue

    if response.status_code != 200:
        print(f'ERROR: Something wrong in page {x}. STATUS: {response.status_code}')
        continue

    # Parse the raw bytes so BeautifulSoup detects the charset itself;
    # response.text depends on requests' guess, which can garble the
    # currency symbol when the server does not declare an encoding.
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('article', class_='product_pod')

    for art in articles:

        b = {
            'title': art.find('h3').a['title'],
            # second class token of the star-rating paragraph
            'rating': art.find('p', class_='star-rating').get('class')[1],
            'price': art.find('p', class_='price_color').text,
            'availability': art.find('p', class_='instock').text
        }
        books.append(b)

    print(f"page {urlx} read")
    # Pause between pages to avoid hammering the site.
    sleep(2)


In [None]:
# Normalise the scraped strings of every book in place.
# Run this cell once per scrape: a second pass would fail because the rating
# is no longer a number word after it has been rewritten as 'N/5'.
for b in books:

    rating = literal_to_number(b['rating'].lower())
    b['rating'] = f'{rating}/5'
    # Drop everything before the first digit (currency symbol and any stray
    # characters in front of it) instead of blindly removing one character.
    b['price'] = re.sub(r"^\D+", "", b['price'])
    b['availability'] = b['availability'].strip()

### Save csv

In [None]:
# Write the BeautifulSoup results to disk, one row per book, without the
# DataFrame index column.
soup_columns = ['title', 'rating', 'price', 'availability']
soup_frame = pd.DataFrame(books, columns=soup_columns)
soup_frame.to_csv('books.csv', index=False)

## Just Selenium

In [None]:
!pip install chromedriver-py

In [None]:
from chromedriver_py import binary_path
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Imported here because the except clause below needs the name in scope.
from selenium.common.exceptions import SessionNotCreatedException

url = 'https://books.toscrape.com'

# Start a Chrome session using the driver binary shipped by chromedriver-py.
try:
    service = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=service)
except SessionNotCreatedException:
    print("Something wrong. Session not created.")
    # Without a session there is no `driver` to work with, so stop here
    # with the real error rather than failing later on an undefined name.
    raise
else:
    driver.get(url)

In [None]:
# Crawl the catalogue with Selenium: from each listing page open every book's
# detail page, record its data in `books_list`, go back, and finally follow
# the "next" link until no such link exists.
books_list = []

try:
    while True:
        # Only the count is kept: element references obtained here become
        # stale once the browser navigates away from the listing.
        n_thumbnails = len(driver.find_elements(By.TAG_NAME, 'img'))

        for index in range(n_thumbnails):

            # Re-locate the images after every driver.back() and pick the
            # one at the current position.
            art = driver.find_elements(By.TAG_NAME, 'img')[index]
            art.click()

            b = {
                'title': driver.find_element(By.TAG_NAME, 'h1').text,
                # second token of the class attribute of the star-rating element
                'rating': driver.find_element(By.CLASS_NAME, 'star-rating').get_attribute('class').split(' ')[1],
                'price': driver.find_element(By.CLASS_NAME, 'price_color').text,
                'availability': driver.find_element(By.CLASS_NAME, 'availability').text
            }
            books_list.append(b)

            sleep(1)
            driver.back()
            sleep(1)

        print(f"Page {driver.current_url} read.")

        # find_elements returns an empty list instead of raising, so the
        # missing "next" button on the last page ends the crawl normally
        # rather than being reported as an error.
        next_buttons = driver.find_elements(By.CLASS_NAME, 'next')
        if not next_buttons:
            print("No further pages: scraping completed.")
            break
        next_buttons[0].find_element(By.XPATH, ".//a").click()

        print(f"Landed to page {driver.current_url}")
        sleep(2)

except Exception as e:
    # Best effort: keep what was collected so far, but say why it stopped.
    print("Something wrong, scraping interrupted.")
    print("Check the last page read.")
    print(f"Reason: {e!r}")
finally:
    # Always close the browser, including when the cell is interrupted.
    driver.quit()

In [None]:
# Normalise the strings collected by the Selenium crawl, in place.
# Run this cell once per scrape: a second pass would fail because the rating
# is no longer a number word after it has been rewritten as 'N/5'.
for b in books_list:

    # The rating is taken from a class attribute; lower-case it so it matches
    # the keys of the number-word mapping.
    rating = literal_to_number(b['rating'].lower())
    b['rating'] = f'{rating}/5'
    # Drop everything before the first digit (the currency symbol).
    b['price'] = re.sub(r"^\D+", "", b['price'])
    stock = re.search(r'\d+', b['availability'])
    if stock:
        b['availability'] = f'{stock.group()} left in stock.'
    else:
        # No quantity in the text: keep the original wording.
        b['availability'] = b['availability'].strip()

### Save csv

In [None]:
# Write the Selenium results to disk, one row per book, without the
# DataFrame index column.
# NOTE(review): this is the same file name the BeautifulSoup section writes
# to, so running both sections overwrites the first result - confirm intended.
selenium_columns = ['title', 'rating', 'price', 'availability']
selenium_frame = pd.DataFrame(books_list, columns=selenium_columns)
selenium_frame.to_csv('books.csv', index=False)