# Scrap Proj

## Goal

Extract the following book information from the online library:

-  title
-  star rating
-  price
-  availability

Organize data in a structured format and save it in a CSV file.

## Intro

This project is carried out in two independent ways: with BeautifulSoup, a library for scraping static websites, and with Selenium, a library for scraping dynamic websites.

The outputs of the two approaches differ.<br/>
In the 'availability' field, the Selenium output shows the total number of items that are currently in stock.

## CODE

In [None]:
import os, zipfile
import re
import pandas as pd
import requests

In [None]:
from bs4 import BeautifulSoup
from time import sleep

Run this cell on Colab to install Selenium together with Chrome and chromedriver.

In [None]:
%%shell
# Set up for running selenium in Google Colab
## You don't need to run this code if you do it in Jupyter notebook, or other local Python setting
# NOTE: the %%shell cell magic must stay on the first line of the cell; IPython only
# recognises a cell magic there, so the comments live below it as shell comments.
sudo apt -y update
sudo apt install -y wget curl unzip
wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb
dpkg -i libu2f-udev_1.1.4-1_all.deb
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
dpkg -i google-chrome-stable_current_amd64.deb
# Ask the chromedriver storage bucket which release it advertises as latest, then fetch that build.
CHROME_DRIVER_VERSION=`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`
wget -N https://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P /tmp/
unzip -o /tmp/chromedriver_linux64.zip -d /tmp/
chmod +x /tmp/chromedriver
mv /tmp/chromedriver /usr/local/bin/chromedriver
pip install selenium

In [None]:
# Install the package that is imported further down as `chromedriver_autoinstaller`.
!pip install chromedriver-autoinstaller

In [None]:
from selenium import webdriver
import chromedriver_autoinstaller

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [None]:
# Absolute path for the file name "scrap_proj.ipynb", resolved against the current working directory.
# NOTE(review): notebook_path is not referenced anywhere else in the visible source — confirm it is still needed.
notebook_path = os.path.abspath("scrap_proj.ipynb")

In [None]:
def literal_to_number(n):
    """
    Convert the capitalised English word for a number between zero and five
    (for example 'Three') into the matching integer.

    A KeyError is raised when the word is not one of the six known ones.
    """

    # The position of each word in the tuple is the value it stands for.
    words = ('Zero', 'One', 'Two', 'Three', 'Four', 'Five')
    lookup = {word: value for value, word in enumerate(words)}

    return lookup[n]

## Just Soup

In [None]:
headers = {'User-Agent':'Mozilla/5.0'}
books = []

# Walk the catalogue page by page (pages 1 to 50) and collect one dict per book.
for x in range(1,51):

    urlx = f'https://books.toscrape.com/catalogue/page-{x}.html'

    try:
        # A timeout keeps a single stalled connection from hanging the whole run.
        response = requests.get(urlx, headers=headers, timeout=30)
    except requests.RequestException as e:
        # Report network failures like bad status codes, so the books already
        # collected are kept and the remaining pages are still attempted.
        print(f'ERROR: Something wrong in page {x}. REQUEST FAILED: {e}')
        continue

    if response.status_code != 200:
        print(f'ERROR: Something wrong in page {x}. STATUS: {response.status_code}')
        continue

    # Parse the raw bytes so BeautifulSoup detects the document encoding itself.
    # response.text depends on the charset announced in the HTTP headers.
    # NOTE(review): presumably the site sends no charset, which would garble the
    # pound sign in the price — confirm against a live response.
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all('article', class_='product_pod')

    for art in articles:

        books.append({
            'title':art.find('h3').a['title'],
            # The second CSS class of the star-rating paragraph is stored as the rating.
            'rating':art.find('p', class_='star-rating').get('class')[1],
            'price':art.find('p', class_='price_color').text,
            'availability':art.find('p', class_='instock').text
        })

    print(f'page {urlx} read')
    # Pause between successful page requests.
    sleep(2)


In [None]:
# Normalise the raw scraped values of every book in place.
for b in books:

    # The rating is scraped as a word (e.g. 'Three'); store it as 'N/5'.
    rating = literal_to_number(b['rating'])
    b['rating'] = f'{rating}/5'

    # Drop everything in front of the first digit. Unlike deleting exactly one
    # character, this also copes with a mis-decoded two-character currency prefix
    # and leaves a price that is already purely numeric untouched.
    b['price'] = re.sub(r"^\D+", "", b['price'])
    b['availability'] = b['availability'].strip()

### Save csv

In [None]:
# Write the collected books to books.csv with a fixed column order and no index column.
pd.DataFrame(
    books,
    columns=['title', 'rating', 'price', 'availability'],
).to_csv('books.csv', index=False)

In [None]:
# Colab helper module used to download the generated CSV file.
# NOTE(review): presumably this cell is only meant to be run inside Google Colab — confirm.
from google.colab import files

# Download books.csv through the Colab `files` helper.
files.download('books.csv')

## Just Selenium

WARNING: The Selenium section may not work properly on Colab because of a chromedriver exception.

Optional method to download chromedriver (version 114.0.5735.90) for win32 and linux64

In [None]:
def check_service(system):
    """
    Return a ChromeService that points at a local chromedriver 114.0.5735.90.

    When the driver archive is not yet present in the current directory it is
    downloaded and extracted there first.

    system: 0 selects the linux64 driver, 1 selects the win32 driver. Any other
            value prints a hint and falls back to `binary_path` (see note below).

    Returns a webdriver.ChromeService built with the path of the driver
    executable ('./chromedriver' or './chromedriver.exe').
    """

    if system == 0:
        download = 'https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_linux64.zip'
        path_service = './chromedriver_linux64.zip'
        path_driver = './chromedriver'

    elif system == 1:
        download = 'https://chromedriver.storage.googleapis.com/114.0.5735.90/chromedriver_win32.zip'
        path_service = './chromedriver_win32.zip'
        path_driver = './chromedriver.exe'

    else:
        print("We don't have a service for you. Try with chromedriver_py.")
        # NOTE(review): `binary_path` is not defined in this function or in the
        # visible imports; presumably the caller is expected to provide it
        # (e.g. from chromedriver_py) — confirm, otherwise this raises NameError.
        return webdriver.ChromeService(executable_path=binary_path)

    if not os.path.exists(path_service):

        # A timeout keeps a stalled download from blocking the notebook forever.
        response = requests.get(download, timeout=60)

        if response.status_code == 200:
            with open(path_service, 'wb') as file:
                file.write(response.content)

            with zipfile.ZipFile(path_service, 'r') as zip_ref:
                zip_ref.extractall('./')

            print(f"Service not found. {path_service} created.")

        else:

            print("service download not possible.")

    else:

        print("service found.")

    # Mark the linux driver as executable only once it exists on disk: doing it
    # before the download/extraction step would act on a file that is not there yet.
    if system == 0 and os.path.exists(path_driver):
        os.chmod(path_driver, os.stat(path_driver).st_mode | 0o111)

    return webdriver.ChromeService(executable_path=path_driver)


In [None]:
# Imported in this cell because the except clause below needs the name; without
# it any failure inside the try block would surface as a NameError instead.
from selenium.common.exceptions import SessionNotCreatedException

url = 'https://books.toscrape.com'

try:

    # setup chrome options
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless') # ensure GUI is off
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')

    # set path to chromedriver as per your configuration
    chromedriver_autoinstaller.install()

    # set up the webdriver
    driver = webdriver.Chrome(options=chrome_options)

except SessionNotCreatedException:

    print("Something wrong. Session not created.")

except ConnectionRefusedError:

    print("Something wrong. Connection has been refused.")

else:

    # Navigate only when the driver was actually created; after a failed
    # set-up there is no usable `driver` from this cell to call.
    driver.get(url)

In [None]:
books_list = []

try:

    # Each pass of the loop handles one catalogue page, then follows its "next" link.
    while True:

        try:
            # Every <img> on the page is treated as a clickable book entry.
            cover_count = len(driver.find_elements(By.TAG_NAME, 'img'))

            for index in range(cover_count):

                # Elements located before a navigation are not reused: the list is
                # looked up again on every iteration and indexed by position.
                art = driver.find_elements(By.TAG_NAME, 'img')[index]

                sleep(1)
                art.click()

                # Read the four fields from the page opened by the click.
                b = {
                    'title': driver.find_element(By.TAG_NAME,'h1').text,
                    'rating':driver.find_element(By.CLASS_NAME, 'star-rating').get_attribute('class').split(' ')[1],
                    'price':driver.find_element(By.CLASS_NAME, 'price_color').text,
                    'availability':driver.find_element(By.CLASS_NAME,'availability').text
                }
                books_list.append(b)

                sleep(1)
                driver.back()
                sleep(1)

            print(f"Page {driver.current_url} read.")

            # find_elements returns an empty list instead of raising, so a page
            # without a "next" element ends the run cleanly rather than being
            # reported as an error.
            next_buttons = driver.find_elements(By.CLASS_NAME, 'next')
            if not next_buttons:
                print("No next page found, scraping completed.")
                break

            next_buttons[0].find_element(By.XPATH, ".//a").click()

            print(f"Landed to page {driver.current_url}")

        except Exception as e:

            # Best-effort scrape: keep what was collected so far and stop.
            print("Something wrong, scraping interrupted.")
            print("Check the last page read.")
            print(f"Reason: {e!r}")
            break

finally:

    # Close the browser even when the run is interrupted from outside the loop.
    driver.quit()

In [None]:
# Normalise the raw scraped values of every book in place.
for b in books_list:

    # The rating is scraped as a word (e.g. 'Three'); store it as 'N/5'.
    rating = literal_to_number(b['rating'])
    b['rating'] = f'{rating}/5'

    # Drop everything in front of the first digit, so the currency prefix is
    # removed however many characters it takes and a bare number stays intact.
    b['price'] = re.sub(r"^\D+", "", b['price'])

    # Keep only the first number found in the availability text. When there is
    # no number, fall back to the trimmed text instead of failing on a None match.
    stock = re.search(r'\d+', b['availability'])
    if stock:
        b['availability'] = f'{stock.group()} left in stock.'
    else:
        b['availability'] = b['availability'].strip()

### Save csv

In [None]:
# Write the Selenium results to books.csv with a fixed column order and no index column.
pd.DataFrame(
    books_list,
    columns=['title', 'rating', 'price', 'availability'],
).to_csv('books.csv', index=False)