### Web Scraping in Python

In [2]:
# Import the required libraries

import requests
from bs4 import BeautifulSoup

import pandas as pd

In [3]:
# Fetch the Books to Scrape landing page and parse it into a BeautifulSoup tree.

url = 'http://books.toscrape.com/index.html'

# Request the page. requests applies no timeout by default, so set one to
# keep this cell from hanging forever if the server never answers.
response = requests.get(url, timeout=10)

# Stop here on a 4xx/5xx reply instead of silently parsing an error page.
response.raise_for_status()

# Raw response body (bytes)
html = response.content

# Parse the HTML with Python's built-in parser
scraped = BeautifulSoup(html, 'html.parser')

In [4]:
# Display the parsed document (the bare last expression is rendered as the
# cell output) to eyeball the page structure before querying it
scraped

<!DOCTYPE html>

<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-us"> <!--<![endif]-->
<head>
<title>
    All products | Books to Scrape - Sandbox
</title>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="24th Jun 2016 09:29" name="created"/>
<meta content="" name="description"/>
<meta content="width=device-width" name="viewport"/>
<meta content="NOARCHIVE,NOCACHE" name="robots"/>
<!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
<!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
<link href="static/oscar/favicon.ico" rel="shortcut icon"/>
<link href="static/oscar/css/styles.css" rel="stylesheet" type="text/css"/>
<link href="s

In [5]:
# Read the <title> element and print its text without surrounding whitespace

title_tag = scraped.title
title_text = title_tag.get_text().strip()
print(title_text)

All products | Books to Scrape - Sandbox


In [6]:
# Collect every <article class="product_pod"> element on the page

articles = scraped.find_all('article', attrs={'class': 'product_pod'})

In [7]:
# Inspect what kind of object find_all returned for the article search

type(articles)

bs4.element.ResultSet

In [8]:
# Build the list of book titles: for each product pod, read the `title`
# attribute of the link inside its <h3> heading

article = [pod.h3.a['title'] for pod in articles]

article

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas"]

In [9]:
# Collect every <p class="price_color"> element (one price tag per book)

prices = scraped.find_all('p', attrs={'class': 'price_color'})

In [10]:
# Inspect what kind of object find_all returned for the price search
type(prices)

bs4.element.ResultSet

In [11]:
# Convert each price tag to a number: take its text, drop the leading
# pound sign and parse the remainder as a float

price = [float(tag.get_text().lstrip('£')) for tag in prices]

price

[51.77,
 53.74,
 50.1,
 47.82,
 54.23,
 22.65,
 33.34,
 17.93,
 22.6,
 52.15,
 13.99,
 20.66,
 17.46,
 52.29,
 35.02,
 57.25,
 23.88,
 37.59,
 51.33,
 45.17]

In [12]:
# Collect the availability paragraph of every book on the page.
# Searching for the single CSS class 'availability' (instead of the exact
# attribute string 'instock availability') does not depend on the order of
# the classes and does not silently drop a book that is not in stock, which
# would leave this list shorter than the title and price lists.
# NOTE(review): assumes each product pod carries exactly one such paragraph.

book_available = scraped.find_all('p', class_='availability')

In [13]:
# Turn each availability paragraph into its text with the surrounding
# whitespace removed, then show the resulting list

book = [tag.get_text().strip() for tag in book_available]

book

['In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock',
 'In stock']

In [14]:
# Collect the product pods; the link to each book's page is read from them
# in the next cell

websites = scraped.find_all('article', attrs={'class': 'product_pod'})

In [15]:
# Read the `href` of the link inside each pod's <h3> heading and show the list

website = [pod.h3.a['href'] for pod in websites]

website

['catalogue/a-light-in-the-attic_1000/index.html',
 'catalogue/tipping-the-velvet_999/index.html',
 'catalogue/soumission_998/index.html',
 'catalogue/sharp-objects_997/index.html',
 'catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
 'catalogue/the-requiem-red_995/index.html',
 'catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html',
 'catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html',
 'catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html',
 'catalogue/the-black-maria_991/index.html',
 'catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html',
 'catalogue/shakespeares-sonnets_989/index.html',
 'catalogue/set-me-free_988/index.html',
 'catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html',
 'catalogue/rip-it-up-and-start-again_986/index.html',
 'catalogue/our-band-could-be-your-life-scene

In [16]:
# Combine the title, availability, link and price lists into one pandas
# DataFrame, one column per list

books = pd.DataFrame(
    data=dict(
        article_name=article,
        in_stock=book,
        has_website=website,
        price=price,
    )
)

In [17]:
# Display the assembled DataFrame (the bare last expression is rendered as
# the cell output)

books

Unnamed: 0,article_name,in_stock,has_website,price
0,A Light in the Attic,In stock,catalogue/a-light-in-the-attic_1000/index.html,51.77
1,Tipping the Velvet,In stock,catalogue/tipping-the-velvet_999/index.html,53.74
2,Soumission,In stock,catalogue/soumission_998/index.html,50.1
3,Sharp Objects,In stock,catalogue/sharp-objects_997/index.html,47.82
4,Sapiens: A Brief History of Humankind,In stock,catalogue/sapiens-a-brief-history-of-humankind...,54.23
5,The Requiem Red,In stock,catalogue/the-requiem-red_995/index.html,22.65
6,The Dirty Little Secrets of Getting Your Dream...,In stock,catalogue/the-dirty-little-secrets-of-getting-...,33.34
7,The Coming Woman: A Novel Based on the Life of...,In stock,catalogue/the-coming-woman-a-novel-based-on-th...,17.93
8,The Boys in the Boat: Nine Americans and Their...,In stock,catalogue/the-boys-in-the-boat-nine-americans-...,22.6
9,The Black Maria,In stock,catalogue/the-black-maria_991/index.html,52.15
