# A basic example of how to scrape and manipulate data with the BeautifulSoup (bs4) library

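This data comes from a <a href="https://archive.ics.uci.edu/ml/datasets/Auto%2BMPG">public source</a> with citation
"Dua, D. and Karra Taniskidou, E. (2017). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science."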

In [1]:
from bs4 import BeautifulSoup
import requests, re


In [2]:
# Commented-out shell command that serves ./Notebooks/SCRAPING/ over HTTP with Python's
# built-in http.server; the URL below assumes that server is already running on
# localhost:8000 — TODO confirm it is started before running the next cell
#!python3 -m http.server --directory ./Notebooks/SCRAPING/
PATH = 'http://localhost:8000/auto_mpg.html'

In [3]:
# Fetch the page. requests has no default timeout, so pass one explicitly: without it
# this cell would hang indefinitely if the local server is not running.
result = requests.get(PATH, timeout=10)
# HTTP status of the response (200 means the page was served successfully)
result.status_code

200

In [4]:
# Parse the response body into a navigable tree using Python's built-in HTML parser
soup = BeautifulSoup(result.text, 'html.parser')
# prettify() returns the parsed document as an indented string, one node per line
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Example Car Dataset
  </title>
  <style>
   body {
        background-color: rgb(0, 0, 0);
        color: rgb(211, 211, 211);
        font-family:'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
        margin: 0 20px;
      }
      a:link {
        color: rgb(211, 211, 211);
      }
      a:visited {
        color: rgb(211, 211, 211);
      }
      div.car_block {
        background-color: rgb(34, 34, 34);
        padding: 15px 20px;
        border-left: 4px solid rgb(232, 10, 137);
      }
      .car_name {
        color: rgb(211, 211, 211);
        font-size: large;
      }
      .mpg {
        font-weight:bold;
      }
      .from {
        color: rgb(106, 116, 116);
        font-size: small;
      }
      .weight {
        font-style: italic;
      }
  </style>
 </head>
 <body>
  <h1>
   Example Car Datasheet (1970-1982)
  </h1>
  <p>
   This data comes from a
   <a href="https://archive.ics.uci.edu/ml

## Navigate our scraped data

In [5]:
# the document's <title> tag, reached by attribute-style navigation
soup.title

<title>Example Car Dataset</title>

In [6]:
# .text gives just the string inside the tag, without the markup
soup.title.text

'Example Car Dataset'

In [7]:
# get_text is a method, so it must be called; it returns the same string as the .text property
soup.title.get_text()

'Example Car Dataset'

In [8]:
# the title's text content again (same lookup as In [6])
soup.title.text

'Example Car Dataset'

In [9]:
# <head> can be reached through <html> or directly from the soup: both paths give an equal tag
soup.html.head == soup.head

True

In [10]:
# .parent walks one level up the tree; .name is the tag name of that element
soup.head.parent.name

'html'

In [11]:
# text content of <body> with the tags stripped; sliced to 500 characters to keep the output short
soup.body.text[:500]

'\nExample Car Datasheet (1970-1982)\nThis data comes from a public source with citation \n"Dua, D. and Karra Taniskidou, E. (2017). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science." and \nis referenced in "Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Ka'

In [12]:
# first <h1> heading inside <body>
soup.body.h1

<h1>Example Car Datasheet (1970-1982)</h1>

In [13]:
# first <p> paragraph inside <body>
soup.body.p

<p>This data comes from a <a href="https://archive.ics.uci.edu/ml/datasets/Auto%2BMPG">public source</a> with citation 
"Dua, D. and Karra Taniskidou, E. (2017). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science." and 
is referenced in "Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.".
</p>

In [14]:
# two hops are needed to reach the <p> that follows <h1>: the first .next_sibling is
# presumably the whitespace text node between the two tags — TODO confirm
soup.body.h1.next_sibling.next_sibling

<p>This data comes from a <a href="https://archive.ics.uci.edu/ml/datasets/Auto%2BMPG">public source</a> with citation 
"Dua, D. and Karra Taniskidou, E. (2017). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science." and 
is referenced in "Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.".
</p>

In [15]:
# first anchor (<a>) tag inside <body>, i.e. a hyperlink
soup.body.a

<a href="https://archive.ics.uci.edu/ml/datasets/Auto%2BMPG">public source</a>

In [16]:
# .attrs exposes the tag's HTML attributes as a dict
soup.body.a.attrs

{'href': 'https://archive.ics.uci.edu/ml/datasets/Auto%2BMPG'}

In [17]:
# index the attribute dict to get the link target
soup.body.a.attrs['href']

'https://archive.ics.uci.edu/ml/datasets/Auto%2BMPG'

In [18]:
# find_all returns every matching tag (here: all links) as a list-like result
soup.find_all('a')

[<a href="https://archive.ics.uci.edu/ml/datasets/Auto%2BMPG">public source</a>]

In [19]:
# confirm that the page contains a single link
len(soup.find_all('a'))

1

In [20]:
# each item returned by find_all is a bs4 Tag object
type(soup.find_all('a')[0])

bs4.element.Tag

In [21]:
# car blocks carry their own ids (e.g. "car-1"), so find() can look one up by id
soup.find(id='car-1')

<div class="car_block" id="car-1"><span class="car_name">Chevrolet Chevelle Malibu</span> <span class="from">(1970, USA)</span> <br/> Achieves <span class="mpg">18.0 mpg</span> with <span class="cylinders">8</span> cylinders backed by <span class="horsepower">130</span> hp, 307.0 cubic inches of displacement, weighing <span class="weight">3,504</span> lbs with 0-60 mph acceleration in <span class="acceleration">12.0</span> seconds</div>

In [22]:
# find() returns a single Tag rather than a list
type(soup.find(id='car-1'))

bs4.element.Tag

In [23]:
# when no element matches, find() returns None instead of raising
soup.find(id='car-0') is None

True

In [24]:
# `class` is a reserved word in Python, hence the class_ keyword (trailing underscore);
# find() returns only the first matching <div>
soup.find('div', class_='car_block')

<div class="car_block" id="car-1"><span class="car_name">Chevrolet Chevelle Malibu</span> <span class="from">(1970, USA)</span> <br/> Achieves <span class="mpg">18.0 mpg</span> with <span class="cylinders">8</span> cylinders backed by <span class="horsepower">130</span> hp, 307.0 cubic inches of displacement, weighing <span class="weight">3,504</span> lbs with 0-60 mph acceleration in <span class="acceleration">12.0</span> seconds</div>

In [25]:
# NOTE: find() returns a single Tag, so len() here counts that div's child nodes
# (tags and text strings), NOT the number of car blocks on the page.
# To count the blocks, use len(soup.find_all('div', class_='car_block')).
len(soup.find('div', class_='car_block'))

16

In [26]:
# the first item of find_all() is the same block that find() returned
soup.find_all('div', class_='car_block')[0]

<div class="car_block" id="car-1"><span class="car_name">Chevrolet Chevelle Malibu</span> <span class="from">(1970, USA)</span> <br/> Achieves <span class="mpg">18.0 mpg</span> with <span class="cylinders">8</span> cylinders backed by <span class="horsepower">130</span> hp, 307.0 cubic inches of displacement, weighing <span class="weight">3,504</span> lbs with 0-60 mph acceleration in <span class="acceleration">12.0</span> seconds</div>

## Extracting information from a scraped div

In [27]:
# keep the first car block in a variable to explore how a single record is structured
div = soup.find_all('div', class_='car_block')[0]

In [28]:
# show the first block's HTML, indented with one node per line
print(div.prettify())

<div class="car_block" id="car-1">
 <span class="car_name">
  Chevrolet Chevelle Malibu
 </span>
 <span class="from">
  (1970, USA)
 </span>
 <br/>
 Achieves
 <span class="mpg">
  18.0 mpg
 </span>
 with
 <span class="cylinders">
  8
 </span>
 cylinders backed by
 <span class="horsepower">
  130
 </span>
 hp, 307.0 cubic inches of displacement, weighing
 <span class="weight">
  3,504
 </span>
 lbs with 0-60 mph acceleration in
 <span class="acceleration">
  12.0
 </span>
 seconds
</div>



In [29]:
# all of the block's text, concatenated with the markup removed
div.text

'Chevrolet Chevelle Malibu (1970, USA)  Achieves 18.0 mpg with 8 cylinders backed by 130 hp, 307.0 cubic inches of displacement, weighing 3,504 lbs with 0-60 mph acceleration in 12.0 seconds'

In [30]:
# stripped_strings yields each text fragment with surrounding whitespace removed
list(div.stripped_strings)

['Chevrolet Chevelle Malibu',
 '(1970, USA)',
 'Achieves',
 '18.0 mpg',
 'with',
 '8',
 'cylinders backed by',
 '130',
 'hp, 307.0 cubic inches of displacement, weighing',
 '3,504',
 'lbs with 0-60 mph acceleration in',
 '12.0',
 'seconds']

In [31]:
# list every <span> in the block to check whether each field has its own class
div.find_all('span')

[<span class="car_name">Chevrolet Chevelle Malibu</span>,
 <span class="from">(1970, USA)</span>,
 <span class="mpg">18.0 mpg</span>,
 <span class="cylinders">8</span>,
 <span class="horsepower">130</span>,
 <span class="weight">3,504</span>,
 <span class="acceleration">12.0</span>]

In [32]:
# each field has a distinct class, so find() with class_ targets one field directly
div.find('span', class_='mpg')

<span class="mpg">18.0 mpg</span>

In [33]:
# the value still needs text processing: the number is followed by the ' mpg' unit suffix
div.find('span', class_='mpg').text

'18.0 mpg'

In [34]:
# Displacement is not wrapped in its own <span>, so extract it from the block's text.
# Raw string: avoids invalid-escape warnings for \d. \d+\.\d+ matches digits, a literal
# decimal point, then digits (an unescaped . would match any character); the parentheses
# capture only the number that precedes " cubic inches".
re.findall(r'(\d+\.\d+) cubic inches', div.text)

['307.0']

## Using selectors as an alternative

In [35]:
# CSS selector for the element with id "car-4" (obtained in the browser with a right click
# on the element -> copy -> selector); select() returns a list of matches
soup.select('#car-4')

[<div class="car_block" id="car-4"><span class="car_name">Amc Rebel Sst</span> <span class="from">(1970, USA)</span> <br/> Achieves <span class="mpg">16.0 mpg</span> with <span class="cylinders">8</span> cylinders backed by <span class="horsepower">150</span> hp, 304.0 cubic inches of displacement, weighing <span class="weight">3,433</span> lbs with 0-60 mph acceleration in <span class="acceleration">12.0</span> seconds</div>]

In [43]:
# selectors can also describe a path: the span with class "mpg" that is a direct child
# of the first <div> directly under <body>
soup.select('body > div:nth-of-type(1) > span.mpg')

[<span class="mpg">18.0 mpg</span>]

In [46]:
# :nth-of-type() counts from 1, so index 0 matches nothing and the result is empty
len(soup.select('body > div:nth-of-type(0) > span.mpg')) == 0

True

In [38]:
# collect every car block on the page
car_blocks = soup.find_all('div', class_='car_block')
# NOTE(review): the trailing `?` is IPython introspection (shows the object's type and
# docstring), not plain Python — remove it before sharing the notebook
car_blocks?