### Web Scraping
- Data is stored on a webserver
- A browser shows the content
- Most of what the server sends is JavaScript {;}, CSS {}, and HTML </>

| JS, CSS, HTML | -> Python -> Data
- Gather data from the webserver inside our Python script


In [2]:
# One-time setup: uncomment the lines below to install the scraping dependencies.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install lxml            # parser backend handed to BeautifulSoup further down
# %pip install requests        # HTTP client used to download pages and images
# %pip install beautifulsoup4  # Beautiful Soup, imported as `bs4`; the PyPI package named "bs4" is only a placeholder that pulls this in

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py) ... [?25ldone
[?25h  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1256 sha256=179351f9288d4443cfc979ab6773f84a5d362933881ea8ba0d42ff6056acc8d4
  Stored in directory: /Users/pnatzz/Library/Caches/pip/wheels/d4/c8/5b/b5be9c20e5e4503d04a6eac8a3cd5c2393505c29f02bea0960
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [3]:
# import libraries
import pandas as pd

In [4]:
# read_html returns a list of DataFrames, one for each table it parses from the page
data = pd.read_html('https://en.wikipedia.org/wiki/Share_price')

In [5]:
data

[                             Company Price (US$)                Date  \
 0                 Berkshire Hathaway     518'400          April 2022   
 1                   Lindt & Sprüngli     122'000          April 2022   
 2                    Bastfaserkontor      11'435          March 2022   
 3                         Berlin Zoo       9'365           June 2021   
 4                  Financière Moncey       8'711      September 2021   
 5                 Zuger Kantonalbank       7'200            May 2022   
 6                Swiss National Bank       6'371  February 1st, 2022   
 7           Reederei Herbert Ekkenga       5'400       February 2022   
 8                          NVR, Inc.       5'330       November 2021   
 9            Berkeley Group Holdings       4'484          April 2022   
 10             Financière des Sucres       4'355          April 2022   
 11         Ultra Electronics Holding       4'330          April 2022   
 12                          Givaudan       4'017  

In [6]:
type(data)

list

In [7]:
data[0].head()

Unnamed: 0,Company,Price (US$),Date,Industry Notes,Country
0,Berkshire Hathaway,518'400,April 2022,holding company Most expensive share in the wo...,United States
1,Lindt & Sprüngli,122'000,April 2022,chocolate manufacture Most expensive European ...,Switzerland
2,Bastfaserkontor,11'435,March 2022,small real estate company Company name: See „b...,Germany
3,Berlin Zoo,9'365,June 2021,zoo 4000 shares in circulation.,Germany
4,Financière Moncey,8'711,September 2021,holding company; specializing in urban public ...,France


In [8]:
data[1].head()

Unnamed: 0,Authority control,Authority control.1
0,International,FAST
1,National,Germany Israel United States


In [9]:
# Take a copy: columns added to share_prices later would otherwise also
# appear in data[0], because plain assignment only creates an alias.
share_prices = data[0].copy()

In [10]:
# data wrangling
share_prices.shape

(19, 5)

In [11]:
share_prices

Unnamed: 0,Company,Price (US$),Date,Industry Notes,Country
0,Berkshire Hathaway,518'400,April 2022,holding company Most expensive share in the wo...,United States
1,Lindt & Sprüngli,122'000,April 2022,chocolate manufacture Most expensive European ...,Switzerland
2,Bastfaserkontor,11'435,March 2022,small real estate company Company name: See „b...,Germany
3,Berlin Zoo,9'365,June 2021,zoo 4000 shares in circulation.,Germany
4,Financière Moncey,8'711,September 2021,holding company; specializing in urban public ...,France
5,Zuger Kantonalbank,7'200,May 2022,state bank of the Canton of Zug,Switzerland
6,Swiss National Bank,6'371,"February 1st, 2022",central bank,Switzerland
7,Reederei Herbert Ekkenga,5'400,February 2022,tourist ships on the Zwischenahner Meer,Germany
8,"NVR, Inc.",5'330,November 2021,"home construction, mortgage banking",United States
9,Berkeley Group Holdings,4'484,April 2022,"house building, real estate",United Kingdom


In [12]:
share_prices.dtypes

Company           object
Price (US$)       object
Date              object
Industry Notes    object
Country           object
dtype: object

In [13]:
# convert data
# Strip the apostrophe thousands separators (518'400 -> 518400).
# regex=False makes the match literal on every pandas version; the default
# for `regex` is not the same in pandas 1.x and 2.x.
share_prices['Prices'] = share_prices["Price (US$)"].str.replace("'", "", regex=False)

In [14]:
share_prices.head()

Unnamed: 0,Company,Price (US$),Date,Industry Notes,Country,Prices
0,Berkshire Hathaway,518'400,April 2022,holding company Most expensive share in the wo...,United States,518400
1,Lindt & Sprüngli,122'000,April 2022,chocolate manufacture Most expensive European ...,Switzerland,122000
2,Bastfaserkontor,11'435,March 2022,small real estate company Company name: See „b...,Germany,11435
3,Berlin Zoo,9'365,June 2021,zoo 4000 shares in circulation.,Germany,9365
4,Financière Moncey,8'711,September 2021,holding company; specializing in urban public ...,France,8711


In [15]:
share_prices.dtypes

Company           object
Price (US$)       object
Date              object
Industry Notes    object
Country           object
Prices            object
dtype: object

In [16]:
share_prices['Prices'] = pd.to_numeric(share_prices['Prices']) # convert string to integer

In [17]:
share_prices.dtypes

Company           object
Price (US$)       object
Date              object
Industry Notes    object
Country           object
Prices             int64
dtype: object

### requests 
- get the html
- requests.get() is used to get a page

In [1]:
import requests

In [17]:
# requests has no default timeout, so set one to avoid hanging on a stalled connection
response = requests.get('http://www.example.com/', timeout=10)
# raise on a 4xx/5xx answer instead of carrying on with an error page
response.raise_for_status()

In [18]:
type(response)

requests.models.Response

In [19]:
response.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [20]:
print(response.text)

<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domai

### BeautifulSoup
- Creates a "soup" object that contains all the "ingredients" of the webpage

In [21]:
import bs4

In [22]:
soup = bs4.BeautifulSoup(response.text, "lxml")

In [23]:
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domain</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 2em;
        background-color: #fdfdff;
        border-radius: 0.5em;
        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        div {
            margin: 0 auto;
            width: auto;
        }
    }
    </style>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples

In [25]:
# select title of the page

title = soup.select('title')
title

[<title>Example Domain</title>]

In [26]:
title[0]

<title>Example Domain</title>

In [27]:
title[0].getText()

'Example Domain'

In [28]:
len(title)

1

### How to find CSS selector in Chrome browser
- To scrape a specific element we need its CSS selector (an id or a class)
- Right-click the element on the page and choose "Inspect" to see its tag, id, and class

In [29]:
# scrap all elements of a class

import requests
import bs4

In [30]:
url = 'https://en.wikipedia.org/wiki/Share_price'
response = requests.get(url, timeout=10)  # requests has no default timeout; do not wait forever
response.raise_for_status()  # stop on a 4xx/5xx answer rather than parsing an error page
soup = bs4.BeautifulSoup(response.text, 'lxml')  # parse the whole document with the lxml backend
soup

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-enabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width-content-enabled" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Share price - Wikipedia</title>
<script>document.documentElement.className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-language-alert-in-sidebar-enabled vector-feature-sticky-header-disabled vector-feature-page-tools-enabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-enabled vector-feature-main-menu-pinned-disabled vector-feature-limited-width-enabled vector-feature-limited-width

In [31]:
soup.select('div') # get all elements with the <div> tag

[<div class="mw-page-container">
 <a class="mw-jump-link" href="#bodyContent">Jump to content</a>
 <div class="mw-page-container-inner">
 <input class="mw-checkbox-hack-checkbox" id="mw-sidebar-checkbox" type="checkbox"/>
 <header class="mw-header">
 <div class="vector-header-start">
 <nav aria-label="Site" class="vector-main-menu-landmark" role="navigation">
 <div class="vector-menu vector-dropdown vector-menu-dropdown vector-main-menu-dropdown mw-ui-icon-flush-left mw-ui-icon-flush-right" id="vector-main-menu-dropdown">
 <input aria-haspopup="true" aria-label="Main menu" class="vector-menu-checkbox" data-event-name="ui.dropdown-vector-main-menu-dropdown" id="vector-main-menu-dropdown-checkbox" role="button" type="checkbox"/>
 <label aria-hidden="true" class="vector-menu-heading mw-checkbox-hack-button mw-ui-button mw-ui-quiet mw-ui-icon-element" for="vector-main-menu-dropdown-checkbox" id="vector-main-menu-dropdown-label">
 <span class="mw-ui-icon mw-ui-icon-menu mw-ui-icon-wikimedia

In [32]:
soup.select('div span') # get all elements <span> that are within a <div> tag

[<span class="mw-ui-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span>,
 <span class="vector-menu-heading-label">Main menu</span>,
 <span>Main page</span>,
 <span>Contents</span>,
 <span>Current events</span>,
 <span>Random article</span>,
 <span>About Wikipedia</span>,
 <span>Contact us</span>,
 <span>Donate</span>,
 <span>Help</span>,
 <span>Learn to edit</span>,
 <span>Community portal</span>,
 <span>Recent changes</span>,
 <span>Upload file</span>,
 <span class="mw-logo-container">
 <img alt="Wikipedia" class="mw-logo-wordmark" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"/>
 <img alt="The Free Encyclopedia" class="mw-logo-tagline" height="13" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" style="width: 7.3125em; height: 0.8125em;" width="117"/>
 </span>,
 <span>Search</span>,
 <span>Create account</span>,
 <span>Log in</span>,
 <span class="mw-ui-icon mw-ui-icon-ellipsis mw-ui-icon-wikimedia-ellipsis"></

In [33]:
soup.select('.mw-headline')

[<span class="mw-headline" id="Behaviour_of_share_prices">Behaviour of share prices</span>,
 <span class="mw-headline" id="Share_prices_in_the_United_States">Share prices in the United States</span>,
 <span class="mw-headline" id="Most_expensive_shares">Most expensive shares</span>,
 <span class="mw-headline" id="List_of_publicly_traded_shares">List of publicly traded shares</span>,
 <span class="mw-headline" id="History">History</span>,
 <span class="mw-headline" id="See_also">See also</span>,
 <span class="mw-headline" id="References">References</span>]

In [34]:
# print all headline

for headline in soup.select('.mw-headline'):
    print(headline.getText())

Behaviour of share prices
Share prices in the United States
Most expensive shares
List of publicly traded shares
History
See also
References


In [40]:
# get image

image_info = soup.select('.thumbimage')
image_info

[<img class="thumbimage" data-file-height="3441" data-file-width="3582" decoding="async" height="211" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg/220px-Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg/330px-Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg/440px-Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg 2x" width="220"/>]

In [41]:
len(image_info)

1

In [42]:
image_info[0]

<img class="thumbimage" data-file-height="3441" data-file-width="3582" decoding="async" height="211" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg/220px-Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg/330px-Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg/440px-Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg 2x" width="220"/>

In [43]:
for image in image_info:
    print(image['src'])

//upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg/220px-Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg


In [51]:
def get_image(url, timeout=10):
    """Download an image and return its raw bytes.

    Args:
        url: Address of the image. A protocol-relative address
            (``//host/path``, as printed from the ``src`` attributes above)
            gets an ``https:`` prefix; any other value is requested as given.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The response body (``response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Only protocol-relative URLs need a scheme; prefixing an absolute URL
    # would produce an invalid 'https:https://...' address.
    if url.startswith('//'):
        https_url = 'https:' + url
    else:
        https_url = url
    response = requests.get(https_url, timeout=timeout)
    # Fail loudly instead of returning an error page as if it were image data.
    response.raise_for_status()
    return response.content

def save_image(content, image_name):
    """Write the bytes in ``content`` to the file ``image_name``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with open(image_name, mode='wb') as image_file:  # 'wb' = write binary
        image_file.write(content)

In [53]:
get_image('//upload.wikimedia.org/wikipedia/commons/thumb/0/0d/Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg/220px-Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg')

b'\xff\xd8\xff\xe2\x02\x1cICC_PROFILE\x00\x01\x01\x00\x00\x02\x0clcms\x02\x10\x00\x00mntrRGB XYZ \x07\xdc\x00\x01\x00\x19\x00\x03\x00)\x009acspAPPL\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf6\xd6\x00\x01\x00\x00\x00\x00\xd3-lcms\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\ndesc\x00\x00\x00\xfc\x00\x00\x00^cprt\x00\x00\x01\\\x00\x00\x00\x0bwtpt\x00\x00\x01h\x00\x00\x00\x14bkpt\x00\x00\x01|\x00\x00\x00\x14rXYZ\x00\x00\x01\x90\x00\x00\x00\x14gXYZ\x00\x00\x01\xa4\x00\x00\x00\x14bXYZ\x00\x00\x01\xb8\x00\x00\x00\x14rTRC\x00\x00\x01\xcc\x00\x00\x00@gTRC\x00\x00\x01\xcc\x00\x00\x00@bTRC\x00\x00\x01\xcc\x00\x00\x00@desc\x00\x00\x00\x00\x00\x00\x00\x03c2\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00

In [54]:
# Download every selected image and store it in the working directory,
# named after the last path segment of its src URL.
for image in image_info:
    image_name = image['src'].rsplit('/', 1)[-1]  # text after the final '/'
    print(image_name)

    content = get_image(image['src'])
    save_image(content, image_name)

220px-Stock_Price_Listing_Numbers_on_a_Korean_Newspaper.jpg
