In [1]:
# HCS Workshop 2, Web Scraping

In [2]:
# Author: Will Cooper

In [7]:
# import requests package and set up page

import requests
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
print(page.status_code)
print(page.content)

200
b'<!DOCTYPE html>\n<html>\n    <head>\n        <title>A simple example page</title>\n    </head>\n    <body>\n        <p>Here is some simple content for this page.</p>\n    </body>\n</html>'


In [10]:
# import BeautifulSoup and make a "BeautifulSoup object"
# sudo apt-get install python-bs4

from bs4 import BeautifulSoup
soup = BeautifulSoup(page.content, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>


<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [19]:
# list soup children, title, etc. 

list(soup.children)
list(soup.title)
print(soup.title.parent.name)
print(soup.p)
print()

head
<p>Here is some simple content for this page.</p>



In [22]:
# more printing
html = list(soup.children)[2]
list(html.children)
body = list(html.children)[3]
list(body.children)
p = list(body.children)[1]
p.get_text()


'Here is some simple content for this page.'

In [23]:
# find function
soup.find('p')
soup.find('head')

<head>
<title>A simple example page</title>
</head>

In [6]:
# more sources? 
# https://beautiful-soup-4.readthedocs.io/en/latest/
# https://www.dataquest.io/blog/web-scraping-tutorial-python/

In [24]:
page = requests.get('http://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168')
soup = BeautifulSoup(page.content, 'html.parser')
seven_day = soup.find(id = 'seven-day-forecast')
forecast_items = seven_day.find_all(class_='tombstone-container')
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Today: Mostly sunny, with a high near 70. Light west northwest wind becoming west 5 to 10 mph in the afternoon. " class="forecast-icon" src="newimages/medium/sct.png" title="Today: Mostly sunny, with a high near 70. Light west northwest wind becoming west 5 to 10 mph in the afternoon. "/>
 </p>
 <p class="short-desc">
  Mostly Sunny
 </p>
 <p class="temp temp-high">
  High: 70 °F
 </p>
</div>


In [25]:
period = tonight.find(class_='period-name').get_text()
short_desc = tonight.find(class_='short-desc').get_text()
temp = tonight.find(class_='temp').get_text()
print(period)
print(short_desc)
print(temp)


Today
Mostly Sunny
High: 70 °F


In [42]:
# 'https://markets.businessinsider.com/stocks'
page = requests.get('https://markets.businessinsider.com/stocks')
soup = BeautifulSoup(page.content, 'html.parser')
stocks = soup.find(id='shares_topflop_StockPricesSharesTopFlop')
imte = stocks.find('a')

prices = stocks.find_all(class_='row-hover')


for price in prices:
    print(price.get_text())
    print('----------------')
    
names = []

for i in range(len(prices)):
    names.append(prices[i].find('a').get_text())

print(names)



Goldman Sachs Gr


205.73





1.95 %


3.93


2:11:48 PM


----------------


IBM


124.18





1.78 %


2.17


2:12:07 PM


----------------


Chevron


73.96





1.73 %


1.26


2:12:13 PM


----------------


Intel


52.47





1.51 %


0.78


2:11:39 PM


----------------


3M


165.20





1.51 %


2.45


2:12:06 PM


----------------


Home Depot


280.52





-0.56 %


-1.58


2:12:06 PM


----------------


Microsoft


208.66





-0.82 %


-1.72


2:12:33 PM


----------------


Merck


80.40





-1.03 %


-0.84


2:12:26 PM


----------------


Apple


115.25





-1.07 %


-1.25


2:12:24 PM


----------------


Boeing Co


165.74





-3.19 %


-5.46


2:12:30 PM


----------------
['Goldman Sachs Gr', 'IBM', 'Chevron', 'Intel', '3M', 'Home Depot', 'Microsoft', 'Merck', 'Apple', 'Boeing Co']


In [28]:
# regular expressions, searching by text 
# https://docs.python.org/3/library/re.html

import re

bruh = stocks.find_all('a', text = re.compile("Apple"))
print(bruh)


[<a href="/stocks/aapl-stock" title="Apple-stock">Apple</a>]


In [35]:
# https://stackoverflow.com/questions/47928608/how-to-use-beautifulsoup-to-parse-google-search-results-in-python
# quick example of using BeautifulSoup to Google for you 

import urllib
from bs4 import BeautifulSoup
import requests
import webbrowser

text = 'hello world'
text = urllib.parse.quote_plus(text)

url = 'https://google.com/search?q=' + text

response = requests.get(url)

with open('output.html', 'wb') as f:
    f.write(response.content)
webbrowser.open('output.html')

soup = BeautifulSoup(response.text, 'html.parser')

for g in soup.find_all(class_='BNeawe vvjwJb AP7Wnd'):
    print(g.get_text())
    print('---')


Lady Antebellum - Hello World - YouTube
---
"Hello, World!" program - Wikipedia
---
Hello, World! - Learn Python - Free Interactive Python Tutorial
---
Hello World · GitHub Guides
---
Hello World Studio
---
Hello World! - Manning
---
HelloWorld - Digital promotions & loyalty programs for the world's ...
---
Hello World! - GNU Project - Free Software Foundation (FSF)
---
Lesson: A Closer Look at the "Hello World!" Application (The Java ...
---
