Load in the necessary libraries

In [None]:
import requests
from bs4 import BeautifulSoup as bs

Load first page


In [None]:
# Load the webpage content; fail loudly on an HTTP error instead of
# silently parsing an error page, and never hang forever on a dead connection.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=30)
r.raise_for_status()

# Convert to a BeautifulSoup object. Naming the parser explicitly avoids
# GuessedAtParserWarning and keeps the parse tree the same on every machine.
soup = bs(r.content, "html.parser")

# Print out the HTML, indented one tag per line
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



Start scraping with the Beautiful Soup library

find and find_all

In [None]:
# find() stops at the first matching tag; find_all() collects every match.
first_header = soup.find(name="h2")
header = soup.find_all(name="h2")
header

[<h2>A Header</h2>, <h2>Another header</h2>]

In [None]:
# A list of tag names matches any of them, in document order.
first_header = soup.find(name=["h1", "h2"])
header = soup.find_all(name=["h1", "h2"])
header

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [None]:
# Attribute filters can be given as keyword arguments to find/find_all.
paragraph = soup.find_all("p", id="paragraph-id")
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# find() returns a tag, so each search can be run on the previous result
# to narrow the scope step by step.
body = soup.find(name="body")
div = body.find(name="div")
header = div.find(name="h1")
header

<h1>HTML Webpage</h1>

In [None]:
# find/find_all can also filter on a tag's text through the `string` argument;
# a compiled regex matches anywhere inside that text.
import re

some_pattern = re.compile("Some")
paragraphs = soup.find_all("p", string=some_pattern)
paragraphs

header_pattern = re.compile("(h|H)eader")
headers = soup.find_all("h2", string=header_pattern)
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

Select (CSS Selector)

In [None]:
# Show only the <body> subtree as a reference for the selectors that follow.
print(soup.find("body").prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [None]:
# Descendant combinator: every <p> nested anywhere inside a <div>.
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [None]:
# General sibling combinator: every <p> that follows an <h2> under the same parent.
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# `p#paragraph-id` targets the <p> with that id; the trailing `b` picks the
# <b> tags nested inside it.
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [None]:
# Child combinator: only <p> tags that are direct children of <body>.
paragraphs = soup.select("body > p")
print(paragraphs)

# select() can also be called on a single tag to search inside it only.
for paragraph in paragraphs:
    italics = paragraph.select("i")
    print(italics)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [None]:
# Attribute selector: grab every element, whatever its tag name, whose
# `align` attribute equals "middle".

soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

## Get different properties of HTML

In [18]:
# .string gives the text held directly by a tag.
header = soup.find(name="h2")
header.string

# When a tag has several children, use get_text() to gather the text of all of them.
div = soup.find(name="div")
div_text = div.get_text()
print(div_text)


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [23]:
# Attributes of a tag are read with dictionary-style indexing.
link = soup.find(name="a")
link["href"]

# select() returns a list, so take the first match before indexing into it.
paragraphs = soup.select("p#paragraph-id")
first_paragraph = paragraphs[0]
first_paragraph["id"]

'paragraph-id'

In [24]:
# Path syntax: tag names chained as attributes walk down the tree
# (body -> div -> h1); .string then reads the text of the <h1> reached.
soup.body.div.h1.string

'HTML Webpage'

In [26]:
# Navigating relatives: list every sibling tag that comes after the first
# <div> found inside <body>.
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

#### Exercises!

Go to https://keithgalli.github.io/web-scraping/webpage.html

In [None]:
# Load the webpage content; fail loudly on an HTTP error instead of
# silently parsing an error page, and never hang forever on a dead connection.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=30)
r.raise_for_status()

# Convert to a BeautifulSoup object. Naming the parser explicitly avoids
# GuessedAtParserWarning and keeps the parse tree the same on every machine.
webpage = bs(r.content, "html.parser")

# Print out the HTML, indented one tag per line
print(webpage.prettify())

## Grab all of the social links from webpage

Do this in at least 3 different ways

In [105]:
# Way 1: find the <ul class="socials"> lists, then collect the <a> tags inside them.
# find_all() returns a list of tags, so the anchors must be gathered from each
# list element (the original referenced an undefined name, `social`).
socials = webpage.find_all("ul", {"class": "socials"})
links = [anchor for social_list in socials for anchor in social_list.find_all("a")]
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [106]:
# Way 2: a single CSS selector for every <a> nested under <ul class="socials">.
links = webpage.select("ul.socials a")
actual_links = [anchor["href"] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [110]:
# Way 3: select through the list items instead, i.e. every <a> inside <li class="social">.
links = webpage.select("li.social a")
actual_links = [social_anchor["href"] for social_anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Scraping a table from the website

In [None]:
# General pattern for turning an HTML table into a DataFrame:
# one list of cell texts per <tr>, then build the frame once from all rows.
# (The original template had an unclosed parenthesis and relied on
# `table_rows` and `pd`, neither of which existed yet at this point.)
import pandas as pd

table_rows = webpage.select("table.hockey-stats")[0].find("tbody").find_all("tr")

l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [cell.text for cell in td]
    l.append(row)

# Column names are left to pandas here; pass columns=[...] once the headers are known.
pd.DataFrame(l)

In [133]:
import pandas as pd

# The stats table is the first element carrying the "hockey-stats" class.
table = webpage.select("table.hockey-stats")[0]

# Header cells live in <thead>; their text becomes the column names.
columns = table.find("thead").find_all("th")
column_names = [th.string for th in columns]

# Each <tr> in <tbody> becomes one row: the whitespace-stripped text of its <td> cells.
table_rows = table.find("tbody").find_all("tr")
l = [
    [cell.get_text().strip() for cell in table_row.find_all("td")]
    for table_row in table_rows
]

df = pd.DataFrame(l, columns=column_names)

# Show only the seasons that were actually played.
df.loc[df["Team"].ne("Did not play")]

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17,3,9,12,20,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9,1,1,2,2,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12,5,5,10,8,0.0,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8,5,10,15,8,,|,,,,,,,


## Grab the fun facts that have the word "is" in them

In [142]:
import re

# \b word boundaries match the standalone word "is" only; a bare "is" pattern
# would also hit words that merely contain those letters (e.g. "this").
is_word = re.compile(r"\bis\b")

fun_facts = webpage.select("ul.fun-facts li")

# Test the full text of each <li> so that a fact whose text is split across
# nested tags is still matched and is returned complete.
facts_with_is = [
    fun_fact.get_text() for fun_fact in fun_facts if is_word.search(fun_fact.get_text())
]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

# Use Beautiful Soup to download image from a website

In [182]:
#Use this code in a local IDE to work
import urllib.request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

# Set to True when running locally to actually write the image to disk.
DOWNLOAD_IMAGE = False

# Load the webpage content
url = "https://keithgalli.github.io/web-scraping/"
r = requests.get(urljoin(url, "webpage.html"), timeout=30)
r.raise_for_status()

# Convert to a BeautifulSoup object (explicit parser: no guessing, no warning)
webpage = bs(r.content, "html.parser")

# The first image inside the row/column layout is the one to download.
images = webpage.select("div.row div.column img")
image_url = images[0]['src']

# urljoin resolves relative, root-relative and absolute `src` values correctly,
# which plain string concatenation does not.
full_url = urljoin(url, image_url)

if DOWNLOAD_IMAGE:
    urllib.request.urlretrieve(full_url, "lake_como.jpg")

# Secret Message Exercise

In [181]:
from urllib.parse import urljoin

url = "https://keithgalli.github.io/web-scraping/"

# Every link inside the "block" div leads to a page holding one secret word.
secrets = webpage.select("div.block li a")

# Iterate over the links themselves rather than a hard-coded range(0, 10),
# so the loop works for however many links the page actually contains.
for secret_link in secrets:
    r = requests.get(urljoin(url, secret_link["href"]), timeout=30)
    r.raise_for_status()
    sec = bs(r.content, "html.parser")
    b = sec.find("p", attrs={"id": "secret-word"})
    if b is None:
        # Page without a secret word: skip it instead of crashing on None.
        continue
    secret_word = b.string
    print(secret_word)

Make
sure
to
smash
that
like
button
and
subscribe
!!!
