### <font color = 'orange'>load in the necessary libraries 

In [4]:
import re

import requests
from bs4 import BeautifulSoup as bs

### <font color = 'orange'>load the first page 

In [7]:
# load the webpage content; the timeout stops the cell from hanging forever
# and raise_for_status() fails fast instead of parsing an HTTP error page
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# convert to a BeautifulSoup object; name the parser explicitly so the parse
# tree does not depend on which optional parsers happen to be installed
soup = bs(r.content, "html.parser")

# print out our html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



### <font color = 'orange'> start scraping using beautiful soup library 

In [30]:
# find() hands back only the first tag that matches
first_header = soup.find(name="h2")
print(first_header, end="\n\n****\n")

# find_all() gathers every matching tag
headers = soup.find_all(name="h2")
print(headers)

<h2>A Header</h2>

****
[<h2>A Header</h2>, <h2>Another header</h2>]


In [31]:
# a list of tag names matches any one of the listed tags

first_header = soup.find(name=["h1", "h2"])
headers = soup.find_all(name=["h1", "h2"])

# one print call: the separator sits between the two results
print(first_header, headers, sep="\n\n****\n")

<h1>HTML Webpage</h1>

****
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [32]:
# find/find_all can also filter on attribute values through `attrs`

paragraph = soup.find_all(name="p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [34]:
# find/find_all calls can be nested:
# each lookup searches only inside the previous result, narrowing step by step

body = soup.find("body")
div = body.find("div")
header = div.find("h1")

# show each level of the narrowing, outermost first
print(body, end="\n\n****\n")
print(div, end="\n\n****\n")
print(header)

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

****
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

****
<h1>HTML Webpage</h1>


In [36]:
# `string=` compares against the tag's complete text, so the fragment "Some"
# matches nothing; keep that result under its own name instead of overwriting
# it, and check the claim rather than leaving it implicit
partial_match = soup.find_all("p", string="Some")
assert not partial_match

# the paragraph's full text does match
paragraphs = soup.find_all("p", string="Some italicized text")
paragraphs


[<p><i>Some italicized text</i></p>]

## Exact matching is not ideal, so we use regex to find what we want: <br> part of a string, or a string regardless of upper/lower case

In [41]:
# a compiled pattern is searched for inside the text, so this finds
# every paragraph whose text contains the word "Some"
paragraphs = soup.find_all("p", string=re.compile("Some"))
print(paragraphs, end="\n\n***\n")

# re.IGNORECASE makes the whole word case-insensitive, not just its first letter
headers = soup.find_all("h2", string=re.compile("header", re.IGNORECASE))
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]

***
[<h2>A Header</h2>, <h2>Another header</h2>]


### <font color = 'orange'>select (CSS selector)

In [43]:
# re-print the body so the markup is in view while writing the CSS selectors below
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [44]:
# a bare tag name as the selector returns every <p> in the document
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [45]:
# "div p" (space-separated) keeps only the paragraphs nested inside a div
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [46]:
# "h2 ~ p": all the paragraphs that are preceded by an h2 at the same level
# (the paragraph inside the div is not a sibling of any h2, so it is excluded)
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [47]:
# "p#paragraph-id b": the <b> tags inside the paragraph whose id is "paragraph-id"
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [51]:
# nested select: first take the paragraphs that are direct children of <body> ...
paragraphs = soup.select("body > p")
print(paragraphs, end="\n\n***\n")

# ... then run a second selector inside each one (an empty list means no <i> there)
for paragraph in paragraphs:
    italics = paragraph.select("i")
    print(italics)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]

***
[<i>Some italicized text</i>]
[]


In [52]:
# grab the elements whose `align` attribute equals "middle" (attribute selector)
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### <font color = 'orange'>get different properties of the elements
    

In [54]:
header = soup.find("h2")

# first the tag itself, then (via .string) just the text inside it
print(header, header.string, sep="\n")


<h2>A Header</h2>
A Header


In [56]:
# .string gives None for the div: it holds more than one child (the h1 and the p),
# so there is no single string it could refer to
div = soup.find("div")
print(div.string)

None


In [57]:
# show the div's markup: it contains both an h1 and a p
print(div.prettify())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



In [59]:
# when a tag has multiple child elements, use .get_text():
# it returns the text of all of them combined
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [60]:
# To read a specific property (attribute) of an element, such as a link's URL,
# first locate the element itself

link = soup.find("a")
link

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>

In [61]:
# index the tag with the attribute name to read its value
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

In [63]:
# attributes are read with [] on a tag; select() returns a list of tags,
# so an element has to be picked from it before indexing by attribute name
paragraphs = soup.select("p#paragraph-id")
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [65]:
# take the first (only) match, then read its `id` attribute
paragraphs[0]["id"]

'paragraph-id'

### <font color = 'orange'>code navigation

In [70]:
# Path syntax: chain tag names as attributes to walk down the tree,
# one level deeper on each step
for node in (soup.body, soup.body.div, soup.body.div.h1):
    print(node)
    print("\n***\n")

# the last step reads the text inside the h1
print(soup.body.div.h1.string)

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

***

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

***

<h1>HTML Webpage</h1>

***

HTML Webpage


In [74]:
# print the body again as a reference for the navigation example
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [72]:
# Know the terms: parent, sibling, child
# find_next_siblings() lists the elements that come after the div at the same level
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]