<a href="https://colab.research.google.com/github/m-rafiul-islam/data-science/blob/main/web_scraping_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load in the necessary libraries

Based on this video tutorial: https://www.youtube.com/watch?v=GjKQ6V_ViQE&t=464s

In [None]:
import re
from urllib.parse import urljoin

import pandas as pd # pip install pandas
import requests # pip install requests
from bs4 import BeautifulSoup as bs # pip install beautifulsoup4

## Load our first page

In [None]:
# Load the webpage content (the timeout keeps the cell from hanging on a stalled connection)
r = requests.get(
    "https://www.greatschools.org/iowa/des-moines/des-moines-independent-comm-school-district/schools/?view=table&gradeLevels=m",
    timeout=30,
)

# Convert to a beautiful soup object.
# The parser is named explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and bs4 does not warn about guessing).
soup = bs(r.content, "html.parser")

# Print out our html
# print(soup.prettify())

In [None]:
# Look up the first <h1> on the page; the recorded result below is None (no match)
fst_header = soup.find("h1")
print(fst_header)

None


In [None]:
# find_all() collects every matching tag; here, all <p> elements on the page.
# NOTE(review): the name `fst_header` is reused here but now holds a list of paragraphs.
fst_header = soup.find_all("p")
print(fst_header)

[<p>GreatSchools is the leading nonprofit providing high-quality information that supports parents pursuing a great education for their child, schools striving for excellence, and communities working to diminish inequities in education.</p>, <p>GreatSchools is a 501(c)(3) non-profit organization. Support our mission.</p>, <p><a class="hidden-sm hidden-md donate" href="https://www.classy.org/checkout/donation?eid=147615" target="_blank">Donate Now</a></p>, <p><small>©1998-2022 GreatSchools.org All Rights Reserved. GreatSchools is a 501(c)(3) not-for-profit organization</small>
</p>]


In [None]:
# Load the webpage content (the timeout keeps the cell from hanging on a stalled connection)
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=30)

# Convert to a beautiful soup object.
# The parser is named explicitly so results are the same wherever the notebook runs.
soup = bs(r.content, "html.parser")

# Print out our html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Start using Beautiful Soup to Scrape

### find and find_all

In [None]:
# find() returns only the first match; find_all() returns every match as a list
first_header = soup.find("h2")

headers = soup.find_all("h2")
print(headers)

[<h2>A Header</h2>, <h2>Another header</h2>]


In [None]:
# Pass in a list of elements to look for
# (a tag matches when its name is any entry in the list)
first_header = soup.find(["h1", "h2"])

headers = soup.find_all(["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [None]:
# With only a tag name, find_all returns every <p> on the page
paragraph = soup.find_all("p")
paragraph

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# You can pass in attributes to the find/find_all function
# attrs maps attribute names to required values; only <p id="paragraph-id"> matches here
paragraph = soup.find_all("p", attrs={"id": "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# You can nest find/find_all calls
# Start by grabbing <body>; the following cell searches inside it
body = soup.find('body')
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [None]:
# Each call searches only within the tag it is invoked on: body -> div -> h1
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [None]:
# NOTE(review): searching for an empty tag name looks like leftover experimentation,
# and no output was recorded for this cell; confirm the intent or remove it.
soup.find_all('')

In [None]:
# We can search specific strings in our find/find_all calls
# (`re` is imported in the notebook's import cell)
paragraphs = soup.find_all("p", string=re.compile("Some"))
# Only a cell's last expression is echoed automatically, so print this one explicitly
print(paragraphs)

# The pattern accepts either capitalisation: "Header" or "header"
headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers

[]

### select (CSS selector)

In [None]:
# Print the <body> for reference while writing the CSS selectors that follow
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [None]:
# Descendant selector: every <p> nested inside a <div>
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [None]:
# Sibling selector: <p> elements that come after an <h2> under the same parent
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [None]:
# <b> elements inside the <p> whose id is "paragraph-id"
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

In [None]:
# Child selector: only <p> elements that are direct children of <body>
paragraphs = soup.select("body > p")
print(paragraphs)
# select() can also be called on an individual tag to search just inside it
for paragraph in paragraphs:
  print(paragraph.select("i"))



[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


In [None]:
# Grab by element with specific property
# ([attr=value] is the CSS attribute selector; here it matches the align="middle" div)
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

## Get different properties of the HTML

In [None]:
# use .string
header = soup.find("h2")
# Printed explicitly: a bare expression in the middle of a cell is not displayed
print(header.string)

# If multiple child elements use get_text
div = soup.find("div")
print(div.prettify())
print(div.get_text())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [None]:
# .string on the <div>, which has several children. No output was recorded,
# presumably because the value is None; get_text() is the alternative used above.
div.string

In [None]:
# Get a specific property from an element
link = soup.find("a")
# Tag attributes are read with dictionary-style indexing
link['href']


'https://keithgalli.github.io/web-scraping/webpage.html'

In [None]:
# select() returns a list, so index the first match before reading its id attribute
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

'paragraph-id'

## Code navigation 

In [None]:
# Path Syntax
# (the body is printed again as a reference for the dotted-path lookups that follow)
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [None]:
# Dotted access steps down one tag at a time: body -> div -> h1, then reads its text
soup.body.div.h1.string

'HTML Webpage'

In [None]:
# Know the terms: Parent, Sibling, Child
# Start from the <div> inside <body>; its siblings are listed in the following cell
soup.body.find("div")

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [None]:
# List the tags that come after the <div> at the same nesting level
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

# Exercises!

Go to https://keithgalli.github.io/web-scraping/webpage.html



## Load the webpage





In [None]:
# Load the webpage content (the timeout keeps the cell from hanging on a stalled connection)
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=30)

# Convert to a beautiful soup object.
# The parser is named explicitly so results are the same wherever the notebook runs.
webpage = bs(r.content, "html.parser")

# Print out our html
print(webpage.prettify())

## Grab all of the social links from the webpage

Do this in at least 3 different ways



In [None]:
# Baseline: select every <a> on the page (this includes many non-social links)
links = webpage.select('a')
links

[<a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a>,
 <a href="#footer"><sup>1</sup></a>,
 <a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>,
 <a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats"> MIT (Mass. Inst. of Tech.) </a>,
 <a href="https://www.eliteprospects.com/league/acha-ii/stats/2015-2016"> ACHA II </a>,
 <a href="https://www.elite

In [None]:
# href of the last anchor found on the page
links[-1]['href'] 

'challenge/file_10.html'

In [None]:
# Way 1: CSS selector for anchors inside <ul class="socials">
links = webpage.select('ul.socials a')
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [None]:
# Pull the href attribute out of each anchor tag
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
# Way 2: find() the <ul class="socials"> first, then find_all() anchors inside it
slink = webpage.find('ul', attrs={"class":"socials"})
slink.find_all("a") 

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

In [None]:
# Same find()/find_all() approach in a single cell, ending with the href strings
ulist = webpage.find("ul", attrs={"class": "socials"})
links = ulist.find_all("a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
# Way 3: select anchors through the "social" class on each <li>
links = webpage.select("li.social a")
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [None]:
# Every <ul> under <body>: both the fun-facts list and the socials list match
links = webpage.select("body ul")
links

[<ul class="fun-facts">
 <li>Owned my dream car in high school <a href="#footer"><sup>1</sup></a></li>
 <li>Middle name is Ronald</li>
 <li>Never had been on a plane until college</li>
 <li>Dunkin Donuts coffee is better than Starbucks</li>
 <li>A favorite book series of mine is <i>Ender's Game</i></li>
 <li>Current video game of choice is <i>Rocket League</i></li>
 <li>The band that I've seen the most times live is the <i>Zac Brown Band</i></li>
 </ul>, <ul class="socials">
 <li class="social instagram"><b>Instagram: </b><a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a></li>
 <li class="social twitter"><b>Twitter: </b><a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a></li>
 <li class="social linkedin"><b>LinkedIn: </b><a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a></li>
 <li class="social tiktok"><b>TikTok: </b><a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.c

In [None]:
# Fully spelled-out path; the li.social step is what excludes the fun-facts list
links = webpage.select("body ul li.social a")
links

[<a href="https://www.instagram.com/keithgalli/">https://www.instagram.com/keithgalli/</a>,
 <a href="https://twitter.com/keithgalli">https://twitter.com/keithgalli</a>,
 <a href="https://www.linkedin.com/in/keithgalli/">https://www.linkedin.com/in/keithgalli/</a>,
 <a href="https://www.tiktok.com/@keithgalli">https://www.tiktok.com/@keithgalli</a>]

## Exercise \#2: Grab all text on the webpage

Only grab the text that appears above the "Photos" heading.



In [None]:
# Anchor on the "Photos" heading, then collect the sibling tags that precede it
header = webpage.body.find("h2", string="Photos")
previous_elements = header.find_previous_siblings()

# The siblings are reversed here so the text is joined in top-to-bottom order
previous_elements_sorted = list(reversed(previous_elements))

# Extract the text of each element and join it with newlines
elements = list(map(lambda node: node.get_text(), previous_elements_sorted))
text = "\n".join(elements)
print(text)




Welcome to my page!

About me
Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
Here is a link to my channel: youtube.com/kgmit
I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.
Hobbies
Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey & table tennis as well as run, hike, skateboard, and snowboard. In addition to sports, I am a board game enthusiast. The two that I've been playing the most recently are Settlers of Catan and Othello.
Fun Facts

Owned my dream car in high schoo

## Scrape the Table




In [None]:
# Dotted-access shortcut for the first <table> on the page (left commented out):
# webpage.table

In [None]:
# NOTE(review): this queries `soup` (last loaded from example.html, which contains
# no <table>) rather than `webpage`, so the result is empty. The following cell
# selects from `webpage` using the class "hockey-stats" instead.
table = soup.select("table.table")
table

[]

In [None]:
# select() returns a list, so take the first match to get the table tag itself
table = webpage.select("table.hockey-stats")[0]
table

In [None]:
# pandas is imported in the notebook's import cell; this cell only needs bs4 objects.
table = webpage.select("table.hockey-stats")[0]

# Header cells live in <thead>; keep each one's text as the column label
columns = table.find("thead").find_all("th")
column_names = [c.string for c in columns]
column_names

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [None]:
# Collect every <tr> inside the table body
table_rows = table.find("tbody").find_all("tr")
table_rows

In [None]:
# Build one list of stripped cell texts per <tr>, then assemble the frame in one go
table_rows = table.find("tbody").find_all("tr")
rows = [
    [cell.get_text().strip() for cell in table_row.find_all("td")]
    for table_row in table_rows
]

# NOTE: column_names contains repeats (the labels after "POST" duplicate the
# earlier ones, e.g. "GP"), so label-based selection such as df["GP"] returns
# more than one column.
df = pd.DataFrame(rows, columns=column_names)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


## Grab all fun facts that use the word "is"




In [None]:
# Match "is" only as a whole word; a bare "is" pattern would also hit words such as "this".
# (`re` is imported in the notebook's import cell.)
is_word = re.compile(r"\bis\b")

facts = webpage.select("ul.fun-facts li")
# Test each fact's full text so that a match inside a nested tag (e.g. <i>)
# still yields the whole list item rather than just the nested fragment
facts_with_is = [fact.get_text() for fact in facts if is_word.search(fact.get_text())]
facts_with_is




['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

## Download an Image




In [None]:
# Done locally, but here is the code
from urllib.parse import urljoin

import requests # pip install requests
from bs4 import BeautifulSoup as bs # pip install beautifulsoup4

# Load the webpage content
url = "https://keithgalli.github.io/web-scraping/"
r = requests.get(urljoin(url, "webpage.html"), timeout=30)
r.raise_for_status()

# Convert to a beautiful soup object (parser named explicitly for reproducible results)
webpage = bs(r.content, "html.parser")

images = webpage.select("div.row div.column img")
if not images:
    raise ValueError("no <img> matched 'div.row div.column img'")
image_url = images[0]['src']
# urljoin resolves relative, root-relative and absolute src values,
# which plain string concatenation does not
full_url = urljoin(url, image_url)

image_response = requests.get(full_url, timeout=30)
# Fail here rather than saving an HTML error page under a .jpg name
image_response.raise_for_status()
img_data = image_response.content
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)

## Solve the mystery challenge!




In [None]:
# Collect the relative links to the challenge pages
files = webpage.select("div.block a")
relative_files = [f['href'] for f in files]


url = "https://keithgalli.github.io/web-scraping/"
# Visit each page in order and print the word stored in <p id="secret-word">
for relative_file in relative_files:
  full_url = urljoin(url, relative_file)
  page = requests.get(full_url, timeout=30)
  page.raise_for_status()
  bs_page = bs(page.content, "html.parser")
  secret_word_element = bs_page.find("p", attrs={"id": "secret-word"})
  # find() returns None when the element is missing; report it instead of
  # failing with AttributeError on `.string`
  if secret_word_element is None:
    print(f"No secret word found in {full_url}")
    continue
  secret_word = secret_word_element.string
  print(secret_word)






Make
sure
to
smash
that
like
button
and
subscribe
!!!


# Email collection: Iowa teachers (this attempt did not work)

In [None]:
# Load the webpage content (the timeout keeps the cell from hanging on a stalled connection)
r = requests.get(
    "https://www.greatschools.org/iowa/des-moines/des-moines-independent-comm-school-district/schools/?view=table&gradeLevels=m",
    timeout=30,
)
# Convert to a beautiful soup object (parser named explicitly for reproducible results)
soup = bs(r.content, "html.parser")
# Print out our html
# print(soup.prettify())

In [None]:
# pandas is imported in the notebook's import cell.
url = 'https://www.greatschools.org/iowa/des-moines/des-moines-independent-comm-school-district/schools/?view=table&gradeLevels=m'
# Ask pandas to parse the tables at the URL directly. The earlier read of the
# tutorial page was dropped: its result was overwritten before being used.
table_ia = pd.read_html(url)
table_ia

In [None]:
# NOTE(review): `data` is initialised but not used in any cell shown here.
data = []
# Look for a <table> carrying the class "lineItemsTable"
table = soup.find('table', attrs={'class':'lineItemsTable'})



In [None]:
# Collect every <table> on the page, using find_all like the rest of the
# notebook instead of the legacy findAll alias
tables = soup.find_all("table")


In [None]:
# We can search specific strings in our find/find_all calls
# (`re` is imported in the notebook's import cell)
# NOTE(review): despite its name, `paragraphs` holds <th> cells whose text contains "Grades".
paragraphs = soup.find_all("th", string=re.compile("Grades"))
paragraphs



[]

In [None]:
# Dump all text on the page with leading/trailing whitespace removed.
# The recorded output is dominated by inline script content rather than table data.
str(soup.get_text()).strip()

'Middle Schools in Des Moines Independent Comm School District, 1-15 - Des Moines, IA | GreatSchools\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n//<![CDATA[\nwindow.gon={};gon.ad_set_targeting={"env":"prod","compfilter":"1","page_name":"GS:SchoolS","template":"search","City":"DesMoines","State":"ia","level":"m","county":"Polk"};gon.advertising_enabled=true;gon.ad_set_channel_ids="";gon.ab_value=null;gon.signed_in=false;gon.translations={"compare_test_scores_for":"Compare test scores for","compare_pinned_school":"COMPARE THIS SCHOOL TO SCHOOLS BELOW","from":"from","and":"and","to_nearby_schools":"to nearby schools within","Academics":"Academics","Saved!":"Saved!","Saved":"Saved","Save":"Save","No listings found":"No listings found","# of schools":"# of schools","# of students":"# of students","in":"in","no_info":"Currently, this information is unavailable. For more information","visit our FAQ page":"visit our FAQ page","at a glance":"at a glance","find_schools_in":"Find schools in","

In [None]:
# Load the webpage content (the timeout keeps the cell from hanging on a stalled connection)
r = requests.get(
    'https://www.greatschools.org/iowa/windsor-heights/552-Cowles-Elementary-School/',
    timeout=30,
)
# Convert to a beautiful soup object (parser named explicitly for reproducible results)
soup = bs(r.content, "html.parser")
# Print out our html
# print(soup.prettify())

In [None]:
# Collect every anchor tag on the school page.
# (An unused import from pandas' private internals was removed here; it looked
# like an accidental auto-import and nothing in the notebook used it.)
all_links = soup.find_all('a')

# Print each anchor in full to see which attributes are available
for anchor in all_links:
  print(anchor)

<a class="js-gaClick" data-ga-click-action="NavClick" data-ga-click-category="Nav" data-ga-click-label="Home" href="/">
<div></div>
</a>
<a class="js-gaClick" data-ga-click-action="NavClick" data-ga-click-category="Nav" data-ga-click-label="Home" href="/">
<div></div>
</a>
<a class="change-language jsChangeLanguageLink" href="#"><img alt="Lang icon es" src="/assets/nav/lang_icon_es-c83037e51b399b18ed0b13fe30349b3b.svg"/></a>
<a class="saved-schools-nav js-gaClick" data-ga-click-action="NavClick" data-ga-click-category="Nav" data-ga-click-label="School List" data-method="get" href="/my-school-list/">
<span class="content-desktop">School List</span>
<span class="content-mobile">List</span>
<span></span>
</a>
<a class="js-desktop-search-toggle-link js-school-search"> Schools </a>
<a class="dropdown-item js-desktop-search-dropdown-item">Parenting</a>
<a href="/gk/" onclick="return false;">Parenting <span style="font-size:10px;"></span></a>
<a href="/gk/" onclick="return false;">Raising kid

In [None]:
# Read the href attribute of the first anchor collected above
all_links[0]['href']

<a class="js-gaClick" data-ga-click-action="NavClick" data-ga-click-category="Nav" data-ga-click-label="Home" href="/">
<div></div>
</a>