<a href="https://colab.research.google.com/github/omeryarchi/Python_Beautiful_Soup_Web_Scraping/blob/master/Comprehensive_Python_Beautiful_Soup_Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [2]:
import requests
from bs4 import BeautifulSoup as bs

# Load The Page

In [5]:
# Load the webpage content
r = requests.get("https://keithgalli.github.io/web-scraping/example.html")

# Convert to beautiful soup object
soup = bs(r.content)

#print the html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



# Start using Beautiful Soup to scrape

### find and find_all

In [6]:
first_header = soup.find("h2")
print(first_header)

headers = soup.find_all("h2")
print(headers)

<h2>A Header</h2>
[<h2>A Header</h2>, <h2>Another header</h2>]


In [8]:
# Pass in a list of elements to look for
first_header = soup.find(["h2", "h1"])
print(first_header)

headers = soup.find_all(["h2", "h1"])
print(headers)

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [11]:
# Pass in attributes to the find/find_all func
paragraph = soup.find_all("p", attrs={"id":"paragraph-id"})
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [13]:
body = soup.find("body")
div = body.find("div")
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [21]:
# Search for specific strings
import re
paragraph = soup.find_all("p", string=re.compile("Some"))
paragraph

headers = soup.find_all("h2", string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### Select (CSS selector)

In [23]:
 content = soup.select("div p")
 content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [25]:
paragraphs = soup.select("h2 ~ p")
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [26]:
bold_text = soup.select("p#paragraph-id b")
bold_text

[<b>Some bold text</b>]

### Get different properties of the HTML

In [30]:
# Use .string
header = soup.find("h2")
print(header)
print(header.string)

# If multiple child element use get_text
div = soup.find("div")
print(div.prettify())
print(div.get_text())


<h2>A Header</h2>
A Header
<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [34]:
# Get a specific property from an element
link = soup.find("a")
print(link["href"])

paragraphs = soup.select("p#paragraph-id")
print(paragraphs[0]["id"])

https://keithgalli.github.io/web-scraping/webpage.html
paragraph-id


### Code navigation

In [42]:
# Path syntax
soup.body.div.h1.string

'HTML Webpage'

In [45]:
print(soup.body.prettify())
soup.body.find("div").find_next_siblings()

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

### Load the webpage

##### https://keithgalli.github.io/web-scraping/webpage.html

In [46]:
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")

# Convert to beautiful soup object
webpage = bs(r.content)

print(webpage.prettify())

<html>
 <head>
  <title>
   Keith Galli's Page
  </title>
  <style>
   table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
  </style>
 </head>
 <body>
  <h1>
   Welcome to my page!
  </h1>
  <img src="./images/selfie1.jpg" width="300px"/>
  <h2>
   About me
  </h2>
  <p>
   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
  </p>
  <p>
   Here is a link to my channel:
   <a href="https://www.youtube.com/kgmi

#### Grab all the social links from the webpage

In [49]:
# Method 1
links = webpage.select("ul.socials a")
actual_links = [link["href"] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [53]:
# Method 2
ulist = webpage.find("ul", attrs={"class":"socials"})
links = ulist.find_all("a")
actual_links = [link["href"] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [55]:
# Method 3
links = webpage.select("li.social a")
actual_links = [link["href"] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Scrape the table

In [64]:
import pandas as pd

table = webpage.select("table.hockey-stats")[0]
columns = table.find("thead").find_all("th")
column_names = [column.string for column in columns]

table_rows = table.find("tbody").find_all("tr")
l = []
for tr in table_rows:
  td = tr.find_all("td")
  row = [str(tr.get_text()).strip() for tr in td]
  l.append(row)

df = pd.DataFrame(l, columns=column_names)
df.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


### Grab all fun facta that use word "is"

In [66]:
facts = webpage.select("ul.fun-facts li")
facts_with_is = [fact.find(string=re.compile("is")) for fact in facts]
facts_with_is = [fact.find_parent().get_text() for fact in facts_with_is if fact]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

### Download an Image

In [78]:
# Needs to be done locally

images = webpage.select("div.row div.column img")
image_url = images[0]["src"]

url = "https://keithgalli.github.io/web-scraping/"
full_url = url+image_url

img_data = requests.get(full_url).content
with open ("lake_como.jpg", "wb") as handler:
  handler.write(img_data)

### Scraping specific p id

In [84]:
files = webpage.select("div.block a")
relative_files = [f["href"] for f in files]
url = "https://keithgalli.github.io/web-scraping/"
words=[]
for f in relative_files:
  full_url = url+f
  page = requests.get(full_url)
  bs_page = bs(page.content)
  secret_word_element = bs_page.find("p", attrs={"id":"secret-word"})
  secret_word = secret_word_element.string
  words.append(secret_word)
  
print(words)

['Make', 'sure', 'to', 'smash', 'that', 'like', 'button', 'and', 'subscribe', '!!!']
