## Web_Scraping

### Loading libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs

### Load first webpage

In [2]:
# load the webpage content
r = requests.get('https://keithgalli.github.io/web-scraping/example.html')

# convert to beautiful soup object
soup = bs(r.content)

# print out our html
print(soup) # soup.prettify()  will print with indentations

<html>
<head>
<title>HTML Example</title>
</head>
<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>
</html>



### Start using bs to Scrape

In [3]:
# find()
# find_all()

In [4]:
first_header = soup.find('h2')  # finds the first element that matches
first_header

<h2>A Header</h2>

In [5]:
headers = soup.find_all('h2')
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [6]:
# pass in a list of elements to look for
first_header = soup.find(['h1', 'h2']) # finds first occurence of one of those elements in the list
first_header

<h1>HTML Webpage</h1>

In [7]:
headers = soup.find_all(['h2', 'h1']) # finds all occurences of all elements in the list
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [8]:
# pass in attributes to the find/find_all funtion
paragraph = soup.find_all("p", attrs={"id" : "paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [9]:
# u can nest find/find_all 
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

<h1>HTML Webpage</h1>

In [10]:
# search specific strings in our find/findall cells
para = soup.find_all('p', string = 'Some') # here we need to give full string 
para1 = soup.find_all('p', string = 'Some bold text')
print(para, para1)

# we can use regx library 
import re
paragraph = soup.find_all('p', string = re.compile('Some'))
print(paragraph)

header = soup.find_all('h2', string = re.compile('(H|h)eader'))
header

[] [<p id="paragraph-id"><b>Some bold text</b></p>]
[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


[<h2>A Header</h2>, <h2>Another header</h2>]

### Select (CSS selector)

In [11]:
content = soup.select('p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [12]:
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [13]:
cont = soup.select('div p')
cont

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [14]:
paragraphs = soup.select('body > p')
print(paragraphs)

for paragraph in paragraphs:
    print(paragraph.select('i'))

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<i>Some italicized text</i>]
[]


### Getting different properties of the HTML

In [15]:
# to get the header name only
header = soup.find('h2')
header.string

div = soup.find('div')
print(div.prettify())
print(div.string) # none 'cause multiple tags in side the div
print(div.get_text())  # to get only content from different tags

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>

None

HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [16]:
# get a specific property form an element
link = soup.find('a')
link['href']

paragraph = soup.select('p#paragraph-id')
paragraph[0]['id']

'paragraph-id'

### Code navigation

In [17]:
# path syntax
soup.body.div.h1.string
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [18]:
# know the terms: parent, sibling, child

soup.body.find('div').find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [19]:
keith = 'https://keithgalli.github.io/web-scraping/webpage.html'

In [20]:
# load webpage 
r = requests.get(keith)
webpage = bs(r.content)
print(webpage)

<html><head>
<title>Keith Galli's Page</title>
<style>
  table {
    border-collapse: collapse;
  }
  th {
    padding:5px;
  }
  td {
    border: 1px solid #ddd;
    padding: 5px;
  }
  tr:nth-child(even) {
    background-color: #f2f2f2;
  }
  th {
    padding-top: 12px;
    padding-bottom: 12px;
    text-align: left;
    background-color: #add8e6;
    color: black;
  }
  .block {
  width: 100px;
  /*float: left;*/
    display: inline-block;
    zoom: 1;
  }
  .column {
  float: left;
  height: 200px;
  /*width: 33.33%;*/
  padding: 5px;
  }

  .row::after {
    content: "";
    clear: both;
    display: table;
  }
</style>
</head>
<body>
<h1>Welcome to my page!</h1>
<img src="./images/selfie1.jpg" width="300px"/>
<h2>About me</h2>
<p>Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!</p>
<p>Here is a link to my channel: <a href="https://www.youtube.com/kgmit">youtube.com/kgmit</a></p>
<p>I grew up in the great s

### Task1 
#### get all social media links

In [21]:
for i in webpage.body.find_all('ul')[1].find_all('li'):
    print(i.find('b').string,i.find('a')['href'])

Instagram:  https://www.instagram.com/keithgalli/
Twitter:  https://twitter.com/keithgalli
LinkedIn:  https://www.linkedin.com/in/keithgalli/
TikTok:  https://www.tiktok.com/@keithgalli


In [22]:
for i in webpage.body.select('ul')[1].select('li'):
    print(i.b.string,i.a['href'])

Instagram:  https://www.instagram.com/keithgalli/
Twitter:  https://twitter.com/keithgalli
LinkedIn:  https://www.linkedin.com/in/keithgalli/
TikTok:  https://www.tiktok.com/@keithgalli


In [23]:
# keith
links = webpage.select('ul.socials a')
#print(links)
actual_links = [link['href'] for link in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

In [24]:
links = webpage.select('li.social a')
a_links = [link['href'] for link in links]
a_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Task2
#### Scrape the table on the page

In [25]:
import pandas as pd

In [26]:
table = webpage.table
columns = []
for i in table.tr.find_all('th'):
    columns.append(i.string)
columns

['S',
 'Team',
 'League',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-',
 '\xa0',
 'POST',
 'GP',
 'G',
 'A',
 'TP',
 'PIM',
 '+/-']

In [27]:
rows = table.find('tbody').find_all('tr')
rows[0].find_all('td')[1].get_text()[4:-3]
#rows[0].find_all('td')[1]

'MIT (Mass. Inst. of Tech.)'

In [28]:
rows[0]

<tr class="team-continent-NA">
<td class="season sorted">
                  2014-15
              </td>
<td class="team">
<i><img src="images/flag.png"/></i>
<span class="txt-blue">
<a href="https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats"> MIT (Mass. Inst. of Tech.) </a>
</span>
</td>
<td class="league"> <a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> ACHA II </a> </td>
<td class="regular gp">17</td>
<td class="regular g">3</td>
<td class="regular a">9</td>
<td class="regular tp">12</td>
<td class="regular pim">20</td>
<td class="regular pm"></td>
<td class="separator"> | </td>
<td class="postseason">
<a href="https://www.eliteprospects.com/league/acha-ii/stats/2014-2015"> </a>
</td>
<td class="postseason gp">
</td>
<td class="postseason g">
</td>
<td class="postseason a">
</td>
<td class="postseason tp">
</td>
<td class="postseason pim">
</td>
<td class="postseason pm">
</td>
</tr>

In [32]:
row_data = []
for r in rows:
    td = r.find_all('td')
    data = [str(j.get_text()).strip() for j in td]
    row_data.append(data)
Data_frame = pd.DataFrame(row_data, columns = columns)
Data_frame.head()

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


### Task3
#### Funfacts

In [43]:
str(webpage.body.select('ul.fun-facts')[0].get_text()).split('\n')

['',
 'Owned my dream car in high school 1',
 'Middle name is Ronald',
 'Never had been on a plane until college',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band",
 '']

In [52]:
images = webpage.body.select('div.row img')
image_url= images[0]['src']

ful_url = keith + image_url # we need to create ful url for image
img_data = requests.get(ful_url).content
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)

### Task5
#### Secret message

In [73]:
files = [ a['href'] for a in webpage.body.select('div.block li a')]
files_url = [keith[:-12] + f for f in files]
files_url

['https://keithgalli.github.io/web-scraping/challenge/file_1.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_2.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_3.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_4.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_5.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_6.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_7.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_8.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_9.html',
 'https://keithgalli.github.io/web-scraping/challenge/file_10.html']

In [81]:
for f in files_url:
    page = requests.get(f)
    bs_p = bs(page.content)
    print(bs_p.body.find('p', attrs = {'id': 'secret-word'}).string)

Make
sure
to
smash
that
like
button
and
subscribe
!!!
