# Parsing a Basic HTML Page

In [None]:
from bs4 import BeautifulSoup as bs
import requests

In [8]:
# URL of the first demo page that this section downloads and parses.
link="http://www.apptronix.net/webscrap/demo1.html"

In [9]:
# Download the demo page. requests has no default timeout, so an unresponsive
# server would block this cell forever; cap the wait explicitly.
page = requests.get(link, timeout=10)
# Raise on 4xx/5xx responses so an error page is never parsed by mistake.
page.raise_for_status()

In [10]:
# Echo the response object; its repr includes the HTTP status code.
page

<Response [200]>

In [11]:
# Raw response body as bytes, i.e. the unparsed HTML markup.
page.content

b'<!DOCTYPE html>\n<html>\n<head>Head Of The Page</head>\n<body>\n<p>Simple Paragraph</p>\n</body>\n</html>'

In [12]:
# Build a parse tree from the downloaded bytes using Python's built-in
# HTML parser ('html.parser').
soup = bs(page.content, features='html.parser')

In [14]:
# prettify() returns the parsed document as an indented string; print() renders
# its newlines instead of showing them escaped.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  Head Of The Page
 </head>
 <body>
  <p>
   Simple Paragraph
  </p>
 </body>
</html>


In [15]:
# Materialise the top-level nodes of the document. For this page they are the
# doctype, a newline string, and the <html> element.
list(soup.children)

['html',
 '\n',
 <html>
 <head>Head Of The Page</head>
 <body>
 <p>Simple Paragraph</p>
 </body>
 </html>]

In [18]:
# The <html> element is the third top-level node (after the doctype and a
# newline string). `.contents` is the list that `.children` iterates over.
html = soup.contents[2]

In [19]:
# Show the markup of the extracted <html> element.
print(html)

<html>
<head>Head Of The Page</head>
<body>
<p>Simple Paragraph</p>
</body>
</html>


In [20]:
# Direct children of <html>: the <head> and <body> tags interleaved with
# newline strings.
list(html.children)

['\n',
 <head>Head Of The Page</head>,
 '\n',
 <body>
 <p>Simple Paragraph</p>
 </body>,
 '\n']

In [21]:
# <body> sits at index 3 of <html>'s children: newline, <head>, newline, <body>.
body = html.contents[3]

In [22]:
# Echo the <body> element to confirm the right node was selected.
body

<body>
<p>Simple Paragraph</p>
</body>

In [23]:
# Snapshot <body>'s direct children as a new list.
# NOTE: despite its name, `p` holds every child node, not only the <p> tag.
p = list(body.contents)

In [24]:
# Inspect the child nodes of <body> collected in `p`.
p

['\n', <p>Simple Paragraph</p>, '\n']

In [25]:
# Index 1 skips the leading newline string and lands on the <p> tag.
l = body.contents[1]

In [26]:
# Echo the selected <p> element.
l

<p>Simple Paragraph</p>

In [28]:
# get_text() returns the text inside the tag, without the surrounding markup.
data=l.get_text()

In [29]:
# Display the extracted paragraph text.
print(data)

Simple Paragraph


# Finding all instances of a tag at once

In [30]:
from bs4 import BeautifulSoup as bs
import requests

In [31]:
# URL of the second demo page, which contains several <p> elements.
link="http://www.apptronix.net/webscrap/demo2.html"

In [33]:
# Download the demo page. requests has no default timeout, so an unresponsive
# server would block this cell forever; cap the wait explicitly.
page = requests.get(link, timeout=10)
# Raise on 4xx/5xx responses so an error page is never parsed by mistake.
page.raise_for_status()

In [34]:
# Printing the response shows its HTTP status code.
print(page)

<Response [200]>


In [37]:
# Raw response body as bytes, i.e. the unparsed HTML markup.
page.content

b'<!DOCTYPE html>\n<html>\n<head>Head Of The Page</head>\n<body>\n<p>First Paragraph</p>\n<p>Second Paragraph</p>\n<p>Third Paragraph</p>\n</body>\n</html>'

In [38]:
# Parse the downloaded bytes with Python's built-in HTML parser.
soup = bs(page.content, features='html.parser')

In [39]:
# Print the parsed document as markup (no prettify() indentation applied).
print(soup)

<!DOCTYPE html>

<html>
<head>Head Of The Page</head>
<body>
<p>First Paragraph</p>
<p>Second Paragraph</p>
<p>Third Paragraph</p>
</body>
</html>


In [40]:
# find_all() collects every <p> tag in the document in one call, so there is
# no need to walk the tree child by child.
soup.find_all('p')

[<p>First Paragraph</p>, <p>Second Paragraph</p>, <p>Third Paragraph</p>]

In [42]:
# Text of the first <p> element (index 0 of the find_all result).
paragraphs = soup.find_all('p')
paragraphs[0].get_text()

'First Paragraph'

In [43]:
# Text of the second <p> element (index 1 of the find_all result).
paragraphs = soup.find_all('p')
paragraphs[1].get_text()

'Second Paragraph'

In [44]:
# Text of the third <p> element (index 2 of the find_all result).
paragraphs = soup.find_all('p')
paragraphs[2].get_text()

'Third Paragraph'

# Searching for Tags by Class or ID

In [69]:
from bs4 import BeautifulSoup as bs
import requests

In [70]:
# URL of the third demo page, whose tags carry class and id attributes.
link="http://www.apptronix.net/webscrap/demo3.html"

In [71]:
# Download the demo page. requests has no default timeout, so an unresponsive
# server would block this cell forever; cap the wait explicitly.
page = requests.get(link, timeout=10)
# Raise on 4xx/5xx responses so an error page is never parsed by mistake.
page.raise_for_status()

In [72]:
# Printing the response shows its HTTP status code.
print(page)

<Response [200]>


In [73]:
# Raw response body as bytes, i.e. the unparsed HTML markup.
page.content

b'<html>\n<head>\n<title>A simple page</title>\n</head>\n<body>\n<div>\n<p class="inner-text first-item" id="first">\nFirst paragraph.\n</p>\n<p class="inner-text">\nSecond paragraph.\n</p>\n</div>\n<p class="outer-text first-item" id="second">\n<b>\nFirst outer paragraph.\n</b>\n</p>\n<p class="outer-text">\n<b>\nSecond outer paragraph.\n</b>\n</p>\n<a href = \'http://www.google.com\' id=\'link1\'>Google</a>\n<a href = \'http://www.facebook.com\' id=\'link2\'>Facebook</a>\n<a href = \'http://www.instagram.com\' id=\'link3\'>Instagram</a>\n</body>\n</html>'

In [74]:
# Parse the downloaded bytes with Python's built-in HTML parser.
soup = bs(page.content, features='html.parser')

In [75]:
# prettify() returns the parsed document as an indented string; print() renders
# its newlines instead of showing them escaped.
print(soup.prettify())

<html>
 <head>
  <title>
   A simple page
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    First paragraph.
   </p>
   <p class="inner-text">
    Second paragraph.
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    First outer paragraph.
   </b>
  </p>
  <p class="outer-text">
   <b>
    Second outer paragraph.
   </b>
  </p>
  <a href="http://www.google.com" id="link1">
   Google
  </a>
  <a href="http://www.facebook.com" id="link2">
   Facebook
  </a>
  <a href="http://www.instagram.com" id="link3">
   Instagram
  </a>
 </body>
</html>


In [76]:
# All <p> tags whose class list contains "outer-text". Tags with additional
# classes (e.g. "outer-text first-item") match as well.
soup.find_all('p', attrs={'class': 'outer-text'})

[<p class="outer-text first-item" id="second">
 <b>
 First outer paragraph.
 </b>
 </p>,
 <p class="outer-text">
 <b>
 Second outer paragraph.
 </b>
 </p>]

In [77]:
# All <p> tags whose id attribute equals "first".
soup.find_all('p', attrs={'id': 'first'})

[<p class="inner-text first-item" id="first">
 First paragraph.
 </p>]

# Using CSS Selectors (Cascading Style Sheets)

In [78]:
# CSS descendant selector: every <p> nested inside a <div>.
soup.select("div p")

[<p class="inner-text first-item" id="first">
 First paragraph.
 </p>,
 <p class="inner-text">
 Second paragraph.
 </p>]

In [79]:
# <p> elements with class "first-item" that are nested inside a <div>.
soup.select("div p.first-item")

[<p class="inner-text first-item" id="first">
 First paragraph.
 </p>]

In [80]:
# The <p> element with id "first" nested inside a <div>.
soup.select("div p#first")

[<p class="inner-text first-item" id="first">
 First paragraph.
 </p>]

In [82]:
# <p> elements with class "outer-text" nested inside <body>.
soup.select("body p.outer-text")

[<p class="outer-text first-item" id="second">
 <b>
 First outer paragraph.
 </b>
 </p>,
 <p class="outer-text">
 <b>
 Second outer paragraph.
 </b>
 </p>]

In [84]:
# Every anchor (<a>) tag in the document.
soup.find_all('a')

[<a href="http://www.google.com" id="link1">Google</a>,
 <a href="http://www.facebook.com" id="link2">Facebook</a>,
 <a href="http://www.instagram.com" id="link3">Instagram</a>]

In [85]:
# find() returns only the first match: here, the tag whose id is "link3".
soup.find(attrs={'id': 'link3'})

<a href="http://www.instagram.com" id="link3">Instagram</a>

In [86]:
# Keep all anchor tags so their attributes can be read in the next step.
my_links = soup.find_all(name='a')

In [90]:
# Collect the href attribute of every anchor tag in `my_links`.
# The original for-loop used `link` as its loop variable, which overwrote the
# module-level `link` (the page URL string) with a Tag and broke any re-run of
# the `requests.get(link)` cells. A comprehension keeps its variable local.
links = [anchor.get('href') for anchor in my_links]

In [91]:
# Display the collected URLs.
links

['http://www.google.com',
 'http://www.facebook.com',
 'http://www.instagram.com']