# Chapter 5. Advanced HTML Parsing

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the page inside a context manager so the HTTP response (and the
# socket behind it) is closed as soon as the markup has been read.
with urlopen('https://pythonscraping.com/pages/warandpeace.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

In [None]:
# Collect every <span> whose CSS class is "green".
namelist = bs.find_all('span', class_='green')

In [None]:
# Number of <span class="green"> tags matched by the search.
len(namelist)

41

In [None]:
# Preview the first three matches: the Tag and its type, a divider,
# then the extracted text and its type, followed by a blank line.
for name in namelist[:3]:
    text = name.get_text()
    print(name, type(name), '--------', text, type(text), '', sep='\n')

<span class="green">Anna
Pavlovna Scherer</span>
<class 'bs4.element.Tag'>
--------
Anna
Pavlovna Scherer
<class 'str'>

<span class="green">Empress Marya
Fedorovna</span>
<class 'bs4.element.Tag'>
--------
Empress Marya
Fedorovna
<class 'str'>

<span class="green">Prince Vasili Kuragin</span>
<class 'bs4.element.Tag'>
--------
Prince Vasili Kuragin
<class 'str'>



In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse the response while it is still open, then let the context manager
# close the HTTP connection instead of leaving it dangling.
with urlopen('https://pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

In [None]:
# Materialise the direct children of the gift table and count them.
children = list(bs.find('table', id='giftList').children)
len(children)

13

In [None]:
# Materialise every node nested anywhere inside the gift table and count them.
descendants = list(bs.find('table', id='giftList').descendants)
len(descendants)

86

In [None]:
# Print each direct child node of the gift table, one per print call.
gift_table = bs.find('table', id='giftList')
for node in gift_table.children:
    print(node)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


In [None]:
# Start from the table's first <tr> (the header row) and print only the
# nodes that follow it, so the header itself is skipped.
header_row = bs.find('table', id='giftList').tr
for following in header_row.next_siblings:
    print(following)



<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>
</td></tr>


<tr class="gift" id="gift4"><td>
Dead Parrot
</td><td>
This is an ex-parr

In [None]:
# Locate the first gift image by its exact src attribute.
bs.find('img', src='../img/gifts/img1.jpg')

<img src="../img/gifts/img1.jpg"/>

In [None]:
# Step up from the image to the element that contains it.
first_image = bs.find('img', src='../img/gifts/img1.jpg')
first_image.parent

<td>
<img src="../img/gifts/img1.jpg"/>
</td>

In [None]:
# From the image's containing cell, move to the sibling just before it.
image_cell = bs.find('img', src='../img/gifts/img1.jpg').parent
image_cell.previous_sibling

<td>
$15.00
</td>

In [None]:
# Extract the raw text (surrounding newlines included) of the cell that
# precedes the image's cell.
preceding_cell = bs.find('img', src='../img/gifts/img1.jpg').parent.previous_sibling
preceding_cell.get_text()

'\n$15.00\n'

In [None]:
# Same lookup as a printed value, so the newlines render instead of
# appearing as escape sequences.
preceding_cell = bs.find('img', src='../img/gifts/img1.jpg').parent.previous_sibling
print(preceding_cell.get_text())


$15.00



In [None]:
# Every <img> tag in the document, with no filtering on src.
bs.find_all('img')

[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <img src="../img/gifts/img1.jpg"/>,
 <img src="../img/gifts/img2.jpg"/>,
 <img src="../img/gifts/img3.jpg"/>,
 <img src="../img/gifts/img4.jpg"/>,
 <img src="../img/gifts/img6.jpg"/>]

In [None]:
import re

# Match only the numbered gift images. The pattern is a raw string with the
# literal dots escaped, so '.' can no longer stand in for an arbitrary
# character (for example, a src ending in 'img1xjpg' is now rejected).
bs.find_all('img', {'src': re.compile(r'\.\./img/gifts/img.*\.jpg')})

[<img src="../img/gifts/img1.jpg"/>,
 <img src="../img/gifts/img2.jpg"/>,
 <img src="../img/gifts/img3.jpg"/>,
 <img src="../img/gifts/img4.jpg"/>,
 <img src="../img/gifts/img6.jpg"/>]

In [None]:
# Gather the numbered gift images and print each one's src attribute.
# The dots in the path and extension are escaped so they match literally
# instead of acting as regex wildcards.
images = bs.find_all('img', {'src': re.compile(r'\.\./img/gifts/img.*\.jpg')})
for image in images:
    print(image['src'])

../img/gifts/img1.jpg
../img/gifts/img2.jpg
../img/gifts/img3.jpg
../img/gifts/img4.jpg
../img/gifts/img6.jpg


In [None]:
# .attrs is the tag's attribute mapping; read the first image's src from it.
images[0].attrs['src']

'../img/gifts/img1.jpg'

In [None]:
# How many attributes the first matched image carries.
len(images[0].attrs)

1

In [None]:
def has_two_attributes(tag):
    """Return True when the tag carries exactly two attributes."""
    return len(tag.attrs) == 2


# Use the predicate as the filter: every tag it accepts is returned.
bs.find_all(has_two_attributes)

[<img src="../img/gifts/logo.jpg" style="float:left;"/>,
 <tr class="gift" id="gift1"><td>
 Vegetable Basket
 </td><td>
 This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
 <span class="excitingNote">Now with super-colorful bell peppers!</span>
 </td><td>
 $15.00
 </td><td>
 <img src="../img/gifts/img1.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift2"><td>
 Russian Nesting Dolls
 </td><td>
 Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
 </td><td>
 $10,000.52
 </td><td>
 <img src="../img/gifts/img2.jpg"/>
 </td></tr>,
 <tr class="gift" id="gift3"><td>
 Fish Painting
 </td><td>
 If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
 </td><td>
 $10,005.00
 </td><td>
 <img src="../img/gifts/img3.jpg"/>
 </td>

In [None]:
def is_resting_note(tag):
    """Return True when the tag's complete text equals the target sentence."""
    return tag.get_text() == 'Or maybe he\'s only resting?'


# Use the predicate as the filter: every tag it accepts is returned.
bs.find_all(is_resting_note)

[<span class="excitingNote">Or maybe he's only resting?</span>]

In [None]:
# Filtering on string= alone yields the matching text itself rather than
# the tag that encloses it.
bs.find_all(string='Or maybe he\'s only resting?')

["Or maybe he's only resting?"]

In [None]:
# Combining a tag name with string= returns the <span> element whose text
# matches, instead of the bare text.
bs.find_all('span', string='Or maybe he\'s only resting?')

[<span class="excitingNote">Or maybe he's only resting?</span>]