In [1]:
from bs4 import BeautifulSoup

## BeautifulSoup

1. A Python library for pulling data out of HTML and XML documents.
2. It is a content extractor only: it does not download pages itself, so you must first obtain the page source (for example with `requests`) and then hand it to BeautifulSoup for parsing.

#### Built-in features are:
    1. Navigation
    2. Modification
    3. Searching


##### Create Soup Object

1. From website source

In [None]:
from requests import get

# Fetch the page. The timeout keeps the cell from hanging forever on a dead
# connection, and raise_for_status() stops us from parsing an HTTP error page.
response = get("http://pycon.pk", timeout=10)
response.raise_for_status()

# Name the parser explicitly so the resulting tree does not depend on which
# parser libraries happen to be installed. "html.parser" is the built-in
# alternative if lxml is not available.
soup = BeautifulSoup(response.content, "lxml")

2. From source file

3. From direct source

In [69]:
# Sample document used by the navigation and searching examples.
markup = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormous
e's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# is installed, and different parsers can build different trees from the same
# markup. The outputs recorded in this notebook appear to come from lxml, so
# that is the one pinned here ("html.parser" is the built-in alternative).
soup = BeautifulSoup(markup, "lxml")

In [3]:
# Display the parsed document as BeautifulSoup serialises it.
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormous
e's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

### Parsers:
#### By default, BeautifulSoup parses documents as HTML. To parse XML markup, or to control which parser is used, pass the parser name when creating the soup object. Available parsers:
1. Python’s html.parser                                                                  
2. lxml’s XML parser
3. lxml’s HTML parser                                                                      
4. html5lib


### Kind of objects

#### Tag type

In [62]:
# Accessing a tag by name (soup.head) yields a bs4.element.Tag object.
type(soup.head)

bs4.element.Tag

#### Tag name

In [63]:
# .name holds the tag's name as a plain string.
soup.head.name

'head'

#### Tag attributes

In [65]:
# .attrs is a dict of the tag's attributes; soup.p is the first <p> in the document.
soup.p.attrs

{'class': ['title']}

#### Tag class

In [66]:
# Attributes can also be read with dict-style indexing. Note that "class"
# comes back as a list, not a single string.
soup.p["class"]

['title']

### Navigation

#### Using tag names

In [70]:
# Chain tag names to walk down the tree: the <title> inside <head>.
soup.head.title

<title>The Dormouse's story</title>

#### Contents

In [5]:
# .contents lists a tag's direct children; for <title> that is its single text node.
soup.head.title.contents

["The Dormouse's story"]

#### Parent

In [71]:
# .parent moves one level up; the parent of <head> is the <html> element.
soup.head.parent

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormous
e's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [74]:
# NOTE: despite its name, this variable holds the <html> tag (the parent of
# <head>), not the contents of <head>.
head_content = soup.head.parent

In [75]:
# Direct children of <html>: <head>, the newline text node after it, and <body>.
head_content.contents

[<head><title>The Dormouse's story</title></head>, '\n', <body>
 <p class="title"><b>The Dormous
 e's story</b></p>
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>
 <p class="story">...</p>
 </body>]

In [96]:
# Keep a reference to the <head> tag (not used by the cells that follow here).
head_tag = soup.head

#### Children

In [10]:
# .children iterates over direct children only; the soup object has a single
# child here, the <html> element.
for tag in soup.children:
    print("-", tag)

- <html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormous
e's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>


#### Descendants

In [76]:
# .descendants walks the whole tree (tags and text nodes alike), so nested
# content is printed again at every level it appears in.
for tag_num, tag in enumerate(soup.descendants):
    print(tag_num, tag)
    print("===================================")

0 <html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormous
e's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
1 <head><title>The Dormouse's story</title></head>
2 <title>The Dormouse's story</title>
3 The Dormouse's story
4 

5 <body>
<p class="title"><b>The Dormous
e's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they 

In [77]:
# Attribute-style access returns only the first matching tag: the first <a>.
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

#### Next/Previous Sibling

In [78]:
# .next_siblings is a generator, so displaying it shows only the generator
# object; iterate over it (or wrap it in list()) to see the actual siblings.
soup.a.next_siblings


<generator object PageElement.next_siblings at 0x0000027010A1BC00>

In [128]:
# Walk backwards from the third link, nearest sibling first. repr() makes the
# text nodes between the links (including their newlines) visible.
for sibling in soup.find(id="link3").previous_siblings:
    print(repr(sibling))

' and\n'
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
',\n'
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
'Once upon a time there were three little sisters; and their names were\n'


### Searching

In [79]:
# Show the document again as a reference for the searches below.
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormous
e's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

#### Tag names

In [80]:
# A string filter matches tag names: this returns every <p> in the document.
soup.find_all("p")

[<p class="title"><b>The Dormous
 e's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

#### Regular expression

In [13]:
# NOTE: importing `compile` by name shadows Python's built-in compile() for the
# rest of the notebook; a later cell relies on this imported name.
from re import compile
# A compiled regex is matched against tag names; "^a" selects names that start with "a".
soup.find_all(compile("^a"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

#### Multiple tag filtration

In [132]:
# A list filter matches a tag if any entry matches: the literal name "head"
# or the regex for names starting with "a".
soup.find_all(["head", compile("^a")])

[<head><title>The Dormouse's story</title></head>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

#### String

In [81]:
# string= searches text nodes instead of tags. A list matches any of its
# entries, and find() returns only the first hit ("barkat" is not in the document).
soup.find(string=["barkat","Elsie"])

'Elsie'

#### Check existence of an attribute

In [82]:
# CSS attribute selector: <p> tags that carry an href attribute. No <p> in
# this document has one, so the result is an empty list.
soup.select('p[href]')


[]

### Modification

In [84]:
# A one-tag document to experiment on. Note that this rebinds both `markup`
# and `soup`, replacing the larger document used in the earlier sections.
markup = '<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>'
# Name the parser explicitly so the result does not depend on which parsers
# are installed ("html.parser" is the built-in alternative to lxml).
soup = BeautifulSoup(markup, "lxml")

In [85]:
# Grab the <a> tag; the modification examples below all operate on this object.
tag = soup.a

In [86]:
# Show the tag before any modification.
tag

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

#### Rename a tag

In [30]:
# Assigning to .name renames the tag; both the opening and closing markup change.
tag.name = "a_tag"
tag

<a_tag class="sister" href="http://example.com/elsie" id="link1">Elsie</a_tag>

#### Rename a tag class

In [31]:
# Attributes are assigned like dict entries; this replaces the existing class value.
tag['class'] = "brother"
tag

<a_tag class="brother" href="http://example.com/elsie" id="link1">Elsie</a_tag>

In [32]:
# The tag's children before editing: a single text node.
tag.contents

['Elsie']

#### Edit tag content

In [88]:
# Assigning to .string replaces the tag's contents with the given text.
tag.string = "Barkat"

In [90]:
# The children now consist of the replacement text only.
tag.contents

['Barkat']

In [89]:
# Show the whole tag after its text was replaced.
tag

<a_tag class="brother" href="http://example.com/elsie" id="link1">Barkat</a_tag>

In [52]:
# append() adds the text as a new child after the tag's existing contents.
tag.append(" is studying")
tag

<a_tag class="brother" href="http://example.com/elsie" id="link1">Barkat is studying</a_tag>

#### Insert more text into the content

In [53]:
# insert(position, text) places the text at the given index among the tag's
# children. No separator is added, so it runs straight on from the previous text.
tag.insert(2, "but did not get endoresement")
tag

<a_tag class="brother" href="http://example.com/elsie" id="link1">Barkat is studyingbut did not get endoresement</a_tag>

#### clear

In [54]:
# clear() removes all of the tag's contents but keeps the tag and its attributes.
tag.clear()
tag

<a_tag class="brother" href="http://example.com/elsie" id="link1"></a_tag>

### Pros
1. Cross platform 
2. Handles malformed markup gracefully
3. Supports Python 2 and Python 3.

### Cons
1. Slow
2. Parses one document at a time (it has no built-in support for fetching or processing several pages concurrently)

In [56]:
from requests import get
from bs4 import BeautifulSoup

# Download the speakers page. The timeout prevents the cell from hanging on an
# unresponsive server, and raise_for_status() fails loudly on an HTTP error
# instead of letting us scrape an error page.
resp = get("http://pycon.pk/speakers/2019/", timeout=10)
resp.raise_for_status()

# Name the parser explicitly so the scrape does not depend on which parsers
# are installed ("html.parser" is the built-in alternative to lxml).
soup = BeautifulSoup(resp.content, "lxml")


In [63]:
# Collect every <div> whose CSS class is "member-desc".
members = soup.find_all("div", class_="member-desc")

In [67]:
# Print the children of each block's first <h3> (a one-element list holding a
# name in the output below). Assumes every block has an <h3>; m.h3 would be
# None otherwise and .contents would raise AttributeError.
for m in members:
    print(m.h3.contents)

['Van Lindberg']
['Dr. Noman Islam']
['M. Junaid Muzammil']
['Ali Asad Lotia']
['Muhammad Taha Anwar']
['Ammara Laeeq']
['Waqar Saleem']
['Tahir Ramzan']
['Ali Zain Banatwala']
['Mashood Rastgar']
['Tooba Mukhtar']
['Dr. Tafseer Ahmed']
['Barkat Khan']
['Muhammad Naufil']
['Shoaib Zafar']
['Danish Haroon']
['Syed Danyal Khaliq']
['Baqar Abbas Jafri']
['Rahim Rasool']
['Ali Raza Bhayani']
['Asif Kamboh']
['Amber Nadeem']
