# Lec 11 - Web Scraping with Beautiful Soup

In [None]:
#import the beautiful soup library
from bs4 import BeautifulSoup

#import library to read a website

import urllib


## Read a webpage

In [None]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly before calling urlopen.
import urllib.request

python_wiki_page = 'https://en.wikipedia.org/wiki/Python_(programming_language)'

# Open the page; the response object is kept so later cells can inspect and parse it.
python_wiki_page_html = urllib.request.urlopen(python_wiki_page)

In [None]:
type(python_wiki_page_html)

In [None]:
wiki_soup = BeautifulSoup(python_wiki_page_html,'html.parser')

In [None]:
type(wiki_soup)

In [None]:
wiki_soup

## Read a simple HTML document we created

In [None]:
simple_html = """

<html>

<body>

<h1>Heading 1</h1>

<b><! *** comment 1 ***></b>

<p title = "Title 1" class = 'test'> Paragraph 1</p>

<p title = "Title 2" class = 'test'> Paragraph 2</p>

<div class = "Programming Languages">

<h2> C++ </h2>

<h2> Python </h2>

</div>

</body>

</html>

"""

In [None]:
soup_basic= BeautifulSoup(simple_html,'html.parser')

In [None]:
type(soup_basic)

In [None]:
print(soup_basic)

### Let us explore the tags 

In [None]:
soup_basic.p

In [None]:
tag = soup_basic.p

In [None]:
type(tag)

In [None]:
tag.attrs

In [None]:
tag.string

In [None]:
comment = soup_basic.b

In [None]:
type(comment)

In [None]:
comment

In [None]:
comment.attrs

In [None]:
comment.string

# Different ways of searching elements

In [None]:
from pathlib import Path

static_html_path = Path('OrgStructure.html')

In [None]:
type(static_html_path)

In [None]:
# Read the file inside a context manager so the handle is closed as soon as
# its contents have been parsed, instead of staying open for the whole session.
with open(static_html_path,'r') as file:
    org_soup = BeautifulSoup(file.read(),'html.parser')

In [None]:
org_soup.contents

In [None]:
tag_li = org_soup.find('li')

In [None]:
tag_li

In [None]:
type(tag_li)

In [None]:
tag_li.attrs

In [None]:
all_tag_li = org_soup.find_all('li')

In [None]:
all_tag_li

In [None]:
all_tag_li[0]

In [None]:
find_id = org_soup(id = 'HR')

In [None]:
print(find_id)

In [None]:
type(find_id)

In [None]:
type(find_id[0])

In [None]:
find_id[0].attrs

In [None]:
find_id[0]['id']

In [None]:
tag = find_id[0]

In [None]:
tag

In [None]:
tag.li.div.string

In [None]:
tag.find('div').string

In [None]:
tag.li.find_all('div')

In [None]:
# `string` is the current name of this filter; `text` is its deprecated alias.
# Passing a list matches strings equal to any of the listed values.
ls = org_soup.find_all(string=['Kelly','Jack'])

In [None]:
type(ls[0])

In [None]:
org_soup.find(attrs = {'class':'ITManager'} )

In [None]:
def is_account_manager(tag):
    """Return True when *tag* carries an ``id`` attribute equal to 'Finance'.

    Intended to be passed as a filter function to ``find`` / ``find_all``.
    """
    # get() yields None for a missing attribute, which never equals
    # 'Finance', so no separate presence check is required.
    return tag.get('id') == 'Finance'

# Use the predicate as the search filter; find() gives back the first match.
ac = org_soup.find(is_account_manager)

In [None]:
type(ac)

In [None]:
ac

In [None]:
ac.attrs

In [None]:
print(*ac.children)

In [None]:
ac.li.div.string

In [None]:
ac.li.find_all('div')

In [None]:
ac.find_all('li')

In [None]:
it_org = org_soup.find(id = 'IT')

In [None]:
it_org

In [None]:
it_sibling = it_org.find_next_sibling()

In [None]:
it_sibling

In [None]:
it_parent = it_org.find_parent()

In [None]:
it_parent

# Navigating the tree

In [None]:
org_soup

In [None]:
org_soup.body


In [None]:
org_soup.ul

In [None]:
org_soup.descendants

In [None]:
# Iterating a tag directly walks its direct children only.
for child in org_soup.ul:
    print(child)

In [None]:
org_soup.li

In [None]:
# Iterating a tag directly walks its direct children only.
for child in org_soup.li:
    print(child)

In [None]:
# Print every text fragment in the document with surrounding whitespace removed.
for text in org_soup.stripped_strings:
    print(text)

In [None]:
# Walk every node beneath the root, nested ones included.
for node in org_soup.descendants:
    print(node)

In [None]:
org_soup.li

In [None]:
org_soup.li.parent

# Modifying the tree

In [None]:
employee_html = """

<employees>

    <employee class = "accountant">
    
        <firstname>Jack</firstname>
        <lastname>Roger</lastname>
    
    </employee>


 <employee class = "manager">
    
        <firstname>Ryan</firstname>
        <lastname>Marshal</lastname>
    
    </employee>
</employees>

"""

In [None]:
emp_soup= BeautifulSoup(employee_html,'html.parser')

In [None]:
emp_soup

In [None]:
emp_soup.employee

In [None]:
emp_soup.employee['class'] = "finance"

In [None]:
tag  = emp_soup.new_tag('level')
tag.string = '7'

In [None]:
emp_soup.employee.insert_after(tag)

In [None]:
emp_soup

In [None]:
tag.clear()

In [None]:
emp_soup

# Parsing a part of the document

In [None]:
#import the beautiful soup library
from bs4 import BeautifulSoup

#import library to read a website

import urllib

In [None]:
from bs4 import SoupStrainer

In [None]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly before calling urlopen.
import urllib.request

python_wiki_page = 'https://en.wikipedia.org/wiki/Python_(programming_language)'

# Open the page again; the response is still unread so that it can be
# peeked at and then parsed selectively.
python_wiki_page_html = urllib.request.urlopen(python_wiki_page)

In [None]:
python_wiki_page_html.peek()

In [None]:
from bs4 import SoupStrainer

interest_tags = SoupStrainer(id="History")

In [None]:
# Build a tree containing only the elements selected by the strainer,
# rather than parsing the whole page.
page_bytes = python_wiki_page_html.read()
history_soup = BeautifulSoup(page_bytes, 'html.parser', parse_only=interest_tags)

In [None]:
history_soup.contents

# Formatting the content

In [None]:
wiki_soup.contents

In [None]:
print(wiki_soup.prettify())

In [None]:
wiki_soup.original_encoding

In [None]:
def upper_case(strtext):
    """Return *strtext* converted to upper case.

    Used as a custom ``formatter`` callback for ``prettify``.
    """
    shouted = strtext.upper()
    return shouted

wiki_soup.prettify(formatter=upper_case)