**WEB SCRAPING IN PYTHON**

In [1]:
import requests

In [4]:
import bs4

In [5]:
result = requests.get("http://www.example.com")

In [7]:
type(result)

requests.models.Response

In [8]:
result.text

'<!doctype html><html lang="en"><head><title>Example Domain</title><meta name="viewport" content="width=device-width, initial-scale=1"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href="https://iana.org/domains/example">Learn more</a></div></body></html>\n'

In [9]:
import bs4

In [10]:
soup = bs4.BeautifulSoup(result.text, "lxml")

In [11]:
soup

<!DOCTYPE html>
<html lang="en"><head><title>Example Domain</title><meta content="width=device-width, initial-scale=1" name="viewport"/><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style></head><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.</p><p><a href="https://iana.org/domains/example">Learn more</a></p></div></body></html>

In [12]:
soup.select('title')

[<title>Example Domain</title>]

In [13]:
soup.select('p')

[<p>This domain is for use in documentation examples without needing permission. Avoid use in operations.</p>,
 <p><a href="https://iana.org/domains/example">Learn more</a></p>]

In [14]:
soup.select('h1')

[<h1>Example Domain</h1>]

In [15]:
soup.select('title')[0].getText()

'Example Domain'

In [16]:
site_paragraphs = soup.select("p")

In [17]:
site_paragraphs[0].getText()

'This domain is for use in documentation examples without needing permission. Avoid use in operations.'

In [18]:
res = requests.get('https://en.wikipedia.org/wiki/Grace_Hopper')

In [20]:
soup = bs4.BeautifulSoup(res.text, "lxml")

In [21]:
soup

<html><body><p>Please set a user-agent and respect our robot policy https://w.wiki/4wJS. See also https://phabricator.wikimedia.org/T400119.
</p></body></html>

In [22]:
soup.select('.toctext')

[]

In [24]:
res = requests.get('https://en.wikipedia.org/wiki/Grace_Hopper')
soup = bs4.BeautifulSoup(res.text,"lxml")
soup.select(".vector-toc-text")

[]

In [25]:
for item in soup.select(".vector-toc-text"):
    print(item.text)

In [26]:
res = requests.get("https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)")

In [27]:
soup = bs4.BeautifulSoup(res.text, 'lxml')

In [28]:
soup

<html><body><p>Please set a user-agent and respect our robot policy https://w.wiki/4wJS. See also https://phabricator.wikimedia.org/T400119.
</p></body></html>

In [31]:
soup.select('img')

[]

In [35]:
headers = {"User-Agent": "CrestTraining/1.0"}

res = requests.get("https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)", headers= headers)
soup = bs4.BeautifulSoup(res.text,'lxml')
image_info = soup.select('.mw-file-element')
image_info

[<img alt="This is a good article. Click here for more information." class="mw-file-element" data-file-height="185" data-file-width="180" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/20px-Symbol_support_vote.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/40px-Symbol_support_vote.svg.png 1.5x" width="19"/>,
 <img class="mw-file-element" data-file-height="601" data-file-width="400" decoding="async" height="376" src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/250px-Deep_Blue.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/400px-Deep_Blue.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/b/be/Deep_Blue.jpg 2x" width="250"/>,
 <img class="mw-file-element" data-file-height="64" data-file-width="64" decoding="async" height="150" src="//upload.wikimedia.org/wikipedia/commons/thumb/5/52/Chess_Programming.svg/250px-Chess_Programming.svg.png" 

In [36]:
print(len(image_info))
computer = image_info[1]
type(computer)

11


bs4.element.Tag

In [37]:
computer['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/250px-Deep_Blue.jpg'

In [38]:
image_link = requests.get('https://upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/250px-Deep_Blue.jpg')
#image_link.content

**Working with Multiple Pages and Items**

In [39]:
base_url = 'http://books.toscrape.com/catalogue/page-{}.html'

In [40]:
pg_num = 12
base_url.format(pg_num)

'http://books.toscrape.com/catalogue/page-12.html'

In [41]:
res = requests.get(base_url.format('1'))
soup = bs4.BeautifulSoup(res.text,"lxml")
soup.select(".product_pod")

[<article class="product_pod">
 <div class="image_container">
 <a href="a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
 </div>
 <p class="star-rating Three">
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 <i class="icon-star"></i>
 </p>
 <h3><a href="a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
 <div class="product_price">
 <p class="price_color">Â£51.77</p>
 <p class="instock availability">
 <i class="icon-ok"></i>
     
         In stock
     
 </p>
 <form>
 <button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
 </form>
 </div>
 </article>,
 <article class="product_pod">
 <div class="image_container">
 <a href="tipping-the-velvet_999/index.html"><img alt="Tipping the Velvet" class="thumbnail" src="../media/cach

In [42]:
products = soup.select(".product_pod")
example = products[0]
type(example)

bs4.element.Tag