<a href="https://colab.research.google.com/github/niladribanerjee80/python-udemy-jose/blob/main/Section_15_Jose_Web_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv
import os
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

In [None]:
# Fetch the Jonas Salk article and show its <title> element: first the list
# of matching tags, then the text of the first match.
result = requests.get("https://en.wikipedia.org/wiki/Jonas_Salk").text
soup = BeautifulSoup(result, "lxml")

title_tags = soup.select('title')
print(title_tags)
print(title_tags[0].getText())

[<title>Jonas Salk - Wikipedia</title>]
Jonas Salk - Wikipedia


In [None]:
# Collect the text of every <p> element, leaving out paragraphs whose text
# is exactly a single newline.
site_paragraphs = soup.select('p')

paras = [
    paragraph_text
    for paragraph_text in (tag.getText() for tag in site_paragraphs)
    if paragraph_text != '\n'
]

| Selector                      | Matches |
| ----------------------------- | ------- |
| soup.select('div')            | all elements with the 'div' tag |
| soup.select('#some_id')       | the element whose id is 'some_id' |
| soup.select('.some_class')    | all elements whose class includes 'some_class' |
| soup.select('div span')       | any span elements nested anywhere within a div element |
| soup.select('div > span')     | any span elements directly within a div element, with nothing in between |

In [None]:
# Print the table-of-contents headings of the Neil Armstrong article.

result = requests.get("https://en.wikipedia.org/wiki/Neil_Armstrong").text
soup = BeautifulSoup(result, "lxml")

list_of_items = list(soup.select('.vector-toc-text'))

for toc_entry in list_of_items:
    # locate the span that carries the heading text
    heading_span = toc_entry.find('span', class_='')

    # entries without such a span have nothing to print
    if not heading_span:
        continue

    print(heading_span.text)

Early life and education
Navy service
College years
Test pilot
Astronaut career
Gemini program
Gemini 5
Gemini 8
Gemini 11
Apollo program
Apollo 11
Voyage to the Moon
First Moon walk
Return to Earth
Life after Apollo
Teaching
NASA commissions
Business activities
North Pole expedition
Public profile
Personal life
Illness and death
Legacy
See also
Notes
References
Further reading
External links


In [None]:
 # working with images

In [None]:
# Download the Deep Blue article and parse it.
deep_blue_url = "https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)"
result = requests.get(deep_blue_url).text

soup = BeautifulSoup(result, "lxml")

# An image tag from this page looks like:
# <img src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg"/>


<img src="//upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg"/>

In [None]:
# Download every element matched by ".mw-file-element" on the Deep Blue
# article into the working directory as file_<n><ext>, where <n> is the
# element's 1-based position among the matches.
page_url = "https://en.wikipedia.org/wiki/Deep_Blue_(chess_computer)"
result = requests.get(page_url).text
soup = BeautifulSoup(result, "lxml")

# list all image tags and download them
result_images = soup.select(".mw-file-element")

for i, image in enumerate(result_images, start=1):

    # get the source; skip matched elements that carry no src attribute
    source = image.get('src')
    if not source:
        continue

    # resolve protocol-relative ("//host/...") and relative sources against
    # the page URL; an already absolute URL is left untouched
    source = urljoin(page_url, source)

    # get the image downloaded as binary
    image_binary = requests.get(source)

    # do not save an error page under an image file name
    if not image_binary.ok:
        print(f"Image URL : {source}, skipped (HTTP {image_binary.status_code})")
        print('\n')
        continue

    # take the extension from the URL path, so query strings and extensions
    # that are not exactly three letters long (".jpeg", ".webp") are handled
    ext = os.path.splitext(urlparse(source).path)[1]

    # print the new filename
    print(f"Image URL : {source}, downloaded into file_{i}{ext}")
    print('\n')

    # the context manager closes the file even if the write fails
    with open(f"file_{i}{ext}", "wb") as f:
        f.write(image_binary.content)

Image URL : https://upload.wikimedia.org/wikipedia/en/thumb/9/94/Symbol_support_vote.svg/19px-Symbol_support_vote.svg.png, downloaded into file_1.png


Image URL : https://upload.wikimedia.org/wikipedia/commons/thumb/b/be/Deep_Blue.jpg/220px-Deep_Blue.jpg, downloaded into file_2.jpg


Image URL : https://upload.wikimedia.org/wikipedia/commons/thumb/5/52/Chess_Programming.svg/150px-Chess_Programming.svg.png, downloaded into file_3.png


Image URL : https://upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Kasparov_Magath_1985_Hamburg-2.png/220px-Kasparov_Magath_1985_Hamburg-2.png, downloaded into file_4.png


Image URL : https://upload.wikimedia.org/wikipedia/commons/thumb/8/83/One_of_Deep_Blue%27s_processors_%282586060990%29.jpg/220px-One_of_Deep_Blue%27s_processors_%282586060990%29.jpg, downloaded into file_5.jpg


Image URL : https://upload.wikimedia.org/wikipedia/commons/thumb/0/05/Chess.svg/28px-Chess.svg.png, downloaded into file_6.png


Image URL : https://upload.wikimedia.org/wi

In [None]:
rm file*

## Web scraping exercise
* www.toscrape.com (Books) - this site is not available.
* Instead we used: https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops

In [21]:
import requests
from bs4 import BeautifulSoup

def _stripped_text(tag):
    """Return the whitespace-stripped text of ``tag``, or None if ``tag`` is None."""
    return tag.text.strip() if tag is not None else None


def extract_product_data(url, timeout=10):
    """Scrape the product cards found on a listing page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        A list with one dict per product card, each holding the keys
        'title', 'price', 'description' and 'rating'. The first three are
        stripped text; 'rating' is the raw value of the card's
        ``data-rating`` attribute. A value is None when the card lacks the
        corresponding element.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # make a GET request to the url; fail loudly instead of parsing an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # parse the html content using beautifulsoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # CSS selectors match on individual classes, so the lookups keep working
    # if the page reorders its classes or adds further ones
    products = []
    for wrapper in soup.select('div.product-wrapper.card-body'):
        rating_tag = wrapper.select_one('p[data-rating]')

        products.append({
            'title': _stripped_text(wrapper.select_one('a.title')),
            'price': _stripped_text(wrapper.select_one('h4.price')),
            'description': _stripped_text(wrapper.select_one('p.description')),
            'rating': rating_tag['data-rating'] if rating_tag is not None else None,
        })

    return products


In [22]:
# Scrape the laptop listing of the webscraper.io test site.
laptops_url = "https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops"
products = extract_product_data(laptops_url)

In [24]:
# Display the scraped product records.
products

[{'title': 'Asus VivoBook...',
  'price': '$295.99',
  'description': 'Asus VivoBook X441NA-GA190 Chocolate Black, 14", Celeron N3450, 4GB, 128GB SSD, Endless OS, ENG kbd',
  'rating': '3'},
 {'title': 'Prestigio Smar...',
  'price': '$299',
  'description': 'Prestigio SmartBook 133S Dark Grey, 13.3" FHD IPS, Celeron N3350 1.1GHz, 4GB, 32GB, Windows 10 Pro + Office 365 1 gadam',
  'rating': '2'},
 {'title': 'Prestigio Smar...',
  'price': '$299',
  'description': 'Prestigio SmartBook 133S Gold, 13.3" FHD IPS, Celeron N3350 1.1GHz, 4GB, 32GB, Windows 10 Pro + Office 365 1 gadam',
  'rating': '4'},
 {'title': 'Aspire E1-510',
  'price': '$306.99',
  'description': '15.6", Pentium N3520 2.16GHz, 4GB, 500GB, Linux',
  'rating': '3'},
 {'title': 'Lenovo V110-15...',
  'price': '$321.94',
  'description': 'Lenovo V110-15IAP, 15.6" HD, Celeron N3350 1.1GHz, 4GB, 128GB SSD, Windows 10 Home',
  'rating': '3'},
 {'title': 'Lenovo V110-15...',
  'price': '$356.49',
  'description': 'Asus VivoBook