# Data scraping using Beautiful Soup
- import Beautiful Soup
- make a GET request to fetch the page data
- parse the HTML
- filter out the relevant part

In [None]:
!pip install bs4

In [None]:
from urllib.request import urlopen

In [None]:
android_url = "https://en.wikipedia.org/wiki/Android_version_history"

In [None]:
# Open the page over HTTP; the returned response object is read and closed
# in a later cell.
android_data = urlopen(android_url)
print(type(android_data))

In [None]:
# Read the whole response body (raw bytes). The finally clause guarantees the
# connection is released even if the read fails part-way through.
try:
    android_html = android_data.read()
finally:
    android_data.close()
print(android_html)

## Parsing data

In [None]:
from bs4 import BeautifulSoup as soup

In [None]:
# Build the parse tree from the downloaded bytes using the 'html.parser' backend.
android_soup = soup(android_html, 'html.parser')

In [None]:
print(android_soup)

In [None]:
print(type(android_soup))

In [None]:
# Collect every <table> whose CSS classes include "wikitable".
# find_all is the current method name; findAll is a deprecated alias.
tables = android_soup.find_all('table', class_='wikitable')
print(len(tables))

In [None]:
# Work with the first matching table only.
android_table = tables[0]
print(android_table)

## Extracting useful information
- remove undesired tags
- extract table header and data

In [None]:
# All header cells (<th>) of the table.
# find_all is the current method name; findAll is a deprecated alias.
headers = android_table.find_all('th')
print(headers)

In [None]:
# Header text with surrounding whitespace removed. strip() is used instead of
# slicing off the last character, which would eat a real character whenever
# a header does not end with a newline.
column_titles = [ct.text.strip() for ct in headers]
print(column_titles)

In [None]:
# Skip the first <tr>: it is the header row, whose <th> cells have already
# been extracted. find_all replaces the deprecated findAll alias.
rows_data = android_table.find_all('tr')[1:]

In [None]:
# Data cells (<td>) of the first body row; find_all replaces the deprecated
# findAll alias.
first_row = rows_data[0].find_all('td')

In [None]:
# Show the text content of each cell in the first row.
for d in first_row:
    print(d.text)

In [None]:
table_rows = []  # one list of cleaned cell strings per table row
for row in rows_data:
    current_row = []
    for i, cell in enumerate(row.find_all('td')):
        # strip() removes the surrounding whitespace/newline without
        # truncating cells that do not end in a newline (the old [:-1]
        # slice dropped a real character in that case).
        text = cell.text.strip()
        if i == 2:
            # remove commas from the date
            text = text.replace(',', '')
        current_row.append(text)
    table_rows.append(current_row)
print(table_rows)

## Writing and reading a CSV file

In [None]:
import csv

filename = 'android_version_history.csv'
# newline='' hands line-ending control to the csv module, as its docs require.
with open(filename, 'w', encoding='utf-8', newline='') as f:
    # csv.writer quotes any field containing a comma, quote or newline, so
    # such cells no longer split into extra columns or rows.
    writer = csv.writer(f, lineterminator='\n')

    # write the header
    print(','.join(column_titles))
    writer.writerow(column_titles)

    # add row data
    writer.writerows(table_rows)

## Data Cleaning
- remove undesired commas and symbols
- drop undesired information

In [1]:
import pandas as pd

In [4]:
df = pd.read_csv('android_version_history.csv')

In [None]:
df.head(n=18)

## Loading local files

In [None]:
# just open the file with open instead of making a GET request
# For a page saved on disk, read the file instead of making a GET request
# and hand its contents to the parser.
with open('android.html', encoding='utf-8') as html_file:
    page_soup = soup(html_file.read(), 'html.parser')

In [None]:
# All <h1> headings in the page (a list, not just the first one).
# find_all replaces the deprecated findAll alias.
page_soup.find_all('h1')

# Image Scraping
https://www.passiton.com/inspirational-quotes

In [1]:
!pip install html5lib



In [4]:
import bs4
import requests

In [5]:
url = "https://www.passiton.com/inspirational-quotes"

In [6]:
# A timeout keeps the cell from hanging forever on an unresponsive server;
# raise_for_status() turns HTTP error codes (4xx/5xx) into an exception
# instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [25]:
print(response.content)

b'<!DOCTYPE html>\n<html class="no-js" dir="ltr" lang="en-US">\n    <head>\n        <title>Inspirational Quotes - Motivational Quotes - Leadership Quotes | PassItOn.com</title>\n        <meta charset="utf-8">\n        <meta http-equiv="content-type" content="text/html; charset=utf-8" />\n        <meta http-equiv="X-UA-Compatible" content="IE=edge" />\n        <meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=1" />\n        <meta name="description" content="The Foundation for a Better Life | Pass It On.com">\n        <link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png">\n        <link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">\n        <link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png">\n        <link rel="manifest" href="/site.webmanifest">\n        <link rel="mask-icon" href="/safari-pinned-tab.svg" color="#c8102e">\n        <meta name="msapplication-TileColor" content="#c8102e">\n   

In [11]:
# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and warns), so results can differ between
# machines. html5lib is the parser installed for this section.
# NOTE: this rebinds `soup`, which earlier in the notebook is an alias for the
# BeautifulSoup class; re-run that import before re-running the earlier cells.
soup = bs4.BeautifulSoup(response.content, 'html5lib')

In [31]:
# Every <img> tag except the first two and the last two.
# find_all replaces the deprecated findAll alias.
images = soup.find_all('img')[2:-2]

In [32]:
print(images)

[<img alt="Good character is more to be praised than outstanding talent. Most talents are to some extent a gift. Good character, by contrast, is not given to us. We have to build it piece by piece by thought, choice, courage and determination. #&lt;Author:0x00007f60a116e280&gt;" class="margin-10px-bottom shadow" height="310" src="https://assets.passiton.com/quotes/quote_artwork/7364/medium/20200721_tuesday_quote.jpg?1595010959" width="310"/>, <img alt="People grow through experience if they meet life honestly and courageously. This is how character is built. #&lt;Author:0x00007f60a117d848&gt;" class="margin-10px-bottom shadow" height="310" src="https://assets.passiton.com/quotes/quote_artwork/4393/medium/20200720_monday_quote.jpg?1595010931" width="310"/>, <img alt="A life lived with integrity… is a shining star in whose light others may follow in the years to come. #&lt;Author:0x00007f60a1192bd0&gt;" class="margin-10px-bottom shadow" height="310" src="https://assets.passiton.com/quote

In [33]:
# `images` now holds the <img> tags we need; what we want from each one is
# the value of its src attribute. A tag's attributes can be looked up by
# name directly on the tag (they are stored in its `attrs` dictionary).
images[0]['src']

'https://assets.passiton.com/quotes/quote_artwork/7364/medium/20200721_tuesday_quote.jpg?1595010959'

In [34]:
# now we need to write the image in a file
with open('inspiration.jpg', 'wb') as file:
    image_url = images[0].attrs['src']
    response = requests.get(image_url)
    file.write(response.content)

In [35]:
# we can iterate over all the img tags and download every image
# for i, image in enumerate(images):
#     with open('inspiration{}.jpg'.format(i), 'wb') as file:
#         image_url = image.attrs['src']
#         response = requests.get(image_url)
#         file.write(response.content)