# Data scraping using beautiful soup
1. import bs4
2. make a get request to fetch page data
3. parse html
4. filter relvant parts

In [1]:
!pip3 install bs4



In [2]:
import bs4

## Scraping table form wikipedia 
#### Using library urllib

In [3]:
from urllib.request import urlopen

### In this we are fetching data from wikipedia and the topic is Android version history

In [4]:
android_url = "https://en.wikipedia.org/wiki/Android_version_history"

## Making a get request

In [5]:
android_data = urlopen(android_url)
print(type(android_data))

<class 'http.client.HTTPResponse'>


In [6]:
android_html = android_data.read()
print(android_html)
android_data.close()



### Parsing data

In [7]:
from bs4 import BeautifulSoup as bs

In [8]:
android_soup = bs(android_html,'html.parser')
print(android_soup)

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Android version history - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"88b10536-7230-4254-8430-f1090f467397","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Android_version_history","wgTitle":"Android version history","wgCurRevisionId":1025339267,"wgRevisionId":1025339267,"wgArticleId":30752816,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Use mdy dates from March 2021","All articles with unso

In [9]:
android_soup.findAll('table')

[<table class="wikitable">
 <tbody><tr>
 <th>Name
 </th>
 <th>Version number(s)
 </th>
 <th>Initial stable<br/>release date
 </th>
 <th>Supported<br/>(security fixes)
 </th>
 <th>API level
 </th>
 <th>References
 </th></tr>
 <tr>
 <td rowspan="2">No official codename
 </td>
 <td>1.0
 </td>
 <td>September 23, 2008<span class="noprint">; 12 years ago</span><span style="display:none"> (<span class="bday dtstart published updated">2008-09-23</span>)</span>
 </td>
 <td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
 </td>
 <td>1
 </td>
 <td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-:0_14-0"><a href="#cite_note-:0-14">[14]</a></sup>
 </td></tr>
 <tr>
 <td>1.1
 </td>
 <td>February 9, 2009<span class="noprint">; 12 years ago</span><span style="display:none"> (<span class="bday dtstart published updated">2009-02-09</span

In [10]:
tables = android_soup.findAll('table',{'class':'wikitable'})

In [11]:
len(tables)

32

In [12]:
android_table = tables[0]
print(android_table)

<table class="wikitable">
<tbody><tr>
<th>Name
</th>
<th>Version number(s)
</th>
<th>Initial stable<br/>release date
</th>
<th>Supported<br/>(security fixes)
</th>
<th>API level
</th>
<th>References
</th></tr>
<tr>
<td rowspan="2">No official codename
</td>
<td>1.0
</td>
<td>September 23, 2008<span class="noprint">; 12 years ago</span><span style="display:none"> (<span class="bday dtstart published updated">2008-09-23</span>)</span>
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>1
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-:0_14-0"><a href="#cite_note-:0-14">[14]</a></sup>
</td></tr>
<tr>
<td>1.1
</td>
<td>February 9, 2009<span class="noprint">; 12 years ago</span><span style="display:none"> (<span class="bday dtstart published updated">2009-02-09</span>)</span>
</td>
<td class="tabl

## Extracting Useful Information
- Remove undesired tags
- Extract table header and data

In [13]:
headers = android_table.findAll('th')[:-1]

In [14]:
len(headers)

5

### Extract data from table header

In [15]:
column_name  = [cN.text[:-1] for cN in headers]

In [16]:
column_name

['Name',
 'Version number(s)',
 'Initial stablerelease date',
 'Supported(security fixes)',
 'API level']

In [17]:
data = android_table.findAll('tr')[1:]
print(data)

[<tr>
<td rowspan="2">No official codename
</td>
<td>1.0
</td>
<td>September 23, 2008<span class="noprint">; 12 years ago</span><span style="display:none"> (<span class="bday dtstart published updated">2008-09-23</span>)</span>
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>1
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-1"><a href="#cite_note-unofficial_and_official_codenames-9">[9]</a></sup><sup class="reference" id="cite_ref-:0_14-0"><a href="#cite_note-:0-14">[14]</a></sup>
</td></tr>, <tr>
<td>1.1
</td>
<td>February 9, 2009<span class="noprint">; 12 years ago</span><span style="display:none"> (<span class="bday dtstart published updated">2009-02-09</span>)</span>
</td>
<td class="table-no" style="background:#F99;vertical-align:middle;text-align:center;">No
</td>
<td>2
</td>
<td><sup class="reference" id="cite_ref-unofficial_and_official_codenames_9-2"><a href="#cite_note-unofficial_and_of

### Removing rowspan

In [18]:
# removing rowspan
# updated_list=[]
# for row in data:
#     row = row.findAll('td')[:-1]
#     if(len(row)<len(column_name)):
#         l = []
#         l.append(st)
#         for i in row:
#             l.append(i)
#         row=l
#     else:
#         st = row[0]
# Append this code in the below code

In [19]:
table_rows = []
for row in data:
    current_row = []
    row_data = row.findAll('td')[:-1]
    if(len(row_data)<len(column_name)):
        l = []
        l.append(st)
        for i in row_data:
            l.append(i)
        row_data=l
    else:
        st = row_data[0]
#     row_data = row.findAll('td')[:-1]

    for idx,current_data in enumerate(row_data):
        if(idx == 1):
            current_row.append(current_data.text[:-1].split(": ")[-1])
#         elif idx ==2 or idx==1:
#             info = current_data.text
        else:
            current_row.append(current_data.text[:-1])
    table_rows.append(current_row)
    
    

In [20]:
table_rows

[['No official codename',
  '1.0',
  'September\xa023, 2008; 12 years ago\xa0(2008-09-23)',
  'No',
  '1'],
 ['No official codename',
  '1.1',
  'February\xa09, 2009; 12 years ago\xa0(2009-02-09)',
  'No',
  '2'],
 ['Cupcake',
  '1.5',
  'April\xa027, 2009; 12 years ago\xa0(2009-04-27)',
  'No',
  '3'],
 ['Donut',
  '1.6',
  'September\xa015, 2009; 11 years ago\xa0(2009-09-15)',
  'No',
  '4'],
 ['Eclair',
  '2.0',
  'October\xa027, 2009; 11 years ago\xa0(2009-10-27)',
  'No',
  '5'],
 ['Eclair',
  '2.0.1',
  'December\xa03, 2009; 11 years ago\xa0(2009-12-03)',
  'No',
  '6'],
 ['Eclair',
  '2.1',
  'January\xa011, 2010; 11 years ago\xa0(2010-01-11)',
  'No',
  '7'],
 ['Froyo',
  '2.2 – 2.2.3',
  'May\xa020, 2010; 11 years ago\xa0(2010-05-20)',
  'No',
  '8'],
 ['Gingerbread',
  '2.3 – 2.3.2',
  'December\xa06, 2010; 10 years ago\xa0(2010-12-06)',
  'No',
  '9'],
 ['Gingerbread',
  '2.3.3 - 2.3.7',
  'February\xa09, 2011; 10 years ago\xa0(2011-02-09)',
  'No',
  '10'],
 ['Honeycomb',
 

### Writing and reading csv files

In [21]:
file_name  = 'android_version_history.csv'
with open(file_name,'w',encoding='utf-8')as f:
#     write the header
    header_string = ','.join(column_name)
    header_string+='\n'
    f.write(header_string)
    for i in table_rows:
        row_string = ''
        for w in i:
            w =w.replace(',',"")
            row_string+=w+','
        row_string=row_string[:-1]
        row_string +='\n'
        f.write(row_string)
        

### Cleaning the data of CSV file

In [22]:
import pandas as pd

In [23]:
dataFrame= pd.read_csv('android_version_history.csv')

In [26]:
dataFrame.head(20)

Unnamed: 0,Name,Version number(s),Initial stablerelease date,Supported(security fixes),API level
0,No official codename,1.0,September 23 2008; 12 years ago (2008-09-23),No,1
1,No official codename,1.1,February 9 2009; 12 years ago (2009-02-09),No,2
2,Cupcake,1.5,April 27 2009; 12 years ago (2009-04-27),No,3
3,Donut,1.6,September 15 2009; 11 years ago (2009-09-15),No,4
4,Eclair,2.0,October 27 2009; 11 years ago (2009-10-27),No,5
5,Eclair,2.0.1,December 3 2009; 11 years ago (2009-12-03),No,6
6,Eclair,2.1,January 11 2010; 11 years ago (2010-01-11),No,7
7,Froyo,2.2 – 2.2.3,May 20 2010; 11 years ago (2010-05-20),No,8
8,Gingerbread,2.3 – 2.3.2,December 6 2010; 10 years ago (2010-12-06),No,9
9,Gingerbread,2.3.3 - 2.3.7,February 9 2011; 10 years ago (2011-02-09),No,10


In [25]:
# iloc is used to access the row and column of data
dataFrame.iloc[2][1]

'1.5'

# Web scraping image using Beautiful soup

In [27]:
!pip install html5lib



In [33]:
import bs4
import requests
import html5lib

In [30]:
url ="https://www.passiton.com/inspirational-quotes"

In [31]:
r = requests.get(url)

In [32]:
r.content

b'<!DOCTYPE html>\n<html class="no-js" dir="ltr" lang="en-US">\n    <head>\n        <title>Inspirational Quotes - Motivational Quotes - Leadership Quotes | PassItOn.com</title>\n        <meta charset="utf-8">\n        <meta http-equiv="content-type" content="text/html; charset=utf-8" />\n        <meta http-equiv="X-UA-Compatible" content="IE=edge" />\n        <meta name="viewport" content="width=device-width,initial-scale=1.0" />\n        <meta name="description" content="The Foundation for a Better Life | Pass It On.com">\n        <link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png">\n        <link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">\n        <link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png">\n        <link rel="manifest" href="/site.webmanifest">\n        <link rel="mask-icon" href="/safari-pinned-tab.svg" color="#c8102e">\n        <meta name="msapplication-TileColor" content="#c8102e">\n        <meta name=

In [34]:
soup = bs4.BeautifulSoup(r.content,'html.parser')

In [35]:
soup

<!DOCTYPE html>

<html class="no-js" dir="ltr" lang="en-US">
<head>
<title>Inspirational Quotes - Motivational Quotes - Leadership Quotes | PassItOn.com</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="content-type">
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width,initial-scale=1.0" name="viewport"/>
<meta content="The Foundation for a Better Life | Pass It On.com" name="description"/>
<link href="/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
<link href="/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
<link href="/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
<link href="/site.webmanifest" rel="manifest"/>
<link color="#c8102e" href="/safari-pinned-tab.svg" rel="mask-icon"/>
<meta content="#c8102e" name="msapplication-TileColor"/>
<meta content="#ffffff" name="theme-color"/>
<link crossorigin="anonymous" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1

In [101]:
img_element=soup.findAll("img")

In [102]:
image =img_element[0]

In [103]:
image

<img alt="Pass It On" class="logo-dark" data-rjs="/assets/site/logo@2x-77b9f4fa1e4bafa6ff119f8f162b7bd9.png" src="/assets/site/logo-6d680decaadef58e4fbb586e147bc135.png"/>

In [104]:
img_url =image.attrs['src']

In [81]:
r=requests.get(img_url)

In [82]:
r.content

b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00\x96\x00\x96\x00\x00\xff\xed\x00TPhotoshop 3.0\x008BIM\x04\x04\x00\x00\x00\x00\x00\x1c\x1c\x01Z\x00\x03\x1b%G\x1c\x02\x00\x00\x02\x00\x04\x1c\x02\x05\x00\x08template8BIM\x04%\x00\x00\x00\x00\x00\x10h\xdaW+b\x1d5\xf3\x8c\xab&L\no\xc6]\xff\xe1\x00\xa0Exif\x00\x00MM\x00*\x00\x00\x00\x08\x00\x05\x01\x1a\x00\x05\x00\x00\x00\x01\x00\x00\x00J\x01\x1b\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01(\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00\x012\x00\x02\x00\x00\x00\x14\x00\x00\x00Z\x87i\x00\x04\x00\x00\x00\x01\x00\x00\x00n\x00\x00\x00\x00\x00\x00\x00\x96\x00\x00\x00\x01\x00\x00\x00\x96\x00\x00\x00\x012021:05:21 10:54:27\x00\x00\x03\xa0\x01\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\xa0\x02\x00\x03\x00\x00\x00\x01\x03\xe8\x00\x00\xa0\x03\x00\x03\x00\x00\x00\x01\x03\xe8\x00\x00\x00\x00\x00\x00\xff\xe2\x02dICC_PROFILE\x00\x01\x01\x00\x00\x02Tlcms\x040\x00\x00mntrRGB XYZ \x07\xe5\x00\x05\x00\x15\x00\x10\x00!\x00*acspAPPL\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00

In [83]:
with open("pass_it_on_2.jpg","wb") as f:
    f.write(r.content)

# iterating over all element

In [105]:
img_element=img_element
for img in img_element:
    img_url = img.attrs['src']
    try:
        r =requests.get(img_url)
    except Exception:
        continue
    name=''
    for idx,element in enumerate(img_url):
        if element == '?':
            i=idx
    for element in img_url[i+1:]:
        name+=element
    name+=".jpg"
    with open(name,"wb") as f:
        f.write(r.content)