In [16]:
# import beautiful soup
from bs4 import BeautifulSoup
import requests

# most websites have robots.txt to see which endpoints are allowed to scrape
url = 'http://quotes.toscrape.com/'

# using request library to get response from the server
response = requests.get(url)
response

<Response [200]>

In [18]:
# Response 200 - says our request was success
# extracting response as HTML data
html_data = response.text

# creating an object for Beautiful soup
soup = BeautifulSoup(html_data)

# print title 
print(soup.title)

# prettify
print(soup.prettify())

<title>Quotes to Scrape</title>
<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Quotes to Scrape
  </title>
  <link href="/static/bootstrap.min.css" rel="stylesheet"/>
  <link href="/static/main.css" rel="stylesheet"/>
 </head>
 <body>
  <div class="container">
   <div class="row header-box">
    <div class="col-md-8">
     <h1>
      <a href="/" style="text-decoration: none">
       Quotes to Scrape
      </a>
     </h1>
    </div>
    <div class="col-md-4">
     <p>
      <a href="/login">
       Login
      </a>
     </p>
    </div>
   </div>
   <div class="row">
    <div class="col-md-8">
     <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
      <span class="text" itemprop="text">
       “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
      </span>
      <span>
       by
       <small class="author" itemprop="author">
        Albert Einstein
       </small

In [20]:
# extracting quotes from div tag
# <div class="quote" 

quot = soup.find('div',class_='quote')
quot

<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
</div>
</div>

In [25]:
# <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
# <span class="text" itemprop="text">

# If you see above output we can see quotes but still some junk. We can clean it using .test
quot = soup.find('span',class_='text').text
quot

'“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”'

In [26]:
# Find all 'a' tags
a_tags = soup.find_all('a')

# Print the URLs 
for link in a_tags:
    print(link.get('href'))

/
/login
/author/Albert-Einstein
/tag/change/page/1/
/tag/deep-thoughts/page/1/
/tag/thinking/page/1/
/tag/world/page/1/
/author/J-K-Rowling
/tag/abilities/page/1/
/tag/choices/page/1/
/author/Albert-Einstein
/tag/inspirational/page/1/
/tag/life/page/1/
/tag/live/page/1/
/tag/miracle/page/1/
/tag/miracles/page/1/
/author/Jane-Austen
/tag/aliteracy/page/1/
/tag/books/page/1/
/tag/classic/page/1/
/tag/humor/page/1/
/author/Marilyn-Monroe
/tag/be-yourself/page/1/
/tag/inspirational/page/1/
/author/Albert-Einstein
/tag/adulthood/page/1/
/tag/success/page/1/
/tag/value/page/1/
/author/Andre-Gide
/tag/life/page/1/
/tag/love/page/1/
/author/Thomas-A-Edison
/tag/edison/page/1/
/tag/failure/page/1/
/tag/inspirational/page/1/
/tag/paraphrased/page/1/
/author/Eleanor-Roosevelt
/tag/misattributed-eleanor-roosevelt/page/1/
/author/Steve-Martin
/tag/humor/page/1/
/tag/obvious/page/1/
/tag/simile/page/1/
/page/2/
/tag/love/
/tag/inspirational/
/tag/life/
/tag/humor/
/tag/books/
/tag/reading/
/tag/fri

* Once we scrape our HTML data from web, we can store it any format we want(list, dictionaries etc), dataframe

In [28]:
# getting all div tags with class quote
all_divs = soup.find_all('div',class_='quote')
all_divs

[<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
 <span>by <small class="author" itemprop="author">Albert Einstein</small>
 <a href="/author/Albert-Einstein">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
 <a class="tag" href="/tag/change/page/1/">change</a>
 <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
 <a class="tag" href="/tag/thinking/page/1/">thinking</a>
 <a class="tag" href="/tag/world/page/1/">world</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
 <span>by <small class="author" itempr

In [45]:
import pandas as pd

# creating an empty list to store quotes and author name
quotes_list = []

# looping through inside all divs and extract quotes and author
for i in all_divs:
    try:
        quotes = i.find('span',class_='text').text
        author = i.find('small',class_='author').text
        a_tag = i.find('a',class_='tag').text
        quote_dict = {'quotes': quotes,
                      'author': author,
                      'a_tag': a_tag}
        # appending dictionary into our empty list
        quotes_list.append(quote_dict)
    except Exception as e:
        print(e)
        
# creating Dataframe
quotes_df = pd.DataFrame(quotes_list)
quotes_df


Unnamed: 0,quotes,author,a_tag
0,“The world as we have created it is a process ...,Albert Einstein,change
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,abilities
2,“There are only two ways to live your life. On...,Albert Einstein,inspirational
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,aliteracy
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,be-yourself
5,“Try not to become a man of success. Rather be...,Albert Einstein,adulthood
6,“It is better to be hated for what you are tha...,André Gide,life
7,"“I have not failed. I've just found 10,000 way...",Thomas A. Edison,edison
8,“A woman is like a tea bag; you never know how...,Eleanor Roosevelt,misattributed-eleanor-roosevelt
9,"“A day without sunshine is like, you know, nig...",Steve Martin,humor


In [37]:
# Yay!! We printed for 1st page. Scarpe link has 10 pages. Now we can loop through to all pages and print entire quotes

pages_list = []
# looping through all 10 pages
for i in range(1,11):
    url = f'http://quotes.toscrape.com/page/{i}'
    # getting sll divs from all pages
    all_divs = soup.find_all('div', class_='quote')
    for j in all_divs:
        try:
            quotes = j.find('span', class_='text').text
            author = j.find('small',class_='author').text
            pages_dict ={'quotes': quotes,
                         'author': author}
            # appending it into list
            pages_list.append(pages_dict)
        except Exception as e:
            print(e)
            
pages_df = pd.DataFrame(pages_list)
pages_df

Unnamed: 0,quotes,author
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe
...,...,...
95,“Try not to become a man of success. Rather be...,Albert Einstein
96,“It is better to be hated for what you are tha...,André Gide
97,"“I have not failed. I've just found 10,000 way...",Thomas A. Edison
98,“A woman is like a tea bag; you never know how...,Eleanor Roosevelt


## Using Splinter- Chrome driver

In [54]:
from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager
# pip install webdriver_manager

executable_path = {'executable_path':ChromeDriverManager().install()}
browser = Browser('chrome',**executable_path, headless = False)



INFO:WDM:

Current google-chrome version is 98.0.4758
INFO:WDM:Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome
INFO:WDM:Get LATEST chromedriver version for 98.0.4758 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
INFO:WDM:Trying to download new driver from https://chromedriver.storage.googleapis.com/98.0.4758.102/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\Codergirl\.wdm\drivers\chromedriver\win32\98.0.4758.102]
INFO:WDM:Driver has been saved in cache [C:\Users\Codergirl\.wdm\drivers\chromedriver\win32\98.0.4758.102]


In [None]:
# now new chrome browser opened in my computer

In [50]:
all_quotes = []
url = 'http://quotes.toscrape.com/'
# we are asking our browser to open the url
browser.visit(url)
# Now quotes to scrape url opened in chrome
# looping through all pages and getting quotes an author
for i in range(1,11):
    soup = BeautifulSoup(browser.html,'html.parser')
    # getting all div tags from all pages
    all_divs = soup.find_all('div',class_='quote')
    for j in all_divs:
        try:
            quotes = j.find('span',class_='text').text
            author = j.find('small',class_='author').text
            quotes_dict = {'quotes': quotes,
                           'author': author}
            all_quotes.append(all_quotes)
        except Exception as e:
            print(e)
    # we are programmatically clicking a link from the front page to go to another page by interacting with the browser
    # got the word "Next" from html page to click Next to change pages
    browser.links.find_by_partial_text('Next').click() # find the link and click on it
    

df = pd.DataFrame(all_quotes)
df

ElementDoesNotExist: no elements could be found with link by partial text "Next"

In [60]:
# Since we are calling all pages, it is not loading. So lets check few pages first
# we can use sleep for 1 sec if its not loading completely

import time
all_quotes = []
url = 'http://quotes.toscrape.com/'
browser.visit(url)
for i in range(1,10):
    soup = BeautifulSoup(browser.html,'html.parser')
    all_divs = soup.find_all('div',class_='quote')
    for div in all_divs:
        try:
            quote_text = div.find('span', class_='text').text
            #find the author
            author = div.find('small', class_='author').text
            #print(author)
            tags = div.find('div',class_='tags').find_all('a')
            tags_list=[]
            for tag in tags:
                tags_list.append(tag.text)
                
            quote_dict = {'quote':quote_text,
                         'author':author,
                         'tags':tags_list}
            all_quotes.append(quote_dict)
        except AttributeError as e:
            print(e)
    browser.links.find_by_partial_text('Next').click()
    # using sleep
    time.sleep(1)

df = pd.DataFrame(all_quotes)
df

Unnamed: 0,quote,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
2,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
...,...,...,...
85,“Some day you will be old enough to start read...,C.S. Lewis,"[age, fairytales, growing-up]"
86,“We are not necessarily doubting that God will...,C.S. Lewis,[god]
87,“The fear of death follows from the fear of li...,Mark Twain,"[death, life]"
88,“A lie can travel half way around the world wh...,Mark Twain,"[misattributed-mark-twain, truth]"


In [None]:
# I was able to get quotes and author for 9 pages