# Screen Scraping

In [1]:
import requests

import bs4

import html5lib

from IPython.display import clear_output

import time

# Lab: Beautiful Soup - Screen Scraping Web Pages Using Python

## Recall that screen scraping is a "last resort" when an API is not available; legal issues, ethical issues, term of service issues, etc.; inexact as we shall see

## Beautiful Soup is a Python module that allows us to walk through HTML that has been parsed by a parser such as html5lib;  

## Note that a lot of web page content is generated by client side scripts; to parse this, we must have a browser emulator that runs the scripts;  what we are doing here will not be able to emulate a browser and run scripts, for that we need commercial software products


## Get the HTML for our landing page

In [2]:
r = requests.get("https://nginx", verify=False)



In [3]:
r.status_code

200

In [4]:
r.text[:500]

'<!DOCTYPE html>\n\n<html>\n    \n<body>\n    \n    <title> Landing Page </title>\n\n    <h1> Welcome to the Landing Page for this Web Server! </h1>\n    \n    <h2> In the beginning, html was simply content without formatting, like this very simple landing page. </h2>\n    \n    <h2> Next, html added formatting inside the html. </h2>\n    \n    <h2> Next, content and formatting were separated: \n        content was placed in html files, \n        while formatting was placed in css (cascading style sheets) files.'

## Create a beautiful soup object from the HTML on our landing page

In [5]:
soup = bs4.BeautifulSoup(r.text, "html5lib")

## Beautiful soup can pretty print the HTML

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
 </head>
 <body>
  <title>
   Landing Page
  </title>
  <h1>
   Welcome to the Landing Page for this Web Server!
  </h1>
  <h2>
   In the beginning, html was simply content without formatting, like this very simple landing page.
  </h2>
  <h2>
   Next, html added formatting inside the html.
  </h2>
  <h2>
   Next, content and formatting were separated: 
        content was placed in html files, 
        while formatting was placed in css (cascading style sheets) files.
  </h2>
  <h2>
   Today, RWD (responsive web design) is the order of the day.  
        RWD sites are able to display properly on all platforms from desktops to laptops to tablets to phones.  
        The most widely used framework for RWD is the open source Bootstrap.  
        The Bootstrap examples have been downloaded into the static folder, so you can run them 
        using Flask from the links below.
  </h2>
  <h1>
   RWD Examples using Bootstrap 5 (be sure and also try them on a pho

## One very useful thing to do in screen scraping is get all the non-html text on the page;  this is often a "first cut" to see if it's parseable; how much trouble it will be to parse; etc.;

In [7]:
print(soup.get_text())


    
     Landing Page 

     Welcome to the Landing Page for this Web Server! 
    
     In the beginning, html was simply content without formatting, like this very simple landing page. 
    
     Next, html added formatting inside the html. 
    
     Next, content and formatting were separated: 
        content was placed in html files, 
        while formatting was placed in css (cascading style sheets) files.  
    
     Today, RWD (responsive web design) is the order of the day.  
        RWD sites are able to display properly on all platforms from desktops to laptops to tablets to phones.  
        The most widely used framework for RWD is the open source Bootstrap.  
        The Bootstrap examples have been downloaded into the static folder, so you can run them 
        using Flask from the links below.
   
    
    RWD Examples using Bootstrap 5 (be sure and also try them on a phone and tablet to see how RWD works on those):
    
    Cheatsheet
    
    Blog
    
    Dashboa

## Another very useful thing to do in screen scraping is to get a list of links on a page; this can be used to create a "web crawler" that recursively goes down all links

In [8]:
for link in soup.find_all('a'):
    print(link.get('href'))

bootstrap-5.1.3-examples/cheatsheet/index.html
bootstrap-5.1.3-examples/blog/index.html
bootstrap-5.1.3-examples/dashboard/index.html
bootstrap-5.1.3-examples/pricing/index.html
bootstrap-5.1.3-examples/checkout/index.html
bootstrap-5.1.3-examples/carousel/index.html
bootstrap-5.1.3-examples/headers/index.html
bootstrap-5.1.3-examples/heroes/index.html
bootstrap-5.1.3-examples/features/index.html
bootstrap-5.1.3-examples/sidebars/index.html
bootstrap-5.1.3-examples/footers/index.html
bootstrap-5.1.3-examples/dropdowns/index.html
bootstrap-5.1.3-examples/list-groups/index.html
bootstrap-5.1.3-examples/modals/index.html
bootstrap-5.1.3-examples/album/index.html
bootstrap-5.1.3-examples/product/index.html
bootstrap-5.1.3-examples/cover/index.html
bootstrap-5.1.3-examples/sign-in/index.html
bootstrap-5.1.3-examples/sticky-footer/index.html
bootstrap-5.1.3-examples/sticky-footer-navbar/index.html
bootstrap-5.1.3-examples/jumbotron/index.html
bootstrap-5.1.3-examples/starter-template/index.h

## A simple 1 level deep web crawler; crawl all the links on our landing page, and for each link, display the non-html text on that page

In [9]:
sleep_interval = 3.0

for link in soup.find_all('a'):
    
    href = link.get('href')
    
    clear_output(wait=True)
    
    print("\n\n")
    print("---------------------------------------------------------")
    print(href)
    print("---------------------------------------------------------")
    
    
    
    r = requests.get("https://nginx/" + href, verify=False)
    
    if r.status_code != 200:
        
        print("Error - not able to get the link!")
        
    else:
        
        temp_soup = bs4.BeautifulSoup(r.text, "html5lib")
        
        print(temp_soup.get_text())
        
    time.sleep(sleep_interval)
        




---------------------------------------------------------
bootstrap-5.1.3-examples/offcanvas-navbar/index.html
---------------------------------------------------------

    
    
    
    
    
    Offcanvas navbar template Â· Bootstrap v5.1

    

    

    


    
      .bd-placeholder-img {
        font-size: 1.125rem;
        text-anchor: middle;
        -webkit-user-select: none;
        -moz-user-select: none;
        user-select: none;
      }

      @media (min-width: 768px) {
        .bd-placeholder-img-lg {
          font-size: 3.5rem;
        }
      }
    

    
    
    
  
  
    

  
    Offcanvas navbar
    
      
    

    
      
        
          Dashboard
        
        
          Notifications
        
        
          Profile
        
        
          Switch account
        
        
          Settings
          
            Action
            Another action
            Something else here
          
        
      
      
        
        Search
     



## You try it - try screen scraping google.com; get the non-html text and a list of links; also try parsing a google search result, such as google.com/search?q=data+science; solutions are in screen_scraping_solutions

In [10]:
r = requests.get("https://google.com")

In [11]:
r.status_code

200

In [12]:
soup = bs4.BeautifulSoup(r.text, "html5lib")

In [13]:
for link in soup.find_all('a'):
    print(link.get('href'))

https://www.google.com/imghp?hl=en&tab=wi
https://maps.google.com/maps?hl=en&tab=wl
https://play.google.com/?hl=en&tab=w8
https://www.youtube.com/?tab=w1
https://news.google.com/?tab=wn
https://mail.google.com/mail/?tab=wm
https://drive.google.com/?tab=wo
https://www.google.com/intl/en/about/products?tab=wh
http://www.google.com/history/optout?hl=en
/preferences?hl=en
https://accounts.google.com/ServiceLogin?hl=en&passive=true&continue=https://www.google.com/&ec=GAZAAQ
/advanced_search?hl=en&authuser=0
/intl/en/ads/
/services/
/intl/en/about.html
/intl/en/policies/privacy/
/intl/en/policies/terms/


In [15]:
print(soup.get_text())

Google(function(){var _g={kEI:'ODTjZ_ecHouW5OMPmd6UoAU',kEXPI:'0,202792,3497518,639,435,538661,48791,46127,344796,100064,128055,10339,8862,23156,13154,6414,5241722,17,15,34,8834859,79,1,2,4,2,3,1,23933876,4043709,25227439,1242,46375,4815,5,60926,11867,14280,8181,5934,8939,44486,8,9074,2661,3433,3319,23879,9139,739,1,3,3856,328,6225,1116,33194,29855,1346,5402,8301,8210,3286,4134,12133,13884,4363,17667,10666,21344,2987,1514,4,3836,41,13162,477,1,2619,2303,616,1203,2847,1258,353,4100,3284,3347,1609,6540,5870,950,2150,4607,7,5774,4309,4666,7987,3038,5822,1789,4621,2,2645,4895,971,1755,678,957,3261,459,2531,28,7,1,1293,2126,2863,1,893,55,776,2,953,125,9,209,1,2947,1362,5,75,1490,1,36,1680,570,1791,576,1822,1297,1816,1,2039,503,1339,354,532,2,124,609,3143,808,12,447,6,99,109,4320,247,2,954,301,1307,265,663,4293,1375,1738,518,126,1582,894,27,149,667,197,42,1494,7,1,1122,393,2063,88,629,1,1321,1627,476,444,996,210,316,594,1287,455,717,2,629,819,2,328,1414,3,234,2,5,110,2,1,2,2,2,3,71,77,572,17

In [16]:
r = requests.get("https://google.com/search?q=data+science")

In [17]:
r.status_code

200

In [18]:
soup = bs4.BeautifulSoup(r.text, "html5lib")

In [19]:
print(soup.get_text())

Google Searchbody{background-color:#fff}table,div,span,p{display:none}Please click here if you are not redirected within a few seconds.//# sourceMappingURL=data:application/json;charset=utf-8;base64,eyJ2ZXJzaW9uIjogMywic291cmNlcyI6WyIiXSwic291cmNlc0NvbnRlbnQiOlsiICJdLCJuYW1lcyI6WyJjbG9zdXJlRHluYW1pY0J1dHRvbiJdLCJtYXBwaW5ncyI6IkFBQUE7QUFBQTtBQUFBO0FBQUE7QUFBQTtBQUFBO0FBQUEifQ==
(function(){function K(e){return e}var N=this||self,t=function(e){return K.call(this,e)},x=function(e,m,p,q,b,w,g,F,h,c,O,I){for(c=(I=14,61);;)try{if(I==86)break;else if(I==q)c=53,F=h.createPolicy(w,{createHTML:t,createScript:t,createScriptURL:t}),I=29;else if(I==55)I=N.console?46:29;else if(I==46)N.console[b](O.message),I=29;else if(I==p)c=61,I=55;else{if(I==m)return F;if(I==14)h=N.trustedTypes,F=g,I=e;else if(I==e)I=h&&h.createPolicy?q:m;else if(I==29)return c=61,F}}catch(u){if(c==61)throw u;c==53&&(O=u,I=p)}};(0,eval)(function(e,m){return(m=x(43,33,66,28,"error","ks",null))&&e.eval(m.createScript("1"))===1?fun

In [20]:
for link in soup.find_all('a'):
    print(link.get('href'))

/httpservice/retry/enablejs?sei=9DTjZ77eKvOOxc8PqL79mA0
/search?q=data+science&sca_esv=dae00c36fa947bf4&ie=UTF-8&emsg=SG_REL&sei=9DTjZ77eKvOOxc8PqL79mA0
https://support.google.com/websearch
