# Web Scraping

Web scraping is used to extract data from publicly available websites in an automated fashion. The method is useful when the public website you want to get data from does not have an API, or has one that provides only limited access to the data.

In [2]:
from urllib.request import urlopen

# Fetch the page and print its raw body (a bytes object).
# The response is used as a context manager so the underlying connection
# is closed as soon as the body has been read.
with urlopen('https://www.ncses.nsf.gov/about') as html:
    print(html.read())

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<title>About | NSF - National Science Foundation</title>\n\t<meta http-equiv="X-UA-Compatible" content="IE=11" />\n\t<meta name="viewport" content="width=device-width, initial-scale=1" />\n\t<meta charset="UTF-8" />\n\t<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />\n\t<link rel="shortcut icon" href="/resources/assets/images/statistics/favicon.ico" type="image/x-icon" />\n    <link rel="preload" as="font" type="font/woff2" href="/resources/assets/fonts/fontawesome-webfont.woff2?v=4.7.0" crossorigin="true"/>\n    \n    <link rel="stylesheet" href="/resources/assets/css/pages/statistics/bootstrap.min.css" />\n\t<script src="/resources/assets/js/pages/statistics/statistics.concat.js"></script>\n\n\n    <link rel="preload" as="font" type="font/woff2" href="/resources/assets/fonts/fontawesome-webfont.woff2?v=4.7.0" crossorigin="true"/>\n    <link rel="stylesheet" href="/resources/assets/css/pages/statistics/default-without-

In [9]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the page and parse it; the response is closed once its body
# has been handed to BeautifulSoup.
with urlopen('https://www.ncses.nsf.gov/about') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Attribute access (bs.h1, bs.h2, ...) returns only the FIRST matching tag.
print(bs.h1)
print(bs.h2)
print(bs.h3)

# find_all with a list of tag names returns every tag matching any of them.
print(bs.find_all(["h1", "h2"]))
# To list every heading level instead, use:
# print(bs.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]))

<h1>National Center for Science and Engineering Statistics</h1>
<h2 class="blue">Our Mission</h2>
<h3 class="card-title" data-property="title"><p style="font-style:italic">
Principles and Practices for a Federal Statistical Agency</p></h3>
[<h1>National Center for Science and Engineering Statistics</h1>, <h1>About NCSES</h1>, <h2 class="blue">Our Mission</h2>, <h2 class="blue">Our Core Activities</h2>, <h2 class="blue">Our Products</h2>, <h2 class="blue">How We Support Research</h2>]


In [4]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

# Fetch the page, reporting request failures instead of raising.
# Only the urlopen call sits inside the try body so that errors from the
# later read/print are not mistaken for request failures.
try:
    html = urlopen("https://www.ncses.nsf.gov/about")
except HTTPError:
    # HTTPError is a subclass of URLError, so it must be handled first.
    print("The server returned an HTTP error")
except URLError:
    print("The server could not be found!")
else:
    # Success path: read the body and close the response afterwards.
    with html:
        print(html.read())

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<title>About | NSF - National Science Foundation</title>\n\t<meta http-equiv="X-UA-Compatible" content="IE=11" />\n\t<meta name="viewport" content="width=device-width, initial-scale=1" />\n\t<meta charset="UTF-8" />\n\t<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />\n\t<link rel="shortcut icon" href="/resources/assets/images/statistics/favicon.ico" type="image/x-icon" />\n    <link rel="preload" as="font" type="font/woff2" href="/resources/assets/fonts/fontawesome-webfont.woff2?v=4.7.0" crossorigin="true"/>\n    \n    <link rel="stylesheet" href="/resources/assets/css/pages/statistics/bootstrap.min.css" />\n\t<script src="/resources/assets/js/pages/statistics/statistics.concat.js"></script>\n\n\n    <link rel="preload" as="font" type="font/woff2" href="/resources/assets/fonts/fontawesome-webfont.woff2?v=4.7.0" crossorigin="true"/>\n    <link rel="stylesheet" href="/resources/assets/css/pages/statistics/default-without-