# Web Scraping

Web scraping is used to extract data from publicly available websites in an automated fashion. The method is useful when the public website you want to get data from does not have an API, or has one that provides only limited access to the data. `urllib` and `requests` are Python modules used to make the web requests in web scraping.

* `urllib` is Python's URL handling package. It is used to fetch URLs (Uniform Resource Locators).
* Beautiful Soup is a library that makes it easy to scrape information from web pages. 

In [1]:
from bs4 import BeautifulSoup # parses HTML documents so they can be searched (web scraping)
import requests  # third-party HTTP client used to download a web page

from urllib.request import urlopen  # opens a URL and returns a file-like response object
from urllib.error import HTTPError  # raised when the server answers with an HTTP error status
from urllib.error import URLError  # base class of HTTPError; raised when the request itself fails

In [2]:
# Fetch the NCSES "About" page and parse it with Python's built-in HTML parser.
# The ``with`` block closes the HTTP response as soon as the body has been read.
with urlopen('https://www.ncses.nsf.gov/about') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Attribute access (bs.h1, bs.h2, ...) returns only the first matching tag.
print(bs.h1)
print(bs.h2)
print(bs.h3)

# find_all() returns every tag whose name appears in the given list.
print(bs.find_all(["h1", "h2"]))
print(bs.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]))

<h1>National Center for Science and Engineering Statistics</h1>
<h2>Who We Are</h2>
<h3 class="card-title" data-property="title"><p style="font-style:italic">
Principles and Practices for a Federal Statistical Agency</p></h3>
[<h1>National Center for Science and Engineering Statistics</h1>, <h1>About NCSES</h1>, <h2>Who We Are</h2>, <h2 class="blue">Our Mission</h2>, <h2 class="blue">Our Core Activities</h2>, <h2 class="blue">Our Products</h2>, <h2 class="blue">How We Support Research</h2>]
[<h1>National Center for Science and Engineering Statistics</h1>, <h1>About NCSES</h1>, <h2>Who We Are</h2>, <h2 class="blue">Our Mission</h2>, <h3 class="card-title" data-property="title"><p style="font-style:italic">
Principles and Practices for a Federal Statistical Agency</p></h3>, <h2 class="blue">Our Core Activities</h2>, <h2 class="blue">Our Products</h2>, <h2 class="blue">How We Support Research</h2>, <h3 class="card-title" data-property="title">Research</h3>, <h3 class="card-title" data-pro

In [None]:
# Request the page and tell apart the two failure modes urlopen can signal.
# HTTPError is a subclass of URLError, so it has to be caught first.
try:
    html = urlopen("https://www.ncses.nsf.gov/about")
except HTTPError:
    # The server was reached but answered with an error status.
    print("The server returned an HTTP error")
except URLError:
    # The request failed before any HTTP response was received.
    print("The server could not be found!")
else:
    # Success: print the raw body, then close the response.
    with html:
        print(html.read())

In [None]:
# Show the parsed document as indented, human-readable HTML.
pretty_html = bs.prettify()
print(pretty_html)

In [None]:
# Attribute-style navigation: ``bs.title`` yields the document's <title> tag.
tag_object = bs.title
print("tag object:", tag_object)

In [None]:
# Inspect the class of the object that attribute navigation returned.
tag_type = type(tag_object)
print("tag object type:", tag_type)

In [None]:
# Download the same page with requests. A timeout is passed explicitly because
# requests applies none by default and would wait indefinitely on a stalled server.
resp = requests.get('https://www.ncses.nsf.gov/about', timeout=10)
# .content is the undecoded response body (bytes).
print(resp.content)

In [None]:
# Fetch the sample page and parse it. BeautifulSoup accepts the open
# file-like response directly and reads it while constructing the tree, so
# the parsing happens inside the ``with`` block that closes the response.
with urlopen('http://www.pythonscraping.com/pages/warandpeace.html') as html:
    bs2 = BeautifulSoup(html, "html.parser")

In [None]:
# Collect every <span class="green"> element in the page.
# find_all() is the current Beautiful Soup 4 spelling; findAll() is a
# deprecated alias kept only for backwards compatibility.
# ``class_`` is used because ``class`` is a reserved word in Python.
nameList = bs2.find_all('span', class_='green')
for name in nameList:
    # get_text() returns only the text inside the tag, without markup.
    print(name.get_text())

In [None]:
import socket

HOST = 'www.google.com'  # Server hostname or IP address
PORT = 80                # The standard port for HTTP is 80, for HTTPS it is 443

server_address = (HOST, PORT)

# Hand-written HTTP/1.0 request. The Host header is built from HOST so the
# address we connect to and the header we send cannot drift apart.
request_header = b'GET / HTTP/1.0\r\nHost: ' + HOST.encode('ascii') + b'\r\n\r\n'

chunks = []
# The ``with`` block closes the socket even if connect/sendall/recv raises.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as client_socket:
    client_socket.connect(server_address)
    client_socket.sendall(request_header)

    # Read until recv() returns b'', which signals that the peer has closed
    # the connection and no more data will arrive.
    while True:
        recv = client_socket.recv(1024)
        if not recv:
            break
        chunks.append(recv)

# Join the raw bytes once and decode them. Calling str() on each chunk would
# append its repr ("b'...'") rather than the actual response text.
response = b''.join(chunks).decode('utf-8', errors='replace')

print(response)


In [None]:
import re

html_content = '<p>Price : 19.99$</p>'

# Capture everything between the opening and closing <p> tags.
# The pattern is a raw string so backslashes reach the regex engine untouched,
# and "/" is written unescaped: it has no special meaning in a Python regex,
# and "\/" in a normal string literal is an invalid escape sequence.
m = re.match(r'<p>(.+)</p>', html_content)
if m:
    # group(1) is the text matched by the first (only) capture group.
    print(m.group(1))
In [None]:
import urllib3

# A PoolManager creates and reuses connections for the requests made through it.
http = urllib3.PoolManager()
# Explicit timeout (in seconds) so a stalled server cannot block the cell forever.
r = http.request('GET', 'http://www.google.com', timeout=10.0)
# r.data holds the response body as bytes.
print(r.data)


In [None]:
from lxml import html

# Decode the bytes of the earlier urllib3 response (``r``), dropping any
# byte sequences that are not valid UTF-8.
data_string = r.data.decode(encoding='utf-8', errors='ignore')

# Build an lxml element tree from the HTML text.
tree = html.fromstring(data_string)

# Select every <a> element in the document with an XPath query.
links = tree.xpath('//a')

# Print each anchor's href attribute.
for anchor in links:
    print(anchor.get('href'))