# retrieving data from the internet

In [1]:
import urllib.request

# Fetch the Google home page and show the HTTP status plus the raw body.
# The response object is used as a context manager so the underlying
# connection is closed as soon as the body has been read (the original
# left it open until garbage collection).
with urllib.request.urlopen("http://www.google.com") as webUrl:
    print ("result code: " + str(webUrl.getcode()))
    # read() returns the body as bytes; it is printed undecoded below.
    data = webUrl.read()
print (data)

result code: 200
b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="GfenfFX9kDugu9ocvXDTDA==">(function(){window.google={kEI:\'ZwQvX6C2L4SYkwWbpYPQBA\',kEXPI:\'0,202123,3,4,32,1151585,5662,730,224,5104,207,3204,10,1226,364,926,573,611,206,383,246,5,1128,226,648,2314,668,469,315,3,369,150,140,92,299,90,181,2,11,323,657,104,343,519,1120069,1197793,405,329069,13677,4855,32692,8162,7085,861,17450,11240,9188,8384,4858,1362,283,9008,3022,4745,11033,1808,4020,978,7931,5297,2054,920,873,1217,2975,2784,3646,1142,13386,4516,2778,919,227

# parsing and processing JSON

In [2]:
import urllib.request 
import json

In [4]:
# USGS GeoJSON summary feed; the title printed below comes from the feed's
# own metadata.
urlData = "http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_day.geojson"

# Read the body only on HTTP 200, and close the response when done.
# `data` is explicitly set to None on failure so that a stale `data` left
# over from an earlier cell can never be parsed by mistake.
with urllib.request.urlopen(urlData) as webUrl:
    statusCode = webUrl.getcode()
    data = webUrl.read() if statusCode == 200 else None

if data is None:
    # Previously execution fell through to json.loads() after this message.
    print ("Received an error from server, cannot retrieve results " + str(statusCode))
else:
    theJSON = json.loads(data)
    features = theJSON["features"]

    # Feed title, when the metadata provides one.
    if "title" in theJSON["metadata"]:
        print (theJSON["metadata"]["title"])

    # Number of events reported by the feed metadata.
    count = theJSON["metadata"]["count"]
    print (str(count) + " events recorded")

    # Place description of every event.
    for quake in features:
        print (quake["properties"]["place"])
    print ("--------------\n")

    # Only events of magnitude 4.0 or greater.
    # NOTE(review): the None guard assumes "mag" can be null in the feed, in
    # which case the comparison would raise TypeError -- confirm against the
    # USGS field documentation.
    for quake in features:
        mag = quake["properties"]["mag"]
        if mag is not None and mag >= 4.0:
            print ("%2.1f" % mag, quake["properties"]["place"])
    print ("--------------\n")

    # Only events with at least one "felt" report; "felt" is None when no
    # reports were submitted, so it is checked before the numeric comparison.
    print ("\n\nEvents that were felt:")
    for quake in features:
        feltReports = quake["properties"]["felt"]
        if feltReports is not None and feltReports > 0:
            mag = quake["properties"]["mag"]
            # Same defensive null-magnitude handling as in the loop above.
            magText = "%2.1f" % mag if mag is not None else "n/a"
            print (magText, quake["properties"]["place"],
                   " reported " + str(feltReports) + " times")

USGS Magnitude 2.5+ Earthquakes, Past Day
47 events recorded
32 km SE of Mina, Nevada
77 km SSW of Sand Point, Alaska
Macquarie Island region
12 km NNE of Ferry, Alaska
28 km SSW of Tomohon, Indonesia
142 km SSW of Komodo, Indonesia
81 km SSW of Sand Point, Alaska
36 km NW of Stanley, Idaho
37 km NW of Stanley, Idaho
67 km ESE of King Cove, Alaska
central Mid-Atlantic Ridge
29km SE of Bodie, CA
19km ESE of Little Lake, CA
3 km SE of La Parguera, Puerto Rico
south of Alaska
253 km ESE of Attu Station, Alaska
140 km SSW of Komodo, Indonesia
143 km W of Waingapu, Indonesia
6 km WSW of Guánica, Puerto Rico
118 km SSW of Komodo, Indonesia
146 km SSE of Dompu, Indonesia
123 km SSW of Komodo, Indonesia
42 km WNW of San Antonio de los Cobres, Argentina
46 km S of Tobelo, Indonesia
3 km SSE of La Parguera, Puerto Rico
off the coast of Oregon
243 km WSW of Qamdo, China
42 km ESE of Takahagi, Japan
12 km SW of Stanley, Idaho
150 km SSW of Singkil, Indonesia
63 km SSW of Shungnak, Alaska
11 km NW 

# parsing and processing XML

In [10]:
import xml.dom.minidom

# Parse samplexml.xml (resolved relative to the working directory) into an
# in-memory DOM Document. The following cell reads from and appends to `doc`.
doc = xml.dom.minidom.parse("samplexml.xml")

In [11]:
def print_skills(document):
    """Print how many <skill> elements `document` contains, then each name.

    Args:
        document: a DOM node supporting getElementsByTagName (here the
            parsed minidom Document).

    Returns:
        The NodeList of matching <skill> elements, so callers can reuse it.
    """
    found = document.getElementsByTagName("skill")
    print ("%d skills:" % found.length)
    for skill in found:
        print (skill.getAttribute("name"))
    return found


print (doc.nodeName)

# documentElement is always the root element. firstChild is not guaranteed to
# be: a leading comment, doctype or processing instruction would come first
# and has no tagName.
root = doc.documentElement
print (root.tagName)

# Skills present in the file as loaded.
skills = print_skills(doc)

# Add a new <skill name="jQuery"/> under the root element.
# NOTE: this mutates `doc` in place, so re-running this cell without
# re-parsing the file appends another jQuery entry each time.
newSkill = doc.createElement("skill")
newSkill.setAttribute("name", "jQuery")
root.appendChild(newSkill)

# Skills after the insertion.
skills = print_skills(doc)

#document
person
4 skills:
JavaScript
Python
C#
HTML
5 skills:
JavaScript
Python
C#
HTML
jQuery


# parsing and processing HTML

In [16]:
from html.parser import HTMLParser
import urllib.request


# Number of <meta> start tags seen by MyHTMLParser. It is reset every time
# this cell runs so re-executing the cell does not accumulate earlier counts.
metacount = 0


class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that reports every parse event it receives.

    Start tags, end tags, non-whitespace text and comments are each printed
    together with the parser position at which they were found. Every "meta"
    start tag also increments the module-level `metacount`.
    """

    def _print_position(self):
        """Print the current parser position as reported by getpos()."""
        pos = self.getpos()  # tuple: (line number, offset)
        print ("\tAt line: ", pos[0], " position ", pos[1])

    def handle_starttag(self, tag, attrs):
        """Handle an opening tag; `attrs` is a list of (name, value) pairs."""
        global metacount
        if tag == "meta":
            metacount += 1

        print ("Encountered a start tag:", tag)
        self._print_position()

        if attrs:
            print ("\tAttributes:")
            for name, value in attrs:
                print ("\t", name, "=", value)

    def handle_endtag(self, tag):
        """Handle a closing tag."""
        print ("Encountered an end tag:", tag)
        self._print_position()

    def handle_data(self, data):
        """Handle character data between tags, skipping whitespace-only runs."""
        if data.isspace():
            return
        print ("Encountered some text data:", data)
        self._print_position()

    def handle_comment(self, data):
        """Handle the contents of an HTML comment."""
        print ("Encountered comment:", data)
        self._print_position()


# The parser has to exist before the file is fed to it. The original cell
# called parser.feed() first (NameError on a fresh kernel) and then created
# a plain HTMLParser, whose handlers do nothing.
parser = MyHTMLParser()

# `with` closes the file even if reading fails.
# NOTE(review): utf-8 is assumed from the file's own <meta charset> shown in
# the recorded output -- confirm if the sample file changes.
with open("samplehtml.html", encoding="utf-8") as f:
    contents = f.read()  # read the entire file

parser.feed(contents)

print ("%d meta tags encountered" % metacount)

Encountered a start tag: html
	At line:  2  position  0
	Attributes:
	 lang = en
Encountered a start tag: head
	At line:  3  position  2
Encountered a start tag: meta
	At line:  4  position  4
	Attributes:
	 charset = utf-8
Encountered an end tag: meta
	At line:  4  position  4
Encountered a start tag: title
	At line:  5  position  4
Encountered some text data: Sample HTML Document
	At line:  5  position  11
Encountered an end tag: title
	At line:  5  position  31
Encountered a start tag: meta
	At line:  6  position  4
	Attributes:
	 name = description
	 content = This is a sample HTML file
Encountered an end tag: meta
	At line:  6  position  4
Encountered a start tag: meta
	At line:  7  position  4
	Attributes:
	 name = author
	 content = Administrator
Encountered an end tag: meta
	At line:  7  position  4
Encountered a start tag: meta
	At line:  8  position  4
	Attributes:
	 name = viewport
	 content = width=device-width; initial-scale=1.0
Encountered an end tag: meta
	At line:  8  p