# Using python to access web data

- Serialization 
- XML (eXtensible Markup Language)
- JSON (Javascript object notation)

In [1]:
import xml.etree.ElementTree as ET

data = '''
<person>
  <name>Chuck</name>
  <phone type="intl">
    +1 734 303 4456
  </phone>
  <email hide="yes" />
</person>'''

# call of the method fromstring to parse the xml data to an object
tree = ET.fromstring(data)
# extract the name from the xml element
name = tree.find('name').text
email = tree.find('email').get('hide')
print('Name:', name)
print('Attr:', email)

Name: Chuck
Attr: yes


# Worked example: Xml 
## Example 2: cas d'un fichier XML, findall method of ElemeentTree

In [2]:
import xml.etree.ElementTree as ET

input = '''
<stuff>
  <users>
    <user x="2">
      <id>001</id>
      <name>Chuck</name>
    </user>
    <user x="7">
      <id>009</id>
      <name>Brent</name>
    </user>
  </users>
</stuff>'''

stuff = ET.fromstring(input)
lst = stuff.findall('users/user')
print('User count:', len(lst))

for item in lst:
    print('Name', item.find('name').text)
    print('Id', item.find('id').text)
    print('Attribute', item.get('x'))


User count: 2
Name Chuck
Id 001
Attribute 2
Name Brent
Id 009
Attribute 7


### web scraping 
- utiliser urllib pour extraire des données sur le web
- BeautifulSoup pour parser du html


In [3]:
import urllib.request, urllib.error, urllib.response
import xml.etree.ElementTree as ET
import ssl

# creation d'une chaine de caractère 
def convertXMLToString(fhand):
    message = ""
    for line in fhand:
        word = line.decode() 
        message += word
    return message

dataString = convertXMLToString(fhand)
print(dataString)

fhand = urllib.request.urlopen("http://py4e-data.dr-chuck.net/comments_193217.xml")
dataString = convertXMLToString(fhand)
data = ET.fromstring(dataString)
comments = data.findall('comments/comment')
comment_dict = {}
for comment in comments:
    name = comment.find('name').text
    count = comment.find('count').text
    comment_dict[name] = comment_dict.get(name, 0) + int(count)
comment_dict
sum(comment_dict.values())
    


NameError: name 'fhand' is not defined

## JSON data format

In [None]:
import json
data = '''
[
  { "id" : "001",
    "x" : "2",
    "name" : "Chuck"
  } ,
  { "id" : "009",
    "x" : "7",
    "name" : "Brent"
  }
]'''

info = json.loads(data)
print(info)
for item in info:
    id = item.get('id')
    x = item.get('x')
    name = item.get('name')
    print(" Id: " +id, " Attribute: " + x, " Name:" + name)

In [None]:
import json

dataString = '''
{
  "name" : "Chuck",
  "phone" : {
    "type" : "intl",
    "number" : "+1 734 303 4456"
   },
   "email" : {
     "hide" : "yes"
   }
}'''

# Parsing data into JSON
dataJSON = json.loads(dataString)

# looping through dataJSON
for item in dataJSON:
    data = dataJSON.get(item)
    if type(data) is dict:
        for key in data:
            print(key, data[key])

### Google geocode

In [None]:
import urllib.request, urllib.parse, urllib.error
import json
import ssl

api_key = False
# If you have a Google Places API key, enter it here
# api_key = 'AIzaSy___IDByT70'
# https://developers.google.com/maps/documentation/geocoding/intro

if api_key is False:
    api_key = 42
    serviceurl = 'http://py4e-data.dr-chuck.net/json?'
else :
    serviceurl = 'https://maps.googleapis.com/maps/api/geocode/json?'

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

while True:
    address = input('Enter location: ')
    #address = "Ann Arbor, MI"
    if len(address) < 1: break

    parms = dict()
    parms['address'] = address
    if api_key is not False: parms['key'] = api_key
    url = serviceurl + urllib.parse.urlencode(parms)

    print('Retrieving', url)
    uh = urllib.request.urlopen(url, context=ctx)
    data = uh.read().decode()
    print('Retrieved', len(data), 'characters')

    try:
        js = json.loads(data)
    except:
        js = None

    if not js or 'status' not in js or js['status'] != 'OK':
        print('==== Failure To Retrieve ====')
        print(data)
        continue

    print(json.dumps(js, indent=4))

    lat = js['results'][0]['geometry']['location']['lat']
    lng = js['results'][0]['geometry']['location']['lng']
    print('lat', lat, 'lng', lng)
    location = js['results'][0]['formatted_address']
    print(location)
    break

### Assignment Extracting Data from JSON

In [None]:
import urllib.request, urllib.parse, urllib.error
import json


while True:
    url = input('Enter url:')
    if len(url) < 1: break
    print('Retrieving :', url)
    
    fh = urllib.request.urlopen(url)
    print("Use of a file handle: ", fh)
    
    data = fh.read().decode()
    dataJSON = json.loads(data)
    print("Retrieved ", len(data), 'characters')

    total = 0
    for comment in dataJSON['comments']:
        name = comment.get('name')
        count = comment.get('count')
        icount = int(count)
        count_list.append(icount)
        total += icount
    print(total)
    

# GeoJSON API
In this assignment, we will prompt for a location, call en webservice endpoint and retrieve data in json
 and parse that data and retrieve the place_id

In [None]:
# import of the modules urllib
import urllib.request, urllib.parse, urllib.error
# import json module
import json
import ssl

api_key = False
# If you have a Google Places API key, enter it here
# api_key = 'AIzaSy___IDByT70'
# https://developers.google.com/maps/documentation/geocoding/intro

if api_key is False:
    api_key = 42
    serviceurl = 'http://py4e-data.dr-chuck.net/json?'
else :
    serviceurl = 'https://maps.googleapis.com/maps/api/geocode/json?'

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

while True:
    address = input('Enter location: ')
    if len(address) < 1: break
        
    parms = dict()
    parms['address'] = address
    if api_key is not False: parms['key'] = api_key
    url = serviceurl + urllib.parse.urlencode(parms)

    print('Retrieving', url)
    uh = urllib.request.urlopen(url, context=ctx)
    data = uh.read().decode()
    print('Retrieved', len(data), 'characters')

    try:
        js = json.loads(data)
    except:
        js = None

    if not js or 'status' not in js or js['status'] != 'OK':
        print('==== Failure To Retrieve ====')
        print(data)
        continue

    print(json.dumps(js, indent=4))
    place_id = js["results"][0]["place_id"]
    print(place_id)

Enter location: South Federal University
Retrieving http://py4e-data.dr-chuck.net/json?address=South+Federal+University&key=42
Retrieved 2015 characters
{
    "results": [
        {
            "address_components": [
                {
                    "long_name": "105",
                    "short_name": "105",
                    "types": [
                        "street_number"
                    ]
                },
                {
                    "long_name": "Bol'shaya Sadovaya Ulitsa",
                    "short_name": "Bol'shaya Sadovaya Ulitsa",
                    "types": [
                        "route"
                    ]
                },
                {
                    "long_name": "Rostov-on-Don",
                    "short_name": "Rostov-on-Don",
                    "types": [
                        "locality",
                        "political"
                    ]
                },
                {
                    "long_name": "Rostov Ob