# XML Data

## Check out XML & Python documentation

https://www.w3.org/TR/xml/#sec-origin-goals

## Parsing XML Data - couple different ways you can do it. By strings in multiple files or by files.
## xml.etree.ElementTree as ET
#### Element tree module reads the entire XML file in memory. Element objects allow us to iterate over the root elements to find their children and the tags associated with it.

https://www.w3schools.com/xml/xpath_syntax.asp

In [1]:
## example code to explain parsing XML files
## number of attributes for elements - tag & title are examples used

import xml.etree.ElementTree as ET
import pprint

tree = ET.parse('exampleResearchArticle.xml')
root = tree.getroot()

# like specifying a path for a file, you can specify a xml element
# to look for
title = root.find('./fm/bibl/title')
title_text = ""
for p in title:
    title_text += p.text
print ("\nTitle:\n", title_text)

print("\nAuthor email address:")
for a in root.findall('./fm/bibl/aug/au'):
    email = a.find('email')
    if email is not None:
        print email.text
        
# .text grabs the value in the xml <tag>

# to store in a dictionary, you have to file through the root
# and store the values by .find & .text

# EXAMPLE!

# def get_root(fname):
#     tree = ET.parse(fname)
#     return tree.getroot()


# def get_authors(root):
#     authors = []
#     for author in root.findall('./fm/bibl/aug/au'):
#         data = {
#                 "fnm": None,
#                 "snm": None,
#                 "email": None
#         }

#         data['fnm'] = author.find("fnm").text
#         data['snm'] = author.find("snm").text
#         data['email'] = author.find("email").text
        
#         authors.append(data)

#     return authors


SyntaxError: Missing parentheses in call to 'print' (<ipython-input-1-6b1b0d20f8eb>, line 14)

# Intro to Screen Scraping

In [None]:
# inspecting the elements of a web page, you can see some of the data in the xml file

# building list of values, making the http request to to download data, and then parsing the
# data last is best practice for web scraping & parsing the data

In [2]:
form bs4 import BeautifulSoup

def options(soup, id):
    options_values = []
    # the carrier_list finds the 'CarrierList' element 
    # from the 'soup' variable called in main() 
    carrier_list = soup.find(id=id)
    # find_all then makes a list of all 'option' descendents in the 'CarrierList'
    # tag found from the Beautiful Soup html doc
    for option in carrier_list.find_all('option'):
        option_values.append(option['value'])
    return option_values

def print_list(label, codes):
    print "\n%s:" % label
    for c in codes:
        print (c)

def main():
    # file not available in computer
    # BeautifulSoup passes the top level element for the html document
    soup = BeautifulSoup(open("virgin_and_logan_airport.html"))
    
    #using options function, all CarrierList variables are grabbed in a list
    codes = options(soup, 'CarrierList')
    print_list("Carriers", codes)
    
    codes = options(soup, 'AirportList')
    print_list("Airports", codes)


SyntaxError: invalid syntax (<ipython-input-2-9471ab61e30a>, line 1)

### Anytime you are doing a scraping task, you have to understand how a site expects requests. The first step is figuring out what url to access and what http method to use.  The latter is found in the "method" variable and the former is found by searching in the code. Sometimes it is obvious, sometimes it isn't.

### We need to look how we are going to prpogrammatically grab the data. The best way to do this is by looking at the browser and find out the requests it makes. By looking at the 'Network' tab, we can see a running tally of requests made. 'Post' can be found when this request is made.

### Clicking on the post request, we can find what data is submitted when looking at the 'Form Data' section. Thinking that we would only need the carrier and airport fields, we find out that there are other sections needing to be submitted outside of carrier and airport.

## Quiz Below to grab all values from other tags - ex. 'EVENTVALIDATION'

In [None]:
from bs4 import BeautifulSoup
import requests
import json

html_page = "page_source.html"


def extract_data(page):
    data = {"eventvalidation": "",
            "viewstate": ""}
    with open(page, "r") as html:
        soup = BeautifulSoup(html, 'lxml')
        
        ev = soup.find(id='__EVENTVALIDATION')
        data['eventvalidation'] = ev["value"]
        
        vs = soup.find(id='__VIEWSTATE')
        data['viewstate'] = vs["value"]
        
        pass

    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]

    r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
                    data={'AirportList': "BOS",
                          'CarrierList': "VX",
                          'Submit': 'Submit',
                          "__EVENTTARGET": "",
                          "__EVENTARGUMENT": "",
                          "__EVENTVALIDATION": eventvalidation,
                          "__VIEWSTATE": viewstate
                    })

    return r.text


def test():
    data = extract_data(html_page)
    assert data["eventvalidation"] != ""
    assert data["eventvalidation"].startswith("/wEWjAkCoIj1ng0")
    assert data["viewstate"].startswith("/wEPDwUKLTI")

    
test()

In [None]:
## populating eventvalidation and viewstate with list of all values for the tags in xml

import requests
from bs4 import BeautifulSoup

s = requests.Session()

r = s.get("http://www.transtats.bts.gov/Data_Elements.apx?Data=2")
soup = BeautifulSoup(r.text)
viewstate_element = soup.find(id="__VIEWSTATE")
viewstate = viewstate_element["value"]
eventvalidation_element = soup.find(id="__EVENTVALIDATION")
eventvalidation = eventvalidationi_element["value"]

r = s.post("http://www.transtats.bts.gov//Data_Elements.aspx?Data=2",
                 data = {'AirportList' : 'BOS',
                        'CarrierList' : "VX",
                        'Submit' : "Submit",
                        '__EVENTTARGET' : "",
                        '__EVENTARGUMENT' : "",
                        '__EVENTVALIDATION' : eventvalidation,
                        '__VIEWSTATE' : viewstate})

f = open("virgin_and_logan_airport.html", "w")
f.write(r.text)

## Best Practices for Scraping

### 1) Look at how a browser makes request
### 2) Emulate in code
### 3) If stuff blows up, look at your http traffic
### 4) Return to 1 until it works

## Quiz 1

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Your task in this exercise is to modify 'extract_carrier()` to get a list of
all airlines. Exclude all of the combination values like "All U.S. Carriers"
from the data that you return. You should return a list of codes for the
carriers.

All your changes should be in the 'extract_carrier()' function. The
'options.html' file in the tab above is a stripped down version of what is
actually on the website, but should provide an example of what you should get
from the full file.

Please note that the function 'make_request()' is provided for your reference
only. You will not be able to to actually use it from within the Udacity web UI.
"""

from bs4 import BeautifulSoup
html_page = "options.html"


def extract_carriers(page):
    data = []

    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        carrier_list = soup.find(id="CarrierList")
        for carriers in carrier_list.find_all('option'):
            if "All" not in carriers['value']:
                data.append(carriers['value'])
                print data
        
        # target = soup.find(id='__EVENTTARGET')
        # data['eventtarget'] = target['value']
        
        # argument = soup.find(id='__EVENTARGUMENT')
        # data['eventargument'] = argument['value']
        
    return data


def make_request(data):
    eventvalidation = data["eventvalidation"]
    viewstate = data["viewstate"]
    airport = data["airport"]
    carrier = data["carrier"]

    r = s.post("https://www.transtats.bts.gov/Data_Elements.aspx?Data=2",
               data = (("__EVENTTARGET", ""),
                       ("__EVENTARGUMENT", ""),
                       ("__VIEWSTATE", viewstate),
                       ("__VIEWSTATEGENERATOR",viewstategenerator),
                       ("__EVENTVALIDATION", eventvalidation),
                       ("CarrierList", carrier),
                       ("AirportList", airport),
                       ("Submit", "Submit")))

    return r.text


def test():
    data = extract_carriers(html_page)
    assert len(data) == 16
    assert "FL" in data
    assert "NK" in data

if __name__ == "__main__":
    test()

## Quiz 2

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Complete the 'extract_airports()' function so that it returns a list of airport
codes, excluding any combinations like "All".

Refer to the 'options.html' file in the tab above for a stripped down version
of what is actually on the website. The test() assertions are based on the
given file.
"""

from bs4 import BeautifulSoup
html_page = "options.html"


def extract_airports(page):
    data = []
    with open(page, "r") as html:
        # do something here to find the necessary values
        soup = BeautifulSoup(html, "lxml")
        airports = soup.find(id='AirportList')
        for airport in airports.find_all('option'):
            if "All" not in airport['value']:
                data.append(airport['value'])
        for c in data:
            print c
    return data


def test():
    data = extract_airports(html_page)
    assert len(data) == 15
    assert "ATL" in data
    assert "ABR" in data

if __name__ == "__main__":
    test()

## Quiz 3 - this is impossible

In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Let's assume that you combined the code from the previous 2 exercises with code
from the lesson on how to build requests, and downloaded all the data locally.
The files are in a directory "data", named after the carrier and airport:
"{}-{}.html".format(carrier, airport), for example "FL-ATL.html".

The table with flight info has a table class="dataTDRight". Your task is to
use 'process_file()' to extract the flight data from that table as a list of
dictionaries, each dictionary containing relevant data from the file and table
row. This is an example of the data structure you should return:

data = [{"courier": "FL",
         "airport": "ATL",
         "year": 2012,
         "month": 12,
         "flights": {"domestic": 100,
                     "international": 100}
        },
         {"courier": "..."}
]

Note - year, month, and the flight data should be integers.
You should skip the rows that contain the TOTAL data for a year.

There are couple of helper functions to deal with the data files.
Please do not change them for grading purposes.
All your changes should be in the 'process_file()' function.

The 'data/FL-ATL.html' file in the tab above is only a part of the full data,
covering data through 2003. The test() code will be run on the full table, but
the given file should provide an example of what you will get.
"""
from bs4 import BeautifulSoup
from zipfile import ZipFile
import os

datadir = "data"


def open_zip(datadir):
    with ZipFile('{0}.zip'.format(datadir), 'r') as myzip:
        myzip.extractall()


def process_all(datadir):
    files = os.listdir(datadir)
    return files


def process_file(f):
    """
    This function extracts data from the file given as the function argument in
    a list of dictionaries. This is example of the data structure you should
    return:

    data = [{"courier": "FL",
             "airport": "ATL",
             "year": 2012,
             "month": 12,
             "flights": {"domestic": 100,
                         "international": 100}
            },
            {"courier": "..."}
    ]


    Note - year, month, and the flight data should be integers.
    You should skip the rows that contain the TOTAL data for a year.
    """
    data = []
    info = {}
    info["courier"], info["airport"] = f[:6].split("-")
    # Note: create a new dictionary for each entry in the output data list.
    # If you use the info dictionary defined here each element in the list 
    # will be a reference to the same info dictionary.
    with open("{}/{}".format(datadir, f), "r") as html:
        soup = BeautifulSoup(html, "lxml")
        relevant_html = soup.find('table', class_='dataTDRight')
        all_trs = relevant_html.find_all("tr")
        for tr in all_trs:
            if 'style' in tr.attrs and tr['style'] == "color:White;background-color:#5D95C9;": 
                pass
            elif 'style' in tr.attrs and tr['style'] == "background-color:LightYellow;": 
                pass
            else: 
            
                row = []
                tr_info = {}
                tr_info["courier"] = info["courier"]
                tr_info["airport"] = info["airport"]
                for cell in tr.find_all("td"):
                    row.append(cell.text)
                tr_info["year"] = int(float(row[0]))
                tr_info["month"] = int(float(row[1]))
                tr_info["flights"] = {
                                        "domestic": int(float(row[2].replace(',',''))),
                                        "international": int(float(row[3].replace(',','')))}
                data.append(tr_info)
    return data


def test():
    print "Running a simple test..."
    open_zip(datadir)
    files = process_all(datadir)
    data = []
    # Test will loop over three data files.
    for f in files:
        data += process_file(f)
        
    assert len(data) == 399  # Total number of rows
    for entry in data[:3]:
        assert type(entry["year"]) == int
        assert type(entry["month"]) == int
        assert type(entry["flights"]["domestic"]) == int
        assert len(entry["airport"]) == 3
        assert len(entry["courier"]) == 2
    assert data[0]["courier"] == 'FL'
    assert data[0]["month"] == 10
    assert data[-1]["airport"] == "ATL"
    assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425}
    
    print "... success!"

if __name__ == "__main__":
    test()

SyntaxError: invalid syntax (<ipython-input-3-0e65b1aeeb81>, line 89)

## Quiz 4

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This and the following exercise are using US Patent database. The patent.data
file is a small excerpt of much larger datafiles that are available for
download from US Patent website. These files are pretty large ( >100 MB each).
The original file is ~600MB large, you might not be able to open it in a text
editor.

The data itself is in XML, however there is a problem with how it's formatted.
Please run this script and observe the error. Then find the line that is
causing the error. You can do that by just looking at the datafile in the web
UI, or programmatically. For quiz purposes it does not matter, but as an
exercise we suggest that you try to do it programmatically.

NOTE: You do not need to correct the error - for now, just find where the error
is occurring.
"""

import xml.etree.ElementTree as ET

PATENTS = 'patent.data'

def get_root(fname):

    tree = ET.parse(fname)
    return tree.getroot()


get_root(PATENTS)

## Quiz 6

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# So, the problem is that the gigantic file is actually not a valid XML, because
# it has several root elements, and XML declarations.
# It is, a matter of fact, a collection of a lot of concatenated XML documents.
# So, one solution would be to split the file into separate documents,
# so that you can process the resulting files as valid XML documents.

import xml.etree.ElementTree as ET
PATENTS = 'patent.data'

def get_root(fname):
    tree = ET.parse(fname)
    return tree.getroot()


def split_file(filename):
    """
    Split the input file into separate files, each containing a single patent.
    As a hint - each patent declaration starts with the same line that was
    causing the error found in the previous exercises.
    
    The new files should be saved with filename in the following format:
    "{}-{}".format(filename, n) where n is a counter, starting from 0.
    """
    with open(filename, 'r') as f_in:
        for line in f_in:
            if line.find('xml version="1.0"'):
                line.split()
    pass


def test():
    split_file(PATENTS)
    for n in range(4):
        try:
            fname = "{}-{}".format(PATENTS, n)
            f = open(fname, "r")
            if not f.readline().startswith("<?xml"):
                print "You have not split the file {} in the correct boundary!".format(fname)
            f.close()
        except:
            print "Could not find file {}. Check if the filename is correct!".format(fname)


test()