In [1]:
import copy
import json

import pandas as pd
from lxml import etree

# XML
- Extensible Markup Language (`XML`) is a widely used **markup language** that defines rules for encoding documents or data structures.
- `XML` is often used in contexts where *data* needs to be generated and displayed.
- `lxml` is a third-party library that is more powerful and feature-rich than Python's built-in `xml` package.
- A notable module from `lxml` is `etree`, which allows parsing of XML data into a tree-like structure.

In [2]:
# Show the raw text of the file as-is (nothing has been parsed into a Python object yet)
with open("sample_xml.xml", mode="r") as xml_file:
    print(xml_file.read())

<?xml version="1.0"?>
<cinema cinema_id="8941" cinema_name="Vue Cinemas - Reading">
  <showings film_id="227902" film_name="The Predator">
    <standard>
      <start_time>14:30</start_time>
      <date>2018-09-14</date>
    </standard>
    <VMAX>
      <start_time>15:45</start_time>
      <date>2018-09-15</date>
    </VMAX>
  </showings>
  <showings film_id="123456" film_name="Avengers">
    <standard>
      <start_time>17:05</start_time>
      <date>2018-09-15</date>
    </standard>
    <VMAX>
      <start_time>19:25</start_time>
      <date>2018-09-16</date>
    </VMAX>
    <VMAX>
      <start_time>18:00</start_time>
      <date>2018-09-17</date>
    </VMAX>
    <standard>
      <start_time>21:05</start_time>
      <date>2018-09-17</date>
    </standard>
    <VMAX>
      <start_time>10:05</start_time>
      <date>2018-09-18</date>
    </VMAX>
  </showings>
</cinema>


In [3]:
# Parse the XML file into an lxml ElementTree object
xmltree = etree.parse('sample_xml.xml')

# Get the root node (the outermost element of the document) to navigate from
root = xmltree.getroot()

In [4]:
# Tag name of the element
print(f"Tag: {root.tag}")

# Attributes of the element, exposed in a dictionary-like format
print(f"Attributes: {root.attrib}")

# Text that follows the opening tag - for <cinema> this is only the whitespace
# before its first child, as there is no real text content
print(f"Content: {root.text}")

# len() counts the direct sub-elements/children below cinema;
# looking at the example above, we should see 2 "showings"
print(f"How many sub-elements/children: {len(root)}")

Tag: cinema
Attributes: {'cinema_id': '8941', 'cinema_name': 'Vue Cinemas - Reading'}
Content: 
  
How many sub-elements/children: 2


In [5]:
# Look up a single attribute by name. Element.get() works like dict.get()
root.get('cinema_id')

'8941'

In [6]:
# Note that .get() returns None by default when the requested attribute does not exist (it does not raise).
# Keep this behaviour in mind when you're writing a loop that accesses attributes.
# As with dict.get(), a second argument supplies a different fallback value (e.g. False).
root.get('some_attribute_that_is_not_there', False)

False

In [7]:
# Elements can be indexed like lists: position 0 is the first child
first_showings = root[0]
print(first_showings)
print(f"Tag: {first_showings.tag}")
print(f"Attributes: {first_showings.attrib}")

<Element showings at 0x202038c37c0>
Tag: showings
Attributes: {'film_id': '227902', 'film_name': 'The Predator'}


In [8]:
# Position 1 is the second child
second_showings = root[1]
print(second_showings)
print(f"Tag: {second_showings.tag}")
print(f"Attributes: {second_showings.attrib}")

<Element showings at 0x202038c53c0>
Tag: showings
Attributes: {'film_id': '123456', 'film_name': 'Avengers'}


In [9]:
# Like normal lists, indexing a child that does not exist raises an IndexError.
# The error is the point of this demonstration, so we catch and display it here;
# an uncaught exception would stop "Restart Kernel -> Run All" at this cell.
try:
    third_showings = root[2]
except IndexError as err:
    print("IndexError:", err)

IndexError: list index out of range

In [None]:
# find() gives us the first child with the requested tag
vmax = first_showings.find("VMAX")
print(vmax)
print(f"Tag: {vmax.tag}")
print(f"Attributes: {vmax.attrib}")

# The line on which this tag appears (compare with the first xml output above)
print(f"Line number on which this tag appears: {vmax.sourceline}")

We can also loop over all sub-elements/child nodes by using the `iterchildren` and `iterdescendants` methods:

In [None]:
# Visit only the direct children of the first <showings> whose tag is VMAX
# and print the date of each session
for session in first_showings.iterchildren(tag="VMAX"):
    session_date = session.find("date")
    print(session_date.text)

In [None]:
# Visit every VMAX element found anywhere below the root (not just direct children)
# and print the date of each session
for session in root.iterdescendants(tag="VMAX"):
    session_date = session.find("date")
    print(session_date.text)

In [None]:
# Build the new cinema as a standalone Element object, then attach its attributes
new_cinema = etree.Element("cinema")
for attr_name, attr_value in (("cinema_id", "8932"), ("cinema_name", "Another Cinemas")):
    new_cinema.set(attr_name, attr_value)

# tostring() lets us "preview" the Element object as a string:
#   pretty_print=True   -> nicely format the xml with indentation
#   encoding="unicode"  -> ensure we use unicode
print(etree.tostring(new_cinema, pretty_print=True, encoding="unicode"))

In [None]:
# Create a new root node called cinemaList
new_root_node = etree.Element('cinemaList')

# Add the 2 <cinema> elements to this <cinemaList> element.
# Elements are appended like items in a list, BUT an lxml element can only have
# one parent: appending `root` itself would move it under <cinemaList> rather
# than copy it, silently changing the tree we parsed earlier. Appending a deep
# copy leaves the original tree untouched and keeps this cell safe to re-run.
new_root_node.append(copy.deepcopy(root)) # a copy of the original root node
new_root_node.append(new_cinema) # the new (standalone) element created above

In [None]:
# Serialise the whole <cinemaList> tree to text so we can preview it
print(etree.tostring(new_root_node, pretty_print=True, encoding="unicode"))

Finally, to write to an `XML` file, use the `.write()` method. It's important to note that we use `wb` to write in *binary* mode. 

In [None]:
# Wrap the new root node in an ElementTree so it can be written out as a new xml file
new_tree = etree.ElementTree(new_root_node)

with open("export_cinemaList.xml", mode="wb") as xml_out:
    # xml_declaration=True adds the prolog to the file
    new_tree.write(xml_out, xml_declaration=True)

In [None]:
# The cinema id is shared by every row
cinema_id = root.get('cinema_id')

# Flatten the tree into one record per show time: the outer loop walks each film
# (<showings>), the inner loop walks each of its sessions, whose tag is the session type
df_rows = [
    {
        'cinema_id': cinema_id,
        'film_id': film.get('film_id'),
        'session_type': show.tag,
        'date': show.find('date').text,
        'start_time': show.find('start_time').text,
    }
    for film in root.iterchildren(tag='showings')
    for show in film.iterchildren()
]

# Build the DataFrame once from the collected records
df = pd.DataFrame(df_rows)
df.head()

# JSON
- `JSON` (JavaScript Object Notation) is another common data format, often used as a lighter-weight alternative to `XML`.
- It works very similarly to a Python dictionary.

## Reading in JSON files
- `json.load()` reads `JSON` from an open file object.
- `json.loads()` reads `JSON` from a string (i.e. `json.loadSTRING()`, kind of an ambiguous naming convention).

`JSON` data is usually stored in a file with the extension `.json` and is commonly used to transfer complex data between servers. In Python, `JSON` objects are read into dictionaries and `JSON` arrays are read into lists.

In [None]:
# Sample cinema data as JSON text held in a Python string: a single "cinema" object
# with a list of "showings", each of which has a list of "times"
sample_json_string = """
{
  "cinema": {
    "cinema_id": 8941,
    "cinema_name": "Vue Cinemas - Reading",
    "showings": [
      {
        "film_id": 227902,
        "film_name": "The Predator",
        "times": [
          {
            "start_time": "14:30",
            "date": "2018-09-14"
          },
          {
            "start_time": "15:45",
            "date": "2018-09-15"
          }
        ]
      },
      {
        "film_id": 123456,
        "film_name": "Avengers",
        "times": [
          {
            "start_time": "17:05",
            "date": "2018-09-15"
          },
          {
            "start_time": "19:25",
            "date": "2018-09-16"
          },
          {
            "start_time": "18:00",
            "date": "2018-09-17"
          },
          {
            "start_time": "21:05",
            "date": "2018-09-17"
          },
          {
            "start_time": "10:05",
            "date": "2018-09-18"
          }
        ]
      }
    ]
  }
}
"""

Now, let's use `.loads()`.

**IMPORTANT**: `JSON` requires double quotes for the keys/values.
For example: `{'key': 'value'}` (incorrect) vs `{"key": "value"}` (correct)

In [None]:
# Parse the JSON text held in the string into Python objects (a dict at the top level),
# print the resulting type, then display the parsed content
sample_json_from_string = json.loads(sample_json_string)
print(type(sample_json_from_string))
sample_json_from_string

In [None]:
# Load JSON into a dict from a file.
# The encoding is given explicitly: without it, open() decodes with the platform's
# default encoding, which can differ between machines, whereas JSON files are
# normally UTF-8.
with open('sample_json.json', encoding='utf-8') as json_file:
    sample_json = json.load(json_file)

print(type(sample_json))
sample_json

`JSON` works pretty much the same as Python dictionaries, so this would be the equivalent of making a Python dictionary into a pandas DataFrame.


In [None]:
# The cinema id is shared by every row
cinema_id = sample_json['cinema']['cinema_id']

# Flatten the nested structure into one record per show time:
# the outer loop walks each film, the inner loop walks each of its times
df_rows = [
    {
        'cinema_id': cinema_id,
        'film_id': film['film_id'],
        'date': show['date'],
        'start_time': show['start_time'],
    }
    for film in sample_json['cinema']['showings']
    for show in film['times']
]

# Build the DataFrame once from the collected records
df = pd.DataFrame(df_rows)
df.head()

Export this DataFrame as `JSON` and save it as a file named `export_df.json`.

In [None]:
# Serialise the DataFrame to a JSON string, then parse it back into a dict
# so that json.dump() can write it out
json_str = df.to_json()

# Use a context manager so the file is flushed and closed before we read it
# back below (the original left the file handle open)
with open("export_df.json", "w") as json_file:
    json.dump(json.loads(json_str), json_file)

# Read the exported file back in to check the result
pd.read_json("export_df.json")

`.dump()` and `.dumps()` mirror `.load()` and `.loads()`:
- `json.dump()` writes `JSON` to an open file object.
- `json.dumps()` writes `JSON` to a string (i.e. `json.dumpSTRING()`)

Since we want to write the `JSON` to a file, we have used `.dump()`.