In [None]:
# Housekeeping - This cell shortens the traceback of error messages
# unless you're on mybinder.org 

import json 
from pprint import pprint
import sys
ipython = get_ipython()

# Capture IPython's original traceback handler exactly once. Resetting the
# name to None on every run would make a re-run of this cell store the
# already-patched handler as the "original", so the full traceback could
# no longer be restored.
if "unhide_traceback" not in globals():
    unhide_traceback = ipython.showtraceback


def hide_traceback(exc_tuple=None, filename=None, tb_offset=None,
                   exception_only=False, running_compiled_code=False):
    """Display only the exception type and message, without the stack trace.

    The keyword arguments exist so this function can be assigned to
    ``ipython.showtraceback``; none of them is used.

    Returns:
        Whatever ``ipython._showtraceback`` returns for the exception that
        is currently being handled.
    """
    # The active exception is read from sys.exc_info(); exc_tuple is ignored.
    etype, value, tb = sys.exc_info()
    return ipython._showtraceback(etype, value, ipython.InteractiveTB.get_exception_only(etype, value))


ipython.showtraceback = hide_traceback   # or:  ipython.showtraceback = unhide_traceback



Pythoncamp 2020 Session Martin Borus, Twitter: @mborus

Translated - Session originally done in German

# GLOM

by Mahmoud Hashemi

## "If you have nested data, you need Glom!"

https://github.com/mahmoud/glom 





# Main features of Glom

- Path based access
- Declarative data transformation 
- Easy-to-read, helpful error messages
- Comes with debugging features


## Preparation for this notebook

pip install glom  # current version 20.5.0

pip install requests

Full docs and tutorial at https://glom.readthedocs.io,
some examples from this tutorial are included here

# 1. Normal Python

These are some examples of how to access nested data with regular Python

In [None]:
data = {'a': {'b': {'c': 'd'}}}

In [None]:
data['a']['b']['c']

In [None]:
data2 = {'a': {'b': None}}

In [None]:
data2['a']['b']['c']

# 2. Glom 

Here's how you do the same thing in Glom, using the data above

In [None]:
from glom import glom

In [None]:
glom(data, 'a.b.c')

In [None]:
# this will produce an error message
glom(data2, 'a.b.c')


In [None]:
# Glom is great for catching errors

from glom import GlomError, PathAccessError

try:
    glom(data2, 'a.b.c')
except PathAccessError as e:
    print(e)

In [None]:
try:
    glom(data2, 'a.b.c')
except AttributeError as e:
    print(e)

In [None]:
try:
    glom(data2, 'a.b.c')
except GlomError as e:
    print(e)

## You can use Glom with lists

In [None]:
data = [1, [2 , 3, 4] , 3, 4 ,5]

In [None]:
glom(data, '1.0')

## You can use Glom with objects!

In [None]:
class MyClass:
    """Minimal object with two string attributes, used as a glom target."""

    def __init__(self):
        # NOTE(review): "my_hallo" keeps the German spelling; the MyClass
        # defined later in this notebook names the same attribute
        # "my_hello" - confirm which spelling is intended.
        self.my_hallo = "hello!"
        self.my_world = "world!"

myvar = MyClass()

In [None]:
glom(myvar, 'my_world')

# 3. Target & Spec
- the "Target" is the data, (list, dict, object)
- the "Spec" is the wanted result

In [None]:
target = {
     'galaxy': {
        'system': {
            'planet': 'jupiter'
         }
    }
}

spec = 'galaxy.system.planet'

glom(target, spec)

In [None]:
target = {
    'system': {
        'planets': [
            {'name': 'earth', 'moons': 1},
            {'name': 'jupiter', 'moons': 69}
        ]
    }
}

spec = {
     'names': ('system.planets', ['name']),
     'moons': ('system.planets', ['moons'])
}

glom(target, spec)

In [None]:
target = {
     'system': {
         'planets': [
            {
                'name': 'earth',
                'moons': [
                    {'name': 'luna'}
                ]
            },
            {
                'name': 'jupiter',
                'moons': [
                    {'name': 'io'},
                    {'name': 'europa'}
                ]
            }
        ]
    }
}

In [None]:

spec = {
    'planet_names': ('system.planets', ['name']),
    'moon_names': ('system.planets', [('moons', ['name'],  )])
}
pprint(glom(target, spec))

In [None]:
# Coalesce: Try the first, if it fails, try the next, and so on....

from glom import Coalesce

target = {
     'system': {
         'planets': [
             {'name': 'earth', 'moons': 1},
             {'name': 'jupiter', 'moons': 69}
         ]
     }
}

spec = {
     'planets': (Coalesce('system.planets', 'system.dwarf_planets'), ['name']),
     'moons': (Coalesce('system.planets', 'system.dwarf_planets'), ['moons'])
}

glom(target, spec)

In [None]:
target = {
     'system': {
         'dwarf_planets': [
             {'name': 'pluto', 'moons': 5},
             {'name': 'ceres', 'moons': 0},
         ]
     }
 }
glom(target, spec)

In [None]:
# use python functions or lambdas inside the code

target = {
     'system': {
         'planets': [
             {'name': 'earth', 'moons': 1},
             {'name': 'jupiter', 'moons': 69}
         ]
     }
}

print(glom(target, {'moon_count': ('system.planets', ['moons'], sum)}))
    
print(glom(target, {'moon_count': ('system.planets', ['moons'], lambda x: sum(x))}))

In [None]:
# glom an object that contains a list of nested objects

class MySubClass:
    """Leaf object carrying a single string attribute, ``my_hey``."""

    def __init__(self):
        self.my_hey = "Hey!"


class MyClass:
    """Container object: two strings plus a list of six MySubClass items."""

    def __init__(self):
        self.my_hello = "hello!"
        self.my_world = "world!"
        # Build six distinct instances. `[MySubClass()] * 6` would place the
        # very same object into the list six times, so changing one entry
        # would silently change all of them.
        self.my_heylist = [MySubClass() for _ in range(6)]


myvar = MyClass()

In [None]:
from glom import Iter
spec = {'hello': 'my_hello', 'world': 'my_world', 'heylist': ('my_heylist', ['my_hey'])}

glom(myvar, spec)

# 4. Flatten and merge data

Convert a list of lists & list of dicts to a list or dict

In [None]:
from glom import Flatten, Merge

In [None]:
data = [[1, 2], [3], [4], [], [5]]

In [None]:
glom(data, Flatten())

In [None]:
data = [{'hello': 'world'}, {'hello2': 'world2'}]

In [None]:
glom(data, Merge())

# 5. from the tutorial: Convert objects into output.

This shows a more complicated example of how nested class objects are converted.
Read the official tutorial for an explanation of what happens here.

In [None]:
from glom.tutorial import * 

In [None]:
contact = Contact('Julian',
          emails=[Email(email='jlahey@svtp.info')],
                  location='Canada')

In [None]:
contact.save()

In [None]:
contact.primary_email

In [None]:
contact.add_date

In [None]:
contact.id

In [None]:
len(Contact.objects.all())

In [None]:
Contact.objects.all()

In [None]:
# without Glom: You can't dump the object to json

json.dumps(Contact.objects.all())

In [None]:
target = Contact.objects.all()

In [None]:
# have a look at the data type of the "add_date" - this is not dumpable
target[0].add_date

In [None]:
# Note: In this spec, the datetime and integers are converted to string.

spec = {'results': [{'id': 'id',
                      'name': 'name',
                      'add_date': ('add_date', str),
                      'emails': ('emails', [{'id': 'id',
                                            'email': 'email',
                                            'type': 'email_type'}]),
                      'primary_email': Coalesce('primary_email.email', default=None),
                      'pref_name': Coalesce('pref_name', 'name', skip='', default=''),
                      'detail': Coalesce('company',
                                         'location',
                                         ('add_date.year', str),
                                         skip='', default='')}]}

In [None]:
# with Glom: convert your target to something you can dump

resp = glom(target, spec)
print(json.dumps(resp, indent=2, sort_keys=True))

In [None]:
# This is the Flatten command from the previous section, used to get all emails in the result

from glom import Flatten

glom(resp, ('results', ['emails'],
             Flatten(),
             ['email'],
           )
    )


# 6. "T" - the Stunt-Double

T in the spec is a stand-in for anything at that position in the spec.
It behaves like the python object it matches. 

In the first example,
it behaves like a list, so T[0] is the first list object

In [None]:
from glom import T

In [None]:
from glom import T, Flatten
# from the data structure above, show the first entry
glom(resp, ('results', T[0]))

T also works with Objects. Very nice for Namedtuples.



In [None]:
from collections import namedtuple
# The typename passed to namedtuple() is what shows up in repr() and
# __name__, so it must match the variable name (it was misspelled 'HelloWord').
HelloWorld = namedtuple('HelloWorld', 'hello,world')
hw = HelloWorld('hello!', 'world!')

In [None]:
glom(hw, T.hello)

In [None]:
glom(hw, (T._asdict(), 'hello'))

# 7. Let's loop with Iter

In [None]:
# let's prepare a list of the Seven Dwarfs

from glom import glom, Iter

target = [
    'Happy',
    'Sneezy',
    'Sleepy',
    'Doc',
    'Bashful',
    'Grumpy',
    'Dopey'
]

In [None]:
# Iter returns a generator type
spec = Iter()
glom(target, spec)

In [None]:
# Get the first dwarf only
spec = Iter().first()
glom(target, spec)

In [None]:
# Convert the generator into a list with ".all"
spec = Iter().all()
glom(target, spec)

In [None]:
# Chunk the group in teams of 2
spec = Iter().chunked(2, fill='Snow White').all()
glom(target, spec)

In [None]:
# Find any dwarf that's not doc with a lambda
spec = Iter().filter(lambda x: x != 'Doc').all()
glom(target, spec)

In [None]:
# Hey, 3 dwarfs maximum!
spec = Iter().limit(3).all()
glom(target, spec)

In [None]:
# Find dwarfs 2 - 4 in this 0-based list
spec = Iter().slice(1, 4).all()
glom(target, spec)

In [None]:
# Only show dwarfs with unique first letters in the names
spec = Iter().unique(T[0]).all()
glom(target, spec)

In [None]:
# Take dwarfs until Doc arrives
spec = Iter().takewhile(lambda x: x != 'Doc').all()
glom(target, spec)

In [None]:
# Show the first dwarf following Doc
# dropwhile() discards names until it reaches 'Doc', slice(1, 2) then skips
# Doc himself, and first() returns the name right after him.
# The predicate must compare against 'Doc', the name that is actually in the
# list; 'Chef' (apparently a leftover from the German original) never
# matches, so every name was dropped and no dwarf was returned.
spec = Iter().dropwhile(lambda x: x != 'Doc').slice(1, 2).first()
glom(target, spec)

In [None]:
# Split the group around Doc
spec = Iter().split('Doc').all() 
glom(target, spec)

In [None]:
# Use a function on all
spec = Iter().map(lambda x:x.lower()).all()
glom(target, spec)


In [None]:
# or instead of lambda, the stunt double T
spec = Iter().map(T.upper()).all()
glom(target, spec)

# 8. Literal - assign a fixed value

In [None]:
from glom import Literal
spec = Iter({'Name': T, 'Size': Literal('Dwarflike')}).all()

In [None]:
glom(target, spec)

# 9. Data Driven

What to do if the dictionary keys themselves contain data

In [None]:
from glom import glom, T, Merge, Iter, Coalesce

target = {
    "pluto": {"moons": 6, "population": None},
    "venus": {"population": {"aliens": 5}},
    "earth": {"moons": 1, "population": {"humans": 7_700_000_000, "aliens": 1}},
}

spec = {
    "moons": (
           T.items(),
           Iter({T[0]: (T[1], Coalesce("moons", default=0))}),
           Merge()
    )
}
        
glom(target, spec)        

# 10. Add and remove values


In [None]:
data = {'moons': {'pluto': 6, 'venus': 0, 'earth': 1}}

In [None]:
from glom import Assign, Delete
spec = Assign('moons.saturn', 7)

In [None]:
glom(data, spec)

In [None]:
spec = Delete('moons.earth')

In [None]:
glom(data, spec)

In [None]:
spec = Delete('moons.mars', ignore_missing=True)

glom(data, spec)


# 11. Scope / Let

Scope allows you to collect data which would otherwise not be accessible from within a spec.

This is nice if you want to move data inside each element to get a flat structure you can import into a pandas DataFrame

In [None]:
from glom import S, glom, Assign, Spec

target = {'date': '2020-04-01',
 'location': 'A',
 'items': [
     {'name': 'A', 'id': 'A1'},
     {'name': 'B', 'id': 'B1'},
     {'name': 'C', 'id': 'C1'}
]}

spec = ('items', 
        [Assign('date', Spec(S['date']))], 
        [Assign('location', Spec(S['location']))]
       )

glom(target, spec, scope=target)

The same example with let:
Let writes to the scope at runtime.


In [None]:
from glom import Fill
from glom.core import Let

spec = (
    # Write outer value to scope
    Let(base={"date": "date", "location": "location"}),
    # select just the items
    "items",
    [
        # for every element: add base to element
        (Fill([T, S["base"]]), Merge())
    ]
)

glom(target, spec, scope=target)


# 12. XML

Simple example on how to glom XML

In [None]:
from glom import Ref

In [None]:
etree2dicts = Ref('ElementTree',
    {"tag": "tag", 
     "text": "text", 
     "attrib": "attrib", 
     "children": (iter, [Ref('ElementTree')])})

In [None]:
html_text = """<html>
  <head>
    <title>the title</title>
  </head>
  <body id="the-body">
    <p>A paragraph</p>
  </body>
</html>"""

In [None]:
from xml.etree import ElementTree
etree = ElementTree.fromstring(html_text)

In [None]:
glom(etree, etree2dicts)

# 13. Out of time...

Other things worth taking a look at:

- Path (A special Path notation for edge cases, where strings don't work)
- Invoke / Call (Use a function)
- Check -> CheckError (Check for Data consistency!)
- Inspect (Debugging helper: Print out what's visible inside the spec)




# 14. Glom, Comments and the code formatter "Black"

If you use Black on your code (https://pypi.org/project/black/), black may
make your glom spec a lot harder to read.

If your spec is somewhat complicated, add comments to it.

The example used above still works with comments inside:

    spec = (
        # Write outer value to scope
        Let(base={"date": "date", "location": "location"}),

        # select just the items
        "items",
        [
            # for every element: add base to element
            (Fill([T, S["base"]]), Merge())
        ]
    )

Without comments, Black will try to save space.

    spec = (
        Let(base={"date": "date", "location": "location"}),
        "items",
        [(Fill([T, S["base"]]), Merge())],
    )



# Real-world-like practice exercises

Note: These two examples are somewhat difficult and show real-world use cases where I
used glom in production. You can solve them with everything shown in this tutorial.

If you have problems, try a partial solution first and work towards the final result. 

A working example is provided for each exercise, not necessarily the best way to do it...

### Exercise 1:

Extract data from an online booking result
http://borus.de/pythoncamp/booking_example.json
        
Wanted result:

    {'departure_harbor_outward_trip': 'DKHNB',
     'departure_date_outward_trip': '2019-09-01',
     'departure_time_outward_trip': '14:30',
     'departure_harbor_return_trip': 'DELIS',
     'departure_date_return_trip': '2019-10-02',
     'departure_time_return_trip': '19:25',
     'tickets': ['CAR', 'AD', 'CH']
    }


In [None]:
# `requests` is imported here because this is the first cell that uses it;
# without the import a fresh "Restart & Run All" stops with a NameError.
import requests

r = requests.get(r'http://borus.de/pythoncamp/booking_example.json')
r.json()


In [None]:
# write the spec here

spec = "...???"

In [None]:
glom(r.json(), spec)


*For a solution, change this cell to "code" and run the code below.*

import base64;print(base64.b64decode(b'CnNwZWMgPSB7CiAgICAgICdkZXBhcnR1cmVfaGFyYm9yX291dHdhcmRfdHJpcCc6ICgndHJpcHMnLCAKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgSXRlcigpLmZpbHRlcihsYW1iZGEgeDogeFsndHJpcF9wYXJ0J10gPT0gJ09VVCcpLmZpcnN0KCksCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgJ2RlcGFydHVyZV9oYXJib3InKSwKICAgICAgICdkZXBhcnR1cmVfZGF0ZV9vdXR3YXJkX3RyaXAnOiAoJ3RyaXBzJywgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEl0ZXIoKS5maWx0ZXIobGFtYmRhIHg6IHhbJ3RyaXBfcGFydCddID09ICdPVVQnKS5maXJzdCgpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICdkYXRlJyksCiAgICAgICAnZGVwYXJ0dXJlX3RpbWVfb3V0d2FyZF90cmlwJzogKCd0cmlwcycsIAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBJdGVyKCkuZmlsdGVyKGxhbWJkYSB4OiB4Wyd0cmlwX3BhcnQnXSA9PSAnT1VUJykuZmlyc3QoKSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAndGltZScpLAogICAgICAgJ2RlcGFydHVyZV9oYXJib3JfcmV0dXJuX3RyaXAnOiAoJ3RyaXBzJywgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEl0ZXIoKS5maWx0ZXIobGFtYmRhIHg6IHhbJ3RyaXBfcGFydCddID09ICdSRVQnKS5maXJzdCgpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICdkZXBhcnR1cmVfaGFyYm9yJyksCiAgICAgICAnZGVwYXJ0dXJlX2RhdGVfcmV0dXJuX3RyaXAnOiAoJ3RyaXBzJywgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEl0ZXIoKS5maWx0ZXIobGFtYmRhIHg6IHhbJ3RyaXBfcGFydCddID09ICdSRVQnKS5maXJzdCgpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICdkYXRlJyksCiAgICAgICAnZGVwYXJ0dXJlX3RpbWVfcmV0dXJuX3RyaXAnOiAoJ3RyaXBzJywgCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIEl0ZXIoKS5maWx0ZXIobGFtYmRhIHg6IHhbJ3RyaXBfcGFydCddID09ICdSRVQnKS5maXJzdCgpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICd0aW1lJyksCiAgICAgICAndGlja2V0cyc6ICAoJ3RpY2tldHMnLCBJdGVyKCd0eXBlJykuYWxsKCkpCiAgICAgICB9Cg==').decode('utf-8'))




### Exercise 2: 

Take the pythoncamp room list at http://borus.de/pythoncamp/event.json and create a dictionary that has all room names in capital letters as keys and the URLs as values.

Wanted result

    {
     'BERLIN': 'https://bbb01.pythoncamp.online/b/rei-hyz-cgm',
     'FLIEGENDER ZIRKUS': 'https://bbb01.pythoncamp.online/b/rei-hyz-cgm',
     'TOKIO': 'https://bbb01.pythoncamp.online/b/rei-hyz-cgm'
     ...
     }

In [None]:
import requests

In [None]:
r = requests.get(r'http://borus.de/pythoncamp/event.json')

In [None]:
r.json()

In [None]:
# write spec here
spec = "... ???"


In [None]:
glom(r.json(), spec)

*For a solution, change this cell to "code" and run the code below.*

import base64;print(base64.b64decode(b'c3BlYz0gKCdyb29tcycsIAogICAgICAgICAgIChJdGVyKHtUWyduYW1lJ10udXBwZXIoKTogVFsndXJsJ119KSwKICAgICAgICAgICBNZXJnZSgpICAgICAgCiAgICAgICAgICAgKQogICAgICAp'
).decode('utf-8'))

In [None]:
# Tool - Exercise hider - Encodes a solution
import base64
# Put the solution source between the triple quotes; the printed bytes
# literal is what gets pasted into the hidden-solution cells.
solution = bytes("""my solution""", 'utf-8')
print(base64.b64encode(solution))