# Visualizing the JSON Input Data

In [1]:
import configparser
import json

import boto3
import botocore.exceptions
import pandas as pd

In [2]:
# Parse the local 'dwh.cfg' file; later cells read the AWS key and secret
# from its "AWS" section. The context manager guarantees the file handle is
# closed once parsing finishes instead of leaking it for the kernel's lifetime.
config = configparser.ConfigParser()
with open('dwh.cfg') as config_file:
    config.read_file(config_file)

## Create an S3 resource

In [3]:
# The resource and the client use the same region and credentials, so the
# keyword arguments are gathered once and unpacked into both constructors.
aws_session_kwargs = {
    "region_name": 'eu-west-3',
    "aws_access_key_id": config.get("AWS", "KEY"),
    "aws_secret_access_key": config.get("AWS", "SECRET"),
}
s3_ressource = boto3.resource('s3', **aws_session_kwargs)
s3_client = boto3.client('s3', **aws_session_kwargs)

## Check out the sample data sources on S3
Sanity check

In [4]:
# Sanity check: print the first three object summaries under the "ssbgz" prefix.
# .limit(3) stops the iteration after three items, so the full listing is not
# paged through and materialised just to preview a handful of keys.
sampleDbBucket = s3_ressource.Bucket("awssampledbuswest2")
for obj in sampleDbBucket.objects.filter(Prefix="ssbgz").limit(3):
    print(obj)

s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/customer0002_part_00.gz')
s3.ObjectSummary(bucket_name='awssampledbuswest2', key='ssbgz/dwdate.tbl.gz')


## Check out the S3 bucket for the project

In [5]:
# Collect every object summary under the log_data/ prefix of the project
# bucket, report how many there are, then preview the first three.
bucket_name = "udacity-dend"
dend_bucket = s3_ressource.Bucket(bucket_name)
all_objects = [summary for summary in dend_bucket.objects.filter(Prefix='log_data/')]
print(len(all_objects))
preview = all_objects[:3]
for summary in preview:
    print(summary)

31
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-01-events.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='log_data/2018/11/2018-11-02-events.json')


### Visualize one EVENT JSON file

#### First method: reading the file
This method will read the object as a string (file_content)

In [6]:
# Fetch one event log through the client API and decode its body to text.
event_key = 'log_data/2018/11/2018-11-01-events.json'
obj = s3_client.get_object(Bucket=bucket_name, Key=event_key)
raw_bytes = obj['Body'].read()
file_content = raw_bytes.decode('utf-8')
# Preview only the first 1000 characters of the file.
print(file_content[:1000])

{"artist":null,"auth":"Logged In","firstName":"Walter","gender":"M","itemInSession":0,"lastName":"Frye","length":null,"level":"free","location":"San Francisco-Oakland-Hayward, CA","method":"GET","page":"Home","registration":1540919166796.0,"sessionId":38,"song":null,"status":200,"ts":1541105830796,"userAgent":"\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/36.0.1985.143 Safari\/537.36\"","userId":"39"}
{"artist":null,"auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":0,"lastName":"Summers","length":null,"level":"free","location":"Phoenix-Mesa-Scottsdale, AZ","method":"GET","page":"Home","registration":1540344794796.0,"sessionId":139,"song":null,"status":200,"ts":1541106106796,"userAgent":"\"Mozilla\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\/537.36 (KHTML, like Gecko) Chrome\/35.0.1916.153 Safari\/537.36\"","userId":"8"}
{"artist":"Des'ree","auth":"Logged In","firstName":"Kaylee","gender":"F","itemInSession":1,"lastName

#### Failure:

Although the file has a `.json` extension, parsing its whole content as a single JSON document (e.g. with `json.loads(file_content)`) will fail, because the file is not one JSON object: each line is its own JSON object.    
The format is JSONL (JSON Lines).    
More explanation here:     
http://jsonlines.org/    
https://hackernoon.com/json-lines-format-76353b4e588d
- Since every entry in JSON Lines is a valid JSON it can be parsed/unmarshaled as a standalone JSON document. For example, you can seek within it, split a 10gb file into smaller files without parsing the entire thing.
- No need to read the whole file into memory before parsing it.
- You can easily add further lines to the file by simply appending to the file. If the entire file were a JSON array then you would have to parse it, add the new line, and then convert back to JSON.

##### JSON:
[{"artist":"foo"}, {"artist":"bar"}]    

##### JSONL:
{"artist": "foo"}\n    
{"artist": "bar"}\n

##### Alternative method to download the file and open it
From https://stackoverflow.com/questions/12451431/loading-and-parsing-a-json-file-with-multiple-json-objects     
Reading a JSON Lines file can be achieved by:
- Installing a package https://pypi.org/project/json-lines/
- Or: (see below):

In [7]:
# Download one event log to local disk and parse it as JSON Lines:
# every non-empty line is decoded as an independent JSON object.
filename = 'downloaded_file.json'
data = []
try:
    s3_ressource.Bucket(bucket_name).download_file('log_data/2018/11/2018-11-01-events.json', filename)
except botocore.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        # Missing object: report it and leave `data` empty rather than
        # trying to open a file that was never downloaded.
        print("The object does not exist.")
    else:
        raise
else:
    # Only parse when the download succeeded. The file is read as UTF-8
    # explicitly instead of relying on the platform default encoding.
    with open(filename, encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        data = [json.loads(line) for line in f if line.strip()]

In [8]:
# Display the record at index 6 of the parsed event list.
data[6]

{'artist': 'The Mars Volta',
 'auth': 'Logged In',
 'firstName': 'Kaylee',
 'gender': 'F',
 'itemInSession': 5,
 'lastName': 'Summers',
 'length': 380.42077,
 'level': 'free',
 'location': 'Phoenix-Mesa-Scottsdale, AZ',
 'method': 'PUT',
 'page': 'NextSong',
 'registration': 1540344794796.0,
 'sessionId': 139,
 'song': 'Eriatarka',
 'status': 200,
 'ts': 1541106673796,
 'userAgent': '"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"',
 'userId': '8'}

## Reading the JSON Path file
JSONPath is used to select and extract property values from a JSON document.    
It allows querying subsections of a JSON document.    
In this case, it is used to help read the JSON and output a constant structure

In [9]:
# Retrieve the JSONPaths definition file from the bucket and print it in full.
jsonpaths_key = 'log_json_path.json'
obj = s3_client.get_object(Bucket=bucket_name, Key=jsonpaths_key)
body_bytes = obj['Body'].read()
file_content = body_bytes.decode('utf-8')
print(file_content)

{
    "jsonpaths": [
        "$['artist']",
        "$['auth']",
        "$['firstName']",
        "$['gender']",
        "$['itemInSession']",
        "$['lastName']",
        "$['length']",
        "$['level']",
        "$['location']",
        "$['method']",
        "$['page']",
        "$['registration']",
        "$['sessionId']",
        "$['song']",
        "$['status']",
        "$['ts']",
        "$['userAgent']",
        "$['userId']"
    ]
}


### Reading the song json file

In [16]:
# Gather the object summaries whose key starts with song_data/A/A,
# print the total count, then preview the first three.
bucket_name = "udacity-dend"
dend_bucket = s3_ressource.Bucket(bucket_name)
all_objects = [summary for summary in dend_bucket.objects.filter(Prefix='song_data/A/A')]
print(len(all_objects))
preview = all_objects[:3]
for summary in preview:
    print(summary)

604
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAK128F9318786.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAV128F421A322.json')
s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAABD128F429CF47.json')


In [11]:
# Fetch a single song metadata file and print its decoded content.
song_key = 'song_data/A/A/A/TRAAAAK128F9318786.json'
obj = s3_client.get_object(Bucket=bucket_name, Key=song_key)
body_bytes = obj['Body'].read()
file_content = body_bytes.decode('utf-8')
print(file_content)

{"artist_id":"ARJNIUY12298900C91","artist_latitude":null,"artist_location":"","artist_longitude":null,"artist_name":"Adelitas Way","duration":213.9424,"num_songs":1,"song_id":"SOBLFFE12AF72AA5BA","title":"Scream","year":2009}


In [12]:
# Display the first ten entries of the current `all_objects` listing.
all_objects[:10]

[s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/'),
 s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAK128F9318786.json'),
 s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAAV128F421A322.json'),
 s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAABD128F429CF47.json'),
 s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAACN128F9355673.json'),
 s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAEA128F935A30D.json'),
 s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAED128E0783FAB.json'),
 s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAEM128F93347B9.json'),
 s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAEW128F42930C0.json'),
 s3.ObjectSummary(bucket_name='udacity-dend', key='song_data/A/A/A/TRAAAFD128F92F423A.json')]