# JSON theory

Here is an overview of how to create, read and write JSON data using the `json` module, the Requests package and REST APIs.

In [1]:
# extension to help clean python code
%load_ext lab_black

## 1. Packages to install

In [2]:
# import json package (standard library)
import json

# import Path to read files by path (standard library)
from pathlib import Path

# import pandas software library
import pandas as pd

# import requests package
import requests

# normalize semi-structured JSON data into a flat table
from pandas import json_normalize

## 2. JSON strings

JSON (JavaScript Object Notation) is a standard format for exchanging data between a client (e.g. a browser) and a server. It is made of a set of key/value pairs that serialise objects in an easy way.

In [3]:
# sample JSON document written as a python (triple-quoted) string
# at this point it is plain text (type str), not yet a python dictionary
json_string = """
{
 "batch_tag": "March 2023", 
 "start_date": {"month": "08"}, 
 "curriculum": {"Week1": "Introduction to DS", "Week2":"JSON theory"},  
 "students": [
             {"name": "Mik", "surname": "Flu"},
             {"name": "Michel", "surname": "Flumen"},
             {"name": "Michael", "surname": "Fluffy"}
             ]
}
"""

In [4]:
# check type
type(json_string)

str

In [5]:
# what the variable looks like (\n marks a line break inside the string)
json_string

'\n{\n "batch_tag": "March 2023", \n "start_date": {"month": "08"}, \n "curriculum": {"Week1": "Introduction to DS", "Week2":"JSON theory"},  \n "students": [\n             {"name": "Mik", "surname": "Flu"},\n             {"name": "Michel", "surname": "Flumen"},\n             {"name": "Michael", "surname": "Fluffy"}\n             ]\n}\n'

## 3. Serialization and Deserialization

- Serialization -> convert data into a suitable format to be sent over the network
- Deserialization -> from network format to local data

### 3.1 Deserialization (receive reply from server to local)

In [6]:
# deserialize: json.loads ("load string") parses a JSON formatted str
# the top-level JSON object of json_string becomes a python dictionary
dict_from_string = json.loads(json_string)
dict_from_string

{'batch_tag': 'March 2023',
 'start_date': {'month': '08'},
 'curriculum': {'Week1': 'Introduction to DS', 'Week2': 'JSON theory'},
 'students': [{'name': 'Mik', 'surname': 'Flu'},
  {'name': 'Michel', 'surname': 'Flumen'},
  {'name': 'Michael', 'surname': 'Fluffy'}]}

In [7]:
# check type
type(dict_from_string)

dict

In [8]:
# get dictionary keys
dict_from_string.keys()

dict_keys(['batch_tag', 'start_date', 'curriculum', 'students'])

In [9]:
# get dictionary values
dict_from_string.values()

dict_values(['March 2023', {'month': '08'}, {'Week1': 'Introduction to DS', 'Week2': 'JSON theory'}, [{'name': 'Mik', 'surname': 'Flu'}, {'name': 'Michel', 'surname': 'Flumen'}, {'name': 'Michael', 'surname': 'Fluffy'}]])

In [10]:
# get (key, value) pairs as tuples
dict_from_string.items()

dict_items([('batch_tag', 'March 2023'), ('start_date', {'month': '08'}), ('curriculum', {'Week1': 'Introduction to DS', 'Week2': 'JSON theory'}), ('students', [{'name': 'Mik', 'surname': 'Flu'}, {'name': 'Michel', 'surname': 'Flumen'}, {'name': 'Michael', 'surname': 'Fluffy'}])])

JSON objects are deserialized into Python dictionaries. A dictionary key must always be hashable, so it cannot be a list or another dictionary (both are mutable); in JSON itself, keys are always strings. A value, on the other hand, can be almost anything.

### 3.2 Serialization (pass arguments to remote network)

In [11]:
# serialize: json.dumps ("dump string") converts the python dictionary
# into its JSON formatted str representation
new_json_string = json.dumps(dict_from_string)
new_json_string

'{"batch_tag": "March 2023", "start_date": {"month": "08"}, "curriculum": {"Week1": "Introduction to DS", "Week2": "JSON theory"}, "students": [{"name": "Mik", "surname": "Flu"}, {"name": "Michel", "surname": "Flumen"}, {"name": "Michael", "surname": "Fluffy"}]}'

In [12]:
# check type
type(new_json_string)

str

## 4. Write and read JSON files

### 4.1 Write (save)

In [13]:
# save (dump) the python dictionary to disk as a JSON file
# "w" opens the file for writing ("r", reading, is the default mode)
# an explicit encoding keeps the file independent of the platform default
with open("introduction_to_ds", "w", encoding="utf-8") as output:
    # json.dump writes the JSON text to the file object;
    # unlike json.dumps, it does not hand back a string to keep in memory
    json.dump(dict_from_string, output)

### 4.2 Read (open)

In [14]:
# open (load) the JSON file and deserialize it into a python dictionary
# json.load reads from a file object, whereas json.loads parses a string
# an explicit encoding avoids depending on the platform default
with open("introduction_to_ds", encoding="utf-8") as json_file:
    data_json1 = json.load(json_file)

data_json1

{'batch_tag': 'March 2023',
 'start_date': {'month': '08'},
 'curriculum': {'Week1': 'Introduction to DS', 'Week2': 'JSON theory'},
 'students': [{'name': 'Mik', 'surname': 'Flu'},
  {'name': 'Michel', 'surname': 'Flumen'},
  {'name': 'Michael', 'surname': 'Fluffy'}]}

In [15]:
# another way to load a json file: read the whole file as text, then parse it
# Path.read_text opens and closes the file itself, so no file handle is left
# open (a bare json.load(open(...)) never closes the file it opened)
data_json2 = json.loads(Path("introduction_to_ds").read_text(encoding="utf-8"))

data_json2

{'batch_tag': 'March 2023',
 'start_date': {'month': '08'},
 'curriculum': {'Week1': 'Introduction to DS', 'Week2': 'JSON theory'},
 'students': [{'name': 'Mik', 'surname': 'Flu'},
  {'name': 'Michel', 'surname': 'Flumen'},
  {'name': 'Michael', 'surname': 'Fluffy'}]}

In [16]:
type(data_json1), type(data_json2)

(dict, dict)

## 5. Read JSON from URL

To load JSON from a URL, use the `get` function of the `requests` library to send an HTTP GET request to the URL. Once the response is received, deserialize its body (`response.json()`) to get a Python dictionary.

In [17]:
# url of the JSON document to download
# NOTE(review): this is a plain http:// address - confirm whether an https endpoint should be used
url = "http://data.nba.net/prod/v1/20221026/0022200056_boxscore.json"

In [18]:
# send a GET request using the requests package
# timeout (seconds): without it the call can wait indefinitely on a silent server
response = requests.get(url, timeout=10)

# stop here on a 4xx/5xx answer instead of failing later while parsing an error page
response.raise_for_status()

In [19]:
# check for variable content
response.content

b'{"_internal":{"pubDateTime":"2022-10-12 16:41:46.979 EDT","igorPath":"domUpdater,1665607306006,1665607306822|feedProducer,1665607306886,1665607306991","routeName":"schedule_sync","routeValue":"2022_0022200056_20221026_schedule_sync","xslt":"NBA/xsl/game/boxscore/marty_boxscore.xsl","xsltForceRecompile":"true","xsltInCache":"false","xsltCompileTimeMillis":"79","xsltTransformTimeMillis":"12","consolidatedDomKey":"prod__transform__marty_boxscore__1662527206251","endToEndTimeMillis":"985"},"basicGameData":{"seasonStageId":2,"seasonYear":"2022","leagueName":"standard","gameId":"0022200056","arena":{"name":"Rocket Mortgage FieldHouse","isDomestic":true,"city":"Cleveland","stateAbbr":"OH","country":"USA"},"isGameActivated":false,"statusNum":1,"extendedStatusNum":0,"startTimeEastern":"7:00 PM ET","startTimeUTC":"2022-10-26T23:00:00.000Z","startDateEastern":"20221026","homeStartDate":"20221026","homeStartTime":"1900","visitorStartDate":"20221026","visitorStartTime":"1900","gameUrlCode":"20221

In [20]:
# deserialize the response body (JSON text -> python dictionary)
nba_json = response.json()

In [21]:
# check type
type(nba_json)

dict

In [22]:
nba_json

{'_internal': {'pubDateTime': '2022-10-12 16:41:46.979 EDT',
  'igorPath': 'domUpdater,1665607306006,1665607306822|feedProducer,1665607306886,1665607306991',
  'routeName': 'schedule_sync',
  'routeValue': '2022_0022200056_20221026_schedule_sync',
  'xslt': 'NBA/xsl/game/boxscore/marty_boxscore.xsl',
  'xsltForceRecompile': 'true',
  'xsltInCache': 'false',
  'xsltCompileTimeMillis': '79',
  'xsltTransformTimeMillis': '12',
  'consolidatedDomKey': 'prod__transform__marty_boxscore__1662527206251',
  'endToEndTimeMillis': '985'},
 'basicGameData': {'seasonStageId': 2,
  'seasonYear': '2022',
  'leagueName': 'standard',
  'gameId': '0022200056',
  'arena': {'name': 'Rocket Mortgage FieldHouse',
   'isDomestic': True,
   'city': 'Cleveland',
   'stateAbbr': 'OH',
   'country': 'USA'},
  'isGameActivated': False,
  'statusNum': 1,
  'extendedStatusNum': 0,
  'startTimeEastern': '7:00 PM ET',
  'startTimeUTC': '2022-10-26T23:00:00.000Z',
  'startDateEastern': '20221026',
  'homeStartDate': '

## 6. Explore deserialized JSON strings (= dictionaries)

In [23]:
# get request and deserialize all at once (this sends the request to url again)
# timeout (seconds): without it the call can wait indefinitely on a silent server
nba_json = requests.get(url, timeout=10).json()

In [24]:
# get (key, value) pairs as tuples
nba_json.items()

dict_items([('_internal', {'pubDateTime': '2022-10-12 16:41:46.979 EDT', 'igorPath': 'domUpdater,1665607306006,1665607306822|feedProducer,1665607306886,1665607306991', 'routeName': 'schedule_sync', 'routeValue': '2022_0022200056_20221026_schedule_sync', 'xslt': 'NBA/xsl/game/boxscore/marty_boxscore.xsl', 'xsltForceRecompile': 'true', 'xsltInCache': 'false', 'xsltCompileTimeMillis': '79', 'xsltTransformTimeMillis': '12', 'consolidatedDomKey': 'prod__transform__marty_boxscore__1662527206251', 'endToEndTimeMillis': '985'}), ('basicGameData', {'seasonStageId': 2, 'seasonYear': '2022', 'leagueName': 'standard', 'gameId': '0022200056', 'arena': {'name': 'Rocket Mortgage FieldHouse', 'isDomestic': True, 'city': 'Cleveland', 'stateAbbr': 'OH', 'country': 'USA'}, 'isGameActivated': False, 'statusNum': 1, 'extendedStatusNum': 0, 'startTimeEastern': '7:00 PM ET', 'startTimeUTC': '2022-10-26T23:00:00.000Z', 'startDateEastern': '20221026', 'homeStartDate': '20221026', 'homeStartTime': '1900', 'visi

In [25]:
# get keys only
nba_json.keys()

dict_keys(['_internal', 'basicGameData', 'previousMatchup'])

In [26]:
# get values only
nba_json.values()

dict_values([{'pubDateTime': '2022-10-12 16:41:46.979 EDT', 'igorPath': 'domUpdater,1665607306006,1665607306822|feedProducer,1665607306886,1665607306991', 'routeName': 'schedule_sync', 'routeValue': '2022_0022200056_20221026_schedule_sync', 'xslt': 'NBA/xsl/game/boxscore/marty_boxscore.xsl', 'xsltForceRecompile': 'true', 'xsltInCache': 'false', 'xsltCompileTimeMillis': '79', 'xsltTransformTimeMillis': '12', 'consolidatedDomKey': 'prod__transform__marty_boxscore__1662527206251', 'endToEndTimeMillis': '985'}, {'seasonStageId': 2, 'seasonYear': '2022', 'leagueName': 'standard', 'gameId': '0022200056', 'arena': {'name': 'Rocket Mortgage FieldHouse', 'isDomestic': True, 'city': 'Cleveland', 'stateAbbr': 'OH', 'country': 'USA'}, 'isGameActivated': False, 'statusNum': 1, 'extendedStatusNum': 0, 'startTimeEastern': '7:00 PM ET', 'startTimeUTC': '2022-10-26T23:00:00.000Z', 'startDateEastern': '20221026', 'homeStartDate': '20221026', 'homeStartTime': '1900', 'visitorStartDate': '20221026', 'visi

In [27]:
# check first key content
nba_json["_internal"]

{'pubDateTime': '2022-10-12 16:41:46.979 EDT',
 'igorPath': 'domUpdater,1665607306006,1665607306822|feedProducer,1665607306886,1665607306991',
 'routeName': 'schedule_sync',
 'routeValue': '2022_0022200056_20221026_schedule_sync',
 'xslt': 'NBA/xsl/game/boxscore/marty_boxscore.xsl',
 'xsltForceRecompile': 'true',
 'xsltInCache': 'false',
 'xsltCompileTimeMillis': '79',
 'xsltTransformTimeMillis': '12',
 'consolidatedDomKey': 'prod__transform__marty_boxscore__1662527206251',
 'endToEndTimeMillis': '985'}

In [28]:
# check second key content
nba_json["basicGameData"]

{'seasonStageId': 2,
 'seasonYear': '2022',
 'leagueName': 'standard',
 'gameId': '0022200056',
 'arena': {'name': 'Rocket Mortgage FieldHouse',
  'isDomestic': True,
  'city': 'Cleveland',
  'stateAbbr': 'OH',
  'country': 'USA'},
 'isGameActivated': False,
 'statusNum': 1,
 'extendedStatusNum': 0,
 'startTimeEastern': '7:00 PM ET',
 'startTimeUTC': '2022-10-26T23:00:00.000Z',
 'startDateEastern': '20221026',
 'homeStartDate': '20221026',
 'homeStartTime': '1900',
 'visitorStartDate': '20221026',
 'visitorStartTime': '1900',
 'gameUrlCode': '20221026/ORLCLE',
 'clock': '',
 'isBuzzerBeater': False,
 'isPreviewArticleAvail': False,
 'isRecapArticleAvail': False,
 'nugget': {'text': ''},
 'attendance': '',
 'tickets': {'mobileApp': 'https://a.data.nba.com/tickets/single/2022/0022200056/APP_TIX',
  'desktopWeb': 'https://a.data.nba.com/tickets/single/2022/0022200056/TEAM_SCH',
  'mobileWeb': 'https://a.data.nba.com/tickets/single/2022/0022200056/WEB_MWEB',
  'leagGameInfo': 'https://a.da

In [29]:
# check third key content
nba_json["previousMatchup"]

{'gameId': '0022101179', 'gameDate': '20220405'}

In [30]:
# further exploration of 'basicGameData' key
nba_json["basicGameData"].keys()

dict_keys(['seasonStageId', 'seasonYear', 'leagueName', 'gameId', 'arena', 'isGameActivated', 'statusNum', 'extendedStatusNum', 'startTimeEastern', 'startTimeUTC', 'startDateEastern', 'homeStartDate', 'homeStartTime', 'visitorStartDate', 'visitorStartTime', 'gameUrlCode', 'clock', 'isBuzzerBeater', 'isPreviewArticleAvail', 'isRecapArticleAvail', 'nugget', 'attendance', 'tickets', 'hasGameBookPdf', 'isStartTimeTBD', 'isNeutralVenue', 'gameDuration', 'period', 'vTeam', 'hTeam', 'watch', 'officials'])

In [31]:
# access 'gameId' from 'basicGameData'
nba_json["basicGameData"]["gameId"]

'0022200056'

In [32]:
# further exploration of 'previousMatchup' key
nba_json["previousMatchup"].keys()

dict_keys(['gameId', 'gameDate'])

In [33]:
# access 'gameDate' from 'previousMatchup'
nba_json["previousMatchup"]["gameDate"]

'20220405'

In [34]:
# iterate over the (key, value) pairs directly, no index needed
# only the key of each pair is printed
for key, value in nba_json["previousMatchup"].items():
    print(key)

gameId
gameDate


In [35]:
# loop through items with indexing -> enumerate yields (index, (key, value))
for ind, (key, value) in enumerate(nba_json["previousMatchup"].items()):
    print(f"{ind} - {key}")

0 - gameId
1 - gameDate


## 7. Normalize JSON data into a flat table

Once the relevant data has been found, normalize data via json_normalize into a flat table.

In [36]:
# flatten the 'previousMatchup' dictionary of nba_json into a table
# each key becomes a column (one column for gameId and one for gameDate)
previous_matchup_table = json_normalize(nba_json["previousMatchup"])

# keep the first rows only (head returns at most five)
df = previous_matchup_table.head()

df

Unnamed: 0,gameId,gameDate
0,22101179,20220405
