# Data Insight

In [1]:
from functools import reduce
from typing import List, Callable
import json
import os

In [2]:
def show_length_stat(prop_name_list: List[str], data_set: List[dict]) -> None:
    print(f'{"prop_name".ljust(18)}{"prop_len".rjust(10)}{"prop_cnt".rjust(20)}{"prop_mean".rjust(20)}{"prop_max_len".rjust(20)}{"prop_min_len".rjust(20)}')
    for prop_name in prop_name_list:
        prop_len = reduce(lambda a, b: a + b.get(prop_name, 0) if b.get(prop_name, 0) != -1 else a, data_set, 0)
        prop_cnt = reduce(lambda a, b: a + 1 if b.get(prop_name, 0) != -1 else a, data_set, 0)
        prop_mean = prop_len//prop_cnt
        prop_max_len = max([x[prop_name] for x in data_set if x[prop_name] != -1])
        prop_min_len = min([x[prop_name] for x in data_set if x[prop_name] != -1])
        print(f'{prop_name.ljust(18)}{str(prop_len).rjust(10)}{str(prop_cnt).rjust(20)}{str(prop_mean).rjust(20)}{str(prop_max_len).rjust(20)}{str(prop_min_len).rjust(20)}')

## Song Data

In [3]:
def get_prop_name_list_song(file_path: str) -> List[str]:
    with open(file_path, 'r') as f:
        data = json.loads(f.read())
        return list(data.keys())

In [4]:
def get_data_set_song(data_src_dir: str) -> List[dict]:
    data_set = []
    for path, _, files in os.walk(data_src_dir):
        for file in files:
            with open(os.path.join(path, file), 'r') as f:
                data_set.append(json.loads(f.read()))
    return data_set

In [5]:
def parse_song_data(data: dict) -> dict:
    parsed = {}
    for key in data.keys():
        if key == 'year':
            parsed[key] = len(str(data[key])) if data[key] is not None and len(str(data[key])) == 4 else -1
        else:
            parsed[key] = len(str(data[key])) if data[key] else -1
    return parsed

In [7]:
# get the property names in song_data
prop_name_list_song = get_prop_name_list_song('data/song_data/A/A/A/TRAAAAW128F429D538.json')

In [9]:
# get data set of song data
data_set_song = get_data_set_song('data/song_data')

In [10]:
# parse song data to get lenght of each property.
parsed_data_set_song = list(map(parse_song_data, data_set_song))

In [11]:
show_length_stat(prop_name_list_song, parsed_data_set_song)

prop_name           prop_len            prop_cnt           prop_mean        prop_max_len        prop_min_len
num_songs                 71                  71                   1                   1                   1
artist_id               1278                  71                  18                  18                  18
artist_latitude          243                  31                   7                   8                   6
artist_longitude         273                  31                   8                  10                   7
artist_location          563                  43                  13                  29                   4
artist_name              953                  71                  13                  94                   3
song_id                 1278                  71                  18                  18                  18
title                   1435                  71                  20                  52                   5
duration           

## Log Data

In [12]:
def get_prop_name_list_log(file_path: str) -> List[str]:
    with open(file_path, 'r') as f:
        data = json.loads(next(f))
        return list(data.keys())

In [13]:
def get_data_set_log(data_src_dir: str) -> List[dict]:
    data_set = []
    for path, _, files in os.walk(data_src_dir):
        for file in files:
            with open(os.path.join(path, file), 'r') as f:
                for row in f:
                    data_set.append(json.loads(row))
    return data_set

In [14]:
def parse_log_data(data: dict) -> dict:
    parsed = {}
    for key in data.keys():
        parsed[key] = len(str(data[key])) if data[key] else -1
    return parsed

In [15]:
# get the property names in log data
prop_name_list_log = get_prop_name_list_log('data/log_data/2018/11/2018-11-01-events.json')

In [16]:
# get data set of log data
data_set_log = get_data_set_log('data/log_data')

In [17]:
parsed_data_set_log = list(map(parse_log_data, data_set_log))

In [18]:
show_length_stat(prop_name_list_log, parsed_data_set_log)

prop_name           prop_len            prop_cnt           prop_mean        prop_max_len        prop_min_len
artist                 89063                6820                  13                  89                   2
auth                   72790                8056                   9                  10                   9
firstName              43077                7770                   5                  10                   3
gender                  7770                7770                   1                   1                   1
itemInSession          11691                7115                   1                   3                   1
lastName               46185                7770                   5                   9                   3
length                 60388                6820                   8                  10                   6
level                  32224                8056                   4                   4                   4
location           