In [1]:
import ujson
import copy
import numpy as np
import scipy.stats
import sys

In [2]:
# Testing auto adding time output to all cells
'''
#To install...
wget https://raw.githubusercontent.com/cpcloud/ipython-autotime/master/autotime.py
#Make available via
jupyter nbextension install /Users/patrickmulrooney/class/notebooks/pjmulroo/DSECapstone/autotime.py
'''
 
%load_ext autotime

In [3]:
def log_progress(sequence, every=None, size=None):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{index} / ?'.format(index=index)
                else:
                    progress.value = index
                    label.value = u'{index} / {size}'.format(
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = str(index or '?')


time: 33.4 ms


In [4]:
# Read config from local file
GMAP_API_KEY = None

sys.path.append('/Users/patrickmulrooney/Desktop/')
import capstone_config

_file = capstone_config._INPUT_FILE

print _file

/Users/patrickmulrooney/Downloads/endomondoHR/endoHRParsed_no_outliers.json
time: 3.2 ms


In [4]:
#time: 12min 44s
# Full dataset
#_file = "/Users/patrickmulrooney/Downloads/endomondoHR/endoHRParsed.json"
# Partial dataset
#_file = "/Users/patrickmulrooney/Downloads/endomondoHR/endoHRParsed_10k.json"

endoHR = []

with open(_file, 'r') as f:
    endoHR += ujson.load(f)

time: 14min 11s


In [5]:
print "Length of import: %s"%(len(endoHR))

Length of import: 253020
time: 9.33 ms


## Remove outlier data
```
series_keys = { True: ['timestamp', 'heart_rate', 'altitude', 'latitude', 'longitude', 'speed' ], \
```

#### Using reasonable values from analysis

In [11]:
hr_removed = 0
alt_removed = 0
speed_removed = 0

for _k in endoHR:
    to_be_removed = []
    for _i, _v in enumerate(_k['series']):
        # Heart rate between 40 and 250
        if _v[1] < 40 or _v[1] > 250:
            hr_removed += 1
            to_be_removed.append(_i)
        # Altitude to be removed
        elif float(_v[2]) < -500.0 or float(_v[2]) > 4000.0:
            alt_removed += 1
            to_be_removed.append(_i)
        # Speed to be removed
        elif _k['speed_included'] and (float(_v[5]) < 0.0 or float(_v[5]) > 75.0):
            speed_removed += 1
            to_be_removed.append(_i)

    _k['series'] = [v for i, v in enumerate(_k['series']) if i not in to_be_removed]
            
exercises_to_be_removed = []
for _j, _k in enumerate(endoHR):
    # Remove anything that no longer has at least one series event
    if len(_k['series']) == 0:
        exercises_to_be_removed.append(_j)

endoHR = [v for i, v in enumerate(endoHR) if i not in exercises_to_be_removed]
            
print "Removed %s exercise records due to no remaining events"%len(exercises_to_be_removed)
print "Removed %s time series events entries due to heart rate"%hr_removed
print "Removed %s time series events entries due to altitude"%alt_removed
print "Removed %s time series events entries due to speed"%speed_removed

Removed 0 exercise records due to no remaining events
Removed 0 time series events entries due to heart rate
Removed 0 time series events entries due to altitude
Removed 0 time series events entries due to speed
time: 3min 25s


#### Using standard deviations.

In [9]:
hr_removed = 0
alt_removed = 0
speed_removed = 0

for _k in endoHR:
    to_be_removed = []
    for _i, _v in enumerate(np.abs(scipy.stats.zscore(_k['series'], axis=1)) < 2):
        # Heart rate between 40 and 250
        if _v[1] == False:
            hr_removed += 1
            to_be_removed.append(_i)
        # Altitude to be removed
        elif _v[2] == False:
            alt_removed += 1
            to_be_removed.append(_i)
        # Speed to be removed
        elif _k['speed_included'] and _v[5] == False:
            speed_removed += 1
            to_be_removed.append(_i)

    _k['series'] = [v for i, v in enumerate(_k['series']) if i not in to_be_removed]

exercises_to_be_removed = []
for _j, _k in enumerate(endoHR):
    # Remove anything that no longer has at least one series event
    if len(_k['series']) == 0:
        exercises_to_be_removed.append(_j)

endoHR = [v for i, v in enumerate(endoHR) if i not in exercises_to_be_removed]
            
print "Removed %s exercise records due to no remaining events"%len(exercises_to_be_removed)
print "Removed %s time series events entries due to heart rate"%hr_removed
print "Removed %s time series events entries due to altitude"%alt_removed
print "Removed %s time series events entries due to speed"%speed_removed

Removed 0 exercise records due to no remaining events
Removed 0 time series events entries due to heart rate
Removed 0 time series events entries due to altitude
Removed 0 time series events entries due to speed
time: 7min 15s


## Write parsed data to avoid needing to run this multiple times

In [12]:
# time: 3min 41s

with open(capstone_config._OUTPUT_FILE_3, 'w') as f:
     ujson.dump(endoHR, f)

time: 3min 29s
