In [1]:
"""
Simple Python script to render Apple Health data from Auto Exports
"""
from operator import truediv
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
from datetime import datetime
import boto3
import os
import yaml
import json
import flatdict

from ics import Calendar, Event

In [2]:
base_folder = os.getcwd() 
source_folder = base_folder + '/healthlake/'

In [3]:
os.listdir(source_folder)

['2022-09-16T05:18:49.175751.json', '.DS_Store']

In [27]:
import glob
apple_health_files = glob.glob(source_folder + '*.json')

fitness=[
    'carbohydrates',
    'dietary_caffeine',
    'dietary_energy',
    'dietary_sugar',
    'fiber',
    'protein',
    'sleep_analysis',
    'total_fat',
    'weight_body_mass'
]

nutrition=[
    'carbohydrates',
    'dietary_caffeine',
    'dietary_sugar',
    'fiber',
    'protein',
    'total_fat',
]

macros=[
    'carbohydrates',
    'total_fat',
    'protein'
]

fitess_cols = [
    'qty',
    'dates',
    'name',
    'units'
]
df_raw = pd.DataFrame()

for json_file in apple_health_files: 
    json_raw = pd.read_json(json_file, lines = True)
    df_raw = pd.concat([df_raw, json_raw])

df_raw['dates'] = pd.to_datetime(df_raw['date']).dt.date.apply(lambda x: str(x))
df_raw['qty'] = df_raw['qty'].fillna(df_raw['asleep'])

In [28]:
df_fitness = df_raw[df_raw['name'].isin(fitness)][fitess_cols]
df_fitness['qty'] = [np.round(qty_value, 1) for qty_value in df_fitness['qty']]
df_fitness['is_dietary'] = df_fitness['name'].isin(nutrition)
df_fitness.reset_index(drop=True,inplace=True)

In [53]:
def calculate_calories(
    carbs,
    fats,
    protein
): 
    calculated_calories = (carbs + protein) * 4 + 9 * fats
    
    return calculated_calories

In [85]:
df_fitness.query(f"dates=='2022-09-16' and name=='carbohydrates'")['qty'].values[0]

265.3

In [37]:
daily_macros

Unnamed: 0,qty,dates,name,units,is_dietary
0,184.1,2022-09-10,carbohydrates,g,True
29,184.6,2022-09-10,protein,g,True
43,51.2,2022-09-10,total_fat,g,True


In [93]:
for day in df_fitness['dates'].unique(): 
    carbs = df_fitness.query(f"dates=='{day}' and name=='carbohydrates'").qty.values[0]
    fibre = df_fitness.query(f"dates=='{day}' and name=='fiber'").qty.values[0]
    fats = df_fitness.query(f"dates=='{day}' and name=='total_fat'").qty.values[0]
    protein = df_fitness.query(f"dates=='{day}' and name=='protein'").qty.values[0]
        
    calories = np.ceil(4*carbs + 4*protein + 9*fats)
    print(calories)

1936.0
2113.0
2329.0
1943.0
2722.0
2576.0
2215.0


['carbohydrates',
 'carbohydrates',
 'carbohydrates',
 'carbohydrates',
 'carbohydrates',
 'carbohydrates',
 'carbohydrates',
 'protein',
 'protein',
 'protein',
 'protein',
 'protein',
 'protein',
 'protein',
 'total_fat',
 'total_fat',
 'total_fat',
 'total_fat',
 'total_fat',
 'total_fat',
 'total_fat']

In [6]:
json_raw.columns

Index(['qty', 'date', 'name', 'units', 'Avg', 'Min', 'Max', 'sleepStart',
       'sleepEnd', 'inBedEnd', 'sleepSource', 'asleep', 'inBed', 'inBedStart',
       'inBedSource', 'dates'],
      dtype='object')

In [10]:


df_fitness = json_raw[json_raw['name'].isin(fitness_values)].copy() 
df_fitness['qty'].fillna(df_fitness['asleep'])

49       184.146040
50       195.810606
51       224.112857
52       213.875907
53       373.249320
54       280.213590
55       265.301367
62         1.680000
69      8245.987184
70      8864.127976
71      8050.140166
72      8231.143326
73     10907.972432
74     10662.232900
75      9393.770293
76        57.317663
77        36.878788
78       109.922857
79       107.627057
80       160.307840
81       141.502937
82        69.183717
96        26.600000
97        28.465152
98        65.828571
99        46.463571
100       55.580000
101       19.505530
102       39.151667
194      184.586433
195      187.667112
196      179.605714
197      167.182214
198      174.078800
199      166.179737
200      127.826300
234        9.483333
235        7.283333
236        8.516667
237        7.600000
238        9.033333
239        6.466667
240        7.583333
275       51.163260
276       64.322504
277       79.277857
278       46.543307
279       59.248080
280       87.755190
281       71.298400


In [8]:
json_raw[json_raw['sleepStart'].notna()][['qty', 'date','name','units']]

Unnamed: 0,qty,date,name,units
234,,2022-09-10 16:05:00+09:30,sleep_analysis,hr
235,,2022-09-11 02:12:00+09:30,sleep_analysis,hr
236,,2022-09-12 03:45:00+09:30,sleep_analysis,hr
237,,2022-09-13 05:35:08+09:30,sleep_analysis,hr
238,,2022-09-13 21:15:00+09:30,sleep_analysis,hr
239,,2022-09-15 06:15:00+09:30,sleep_analysis,hr
240,,2022-09-16 04:43:03+09:30,sleep_analysis,hr


In [9]:
json_raw['name'].unique()

array(['active_energy', 'apple_exercise_time', 'apple_stand_hour',
       'apple_stand_time', 'basal_energy_burned',
       'blood_oxygen_saturation', 'calcium', 'carbohydrates', 'copper',
       'dietary_caffeine', 'dietary_cholesterol', 'dietary_energy',
       'dietary_sugar', 'dietary_water', 'environmental_audio_exposure',
       'fiber', 'flights_climbed', 'folate', 'handwashing',
       'headphone_audio_exposure', 'heart_rate', 'heart_rate_variability',
       'iron', 'magnesium', 'manganese', 'monounsaturated_fat', 'niacin',
       'pantothenic_acid', 'polyunsaturated_fat', 'potassium', 'protein',
       'respiratory_rate', 'resting_heart_rate', 'riboflavin',
       'saturated_fat', 'selenium', 'six_minute_walking_test_distance',
       'sleep_analysis', 'sodium', 'stair_speed_down', 'stair_speed_up',
       'step_count', 'thiamin', 'total_fat', 'vo2_max', 'vitamin_a',
       'vitamin_b12', 'vitamin_b6', 'vitamin_c', 'vitamin_d', 'vitamin_e',
       'vitamin_k', 'walking_runnin

In [114]:

# %%
def ts_to_dt(ts):
    return datetime.fromtimestamp(ts)

def process_health_data(file):
    """
    Create [date, source] columns from files read in.
    :param file: as exported by Auto Health Export / Autosleep
    """
    df = pd.read_csv(file, sep = ',')
    if len(df.columns) > 1:
        print(f'Processing: {file.name}')
        df['creation_date'] = ts_to_dt(file.stat().st_atime)
        df['filename'] = file.name

        return df

def read_raw_files(str_path):
    """
    Read all files in a directory and return a dataframe.
    :param str_path: directory path as type string
    """
    df_health = pd.DataFrame()
    df_sleep = pd.DataFrame()
    # valid_files = ['HealthAutoExport', 'AutoSleep']
    print('Reading files..')
    file_list = os.scandir(str_path)
    csv_files = [f for f in file_list if f.name.endswith('.csv')]

    for i in csv_files:
        df_tmp = process_health_data(i)
        if i.name.startswith('HealthAutoExport'):
            df_health = pd.concat([df_health, df_tmp])
        elif i.name.startswith('AutoSleep'):
            df_sleep = pd.concat([df_sleep, df_tmp])

    return df_health, df_sleep


# %% [markdown]
# ### Transformations

# %% [markdown]
# Functions to cleanse the data
# - Rename columns
# - Dedupe values
# - Cleanse trim all values to closest integer except for sleep and weight
# - Create the following columns
#   - `Calories`


# %%
def update_columns(df, col_map):
    """
    Rename columns for easier reference
    Styling follows lowercase and no units with spaces being replaced by _
    """

    df.rename(columns=col_map, inplace=True)

    # fill in values
    df = df.replace(r'^\s+$', np.nan, regex=True)

    # convert column types
    df['date'] = pd.to_datetime(df['date']).dt.date

    # force apply float64 type for weight
    df['body_weight'] = df['body_weight'].astype(float)

    # Update column types
    df['calories'] = df['carbs'] * 4 + df['fat'] * 9 + df['protein'] * 4
    df['sleep_eff'] = df['sleep_asleep'] / df['sleep_in_bed'] * 100
    df['sleep_eff'] = df['sleep_eff'].fillna(0)
    df['sleep_eff'] = df['sleep_eff'].astype('int64')

    # Create boolean for beating threshold
    df['exercise'] = [1 if x > 30 else 0 for x in df['exercise_mins'].fillna(0)]
    df['mindful'] = [1 if x > 5 else 0 for x in df['mindful_mins'].fillna(0)]

    return df

def round_df(df):
    """
    Round all numerical columns to closest integer except for one d.p. cols
    Replaces all NaN with null
    """
    one_dp_cols = ['sleep_asleep', 'sleep_in_bed', 'body_weight']
    for i in df.columns:
        if df[i].dtypes == 'float64':
            if i in one_dp_cols:
                df[i] = df[i].round(1)
            else:
                df[i] = np.floor(pd.to_numeric(df[i], errors= 'coerce')).astype('Int64')

    return df

def dedup_df(df):
    """
    Remove duplicates ordering by 'date' and 'creation_date' and then keep only the latest
    """
    df_sort = df.sort_values(['date', 'creation_date'], ascending= True)
    df_dedup = df_sort.drop_duplicates(subset = 'date', keep = 'last')

    return df_dedup


# %%
def create_description_cols(df, is_autosleep=False):
    """
    Create description columns for the generating events
    Converts events into boolean
    """
    print("Creating description columns for calendar events")
    # cleansing Autosleep data
    if is_autosleep:
        print("Updating sleep statistics")
        df['dsc_sleep'] =  df.agg(lambda x:
            f"Deep sleep: {x['deep']} \r\n"
            f"Sleep efficiency: {int(x['efficiency'])}% \r\n"
            f"Bedtime: 🌒 {x['bedtime']} \r\n"
            f"Wakeup time: 🌞 {x['waketime']}",
            axis=1
        )

        df['sleep'] = df.agg(lambda x: f"{x['asleep']}", axis = 1)
        print(df.head())
        return df

    # cleansing Apple Health Data
    else:
        df_wa = weekly_average(df)
        # adding commas to thousand
        # for i in df.columns:
        #     if df[i].dtypes == 'float64':
        #         df[i] = df[i].apply(lambda x: f"{x:,.1f}")
        #     elif df[i].dtypes in ('int64', 'Int64'):
        #         df[i] = df[i].map('{:,.0f}'.format)

        # print(df.columns)
        # # Create columns descriptions for event description
        # df['dsc_food'] = [f"{a}P / {b}C / {c}F\r\nFibre: {d} g" for a,b,c,d in zip(df['protein'], df['carbs'], df['fat'], df['fibre'])]

        # df['dsc_activity'] = df.agg(lambda x:
        #     f"{x['exercise_mins']} mins of exercise and "
        #     f"{x['mindful_mins']} mindful mins",
        #     axis=1
        # )

        # df['dsc_sleep'] = df.agg(lambda x:
        #     f"{x['sleep_asleep']} hrs asleep and "
        #     f"{x['sleep_in_bed']} hrs in bed",
        #     axis=1
        # )

        # # Create basic column descriptions for event names
        # df['food'] = [f"{a} calories" for a in df['calories']]
        # df['activity'] = [f"{a} steps" for a in df['steps']]
        # df['sleep'] = [f"{a} h ({b} % eff.)" for a,b in zip(df['sleep_asleep'], df['sleep_eff'])]
        # df['weight'] = [f"{a} kg" for a in df['body_weight']]

        # # Cleanse data
        # df['sleep'] = df['sleep'].replace('nan h (0% eff.)', 'No sleep data.')

        # # merge with weekly average data
        # df = pd.merge(left = df, right = df_wa, how = 'left', on = 'date').fillna('')
        # print(df[df['dsc_average'] != ""].head())

    return df,df_wa

def convert_autosleep_time(time, is_24h=False):
    """
    Converts time from a string; stripping the date and adding the AM / PM / hours and minutes
    """
    time_dt = time.split(" ")[-1][:5]

    if is_24h:
        time_dt = datetime.strptime(time_dt, "%H:%M")
        time_dt = time_dt.strftime("%-I:%M %p")
    else:
        hours = int(time_dt.split(":")[0])
        min = int(time_dt.split(":")[1])
        time_dt = f"{hours} h {min} m"

    return time_dt

def etl_autosleep_data(df):
    """
    Cleans autosleep data into correct formatting
    """
    #  Clean up the time columns with either 12 h format (AM / PM) or with hours and minutes
    time_dict = {
        '24h': ['bedtime', 'waketime'],
        'hrs': ['asleep', 'deep', 'asleepAvg7']
    }
    for time_type, time_cols in time_dict.items():
        is_24h = 0
        for time_col in time_cols:
            if time_type == '24h': is_24h = 1
            df[time_col] = df[time_col].apply(lambda x: convert_autosleep_time(x, is_24h))

    # Collect the date
    df['date'] = df['ISO8601'].apply(lambda x: datetime.strptime(x.split("T")[0], '%Y-%m-%d').date())

    df = create_description_cols(df, is_autosleep=True)

    # Remove duplicates
    df = dedup_df(df)

    return df


def get_config(config_file):
    """
    Generate configs are read from config.yml
    If no values defined, return as current working directory
    """
    config = yaml.load(open(config_file, "r"),  Loader=yaml.FullLoader)
    config = flatdict.FlatDict(config, delimiter = '.')
    for k, v in config.items():
        if not k.startswith('type'):
            if v == "": config[k] = os.getcwd()

    return config


In [115]:
config = get_config('config.yml')
input_path = config.get('input.raw_path')
output_path = config.get('output.raw_path')
output_cal = config.get('output.calendar_path')
output_file_name = config.get('output.file_name')
region = config.get('type.region')
col_map = config.get('col_map')

In [116]:
df, df_sleep = read_raw_files(input_path)

Reading files..
Processing: HealthAutoExport-2022-06-23-2022-06-29 Data.csv
Processing: HealthAutoExport-2022-07-06-2022-07-06 Data.csv
Processing: HealthAutoExport-2022-07-17-2022-07-23 Data.csv
Processing: HealthAutoExport-2022-07-19-2022-07-25 Data.csv
Processing: HealthAutoExport-2022-07-07-2022-07-07 Data.csv
Processing: HealthAutoExport-2022-07-16-2022-07-22 Data.csv
Processing: HealthAutoExport-2022-07-21-2022-07-27 Data.csv
Processing: AutoSleep-20220701-to-20220806.csv
Processing: HealthAutoExport-2022-07-30-2022-08-05 Data.csv
Processing: HealthAutoExport-2022-07-27-2022-08-02 Data.csv
Processing: HealthAutoExport-2022-07-29-2022-08-04 Data.csv
Processing: AutoSleep-20220601-to-20220630.csv
Processing: HealthAutoExport-2022-07-04-2022-07-10 Data.csv
Processing: HealthAutoExport-2022-08-01-2022-08-07 Data.csv
Processing: HealthAutoExport-2022-07-05-2022-07-11 Data.csv
Processing: HealthAutoExport-2022-06-28-2022-06-28 Data.csv
Processing: HealthAutoExport-2022-07-01-2022-07-01

In [117]:
df, df_sleep = read_raw_files(input_path)
df = update_columns(df, col_map)
df = round_df(df)
df = dedup_df(df)
if len(df_sleep) > 0:
    df_health_sleep = etl_autosleep_data(df_sleep)[['date', 'dsc_sleep', 'sleep', 'asleepAvg7']]
# df, df_wa = create_description_cols(df)
# df = df.reset_index(drop=True)

Reading files..
Processing: HealthAutoExport-2022-06-23-2022-06-29 Data.csv
Processing: HealthAutoExport-2022-07-06-2022-07-06 Data.csv
Processing: HealthAutoExport-2022-07-17-2022-07-23 Data.csv
Processing: HealthAutoExport-2022-07-19-2022-07-25 Data.csv
Processing: HealthAutoExport-2022-07-07-2022-07-07 Data.csv
Processing: HealthAutoExport-2022-07-16-2022-07-22 Data.csv
Processing: HealthAutoExport-2022-07-21-2022-07-27 Data.csv
Processing: AutoSleep-20220701-to-20220806.csv
Processing: HealthAutoExport-2022-07-30-2022-08-05 Data.csv
Processing: HealthAutoExport-2022-07-27-2022-08-02 Data.csv
Processing: HealthAutoExport-2022-07-29-2022-08-04 Data.csv
Processing: AutoSleep-20220601-to-20220630.csv
Processing: HealthAutoExport-2022-07-04-2022-07-10 Data.csv
Processing: HealthAutoExport-2022-08-01-2022-08-07 Data.csv
Processing: HealthAutoExport-2022-07-05-2022-07-11 Data.csv
Processing: HealthAutoExport-2022-06-28-2022-06-28 Data.csv
Processing: HealthAutoExport-2022-07-01-2022-07-01

In [102]:
df_health_sleep.columns

Index(['ISO8601', 'fromDate', 'toDate', 'bedtime', 'waketime', 'inBed',
       'awake', 'fellAsleepIn', 'sessions', 'asleep', 'asleepAvg7',
       'efficiency', 'efficiencyAvg7', 'quality', 'qualityAvg7', 'deep',
       'deepAvg7', 'sleepBPM', 'sleepBPMAvg7', 'dayBPM', 'dayBPMAvg7',
       'wakingBPM', 'wakingBPMAvg7', 'hrv', 'hrvAvg7', 'sleepHRV',
       'sleepHRVAvg7', 'SpO2Avg', 'SpO2Min', 'SpO2Max', 'respAvg', 'respMin',
       'respMax', 'tags', 'notes', 'creation_date', 'filename', 'date',
       'dsc_sleep', 'sleep'],
      dtype='object')

In [104]:
df_health_sleep.iloc[0:1]['asleep']

0    9 h 1 m
Name: asleep, dtype: object

In [120]:

def weekly_average(df, df_health_sleep):
    """
    Calculates the weekly average statistics for any numeric columns
    Resets on Sunday
    """
    wa_cols= ['date', 'body_weight', 'calories', 'carbs', 'exercise', 'exercise_mins', 'fat', 'fibre', 'protein', 'steps']
    
    # create rolling 7 day average but filter out for Sunday
    df_avg = df[wa_cols].rolling(on = 'date', window = 7).mean().dropna()
    df_avg['day'] = [i.weekday() for i in df_avg['date']]
    df_avg = df_avg[df_avg['day'] == 6].reset_index()

    # calculate total
    df_sum = df[['date', 'calories']].rolling(on = 'date', window = 7).sum().dropna()
    # join data together (average plus sleep)
    df_wa = round_df(
            pd.merge(
                left = pd.merge(
                    left = df_avg,
                    right = df_sum,
                    how = 'inner',
                    on = 'date'
                ),
                right = df_health_sleep,
                how = 'left',
                on = 'date'
        )
    )

    print(df_wa.columns)

    # renaming
    df_wa.rename(columns = {
        'calories_x': 'calories_avg',
        'calories_y': 'calories_sum'
    }, inplace = True)


    df_wa['dsc_average'] = df_wa.agg(lambda x:
        # f"{x['protein']} P / {x['carbs']} C / {x['fat']} F\r\n"
        # f"{x['body_weight']} kg\r\n"
        # f"{x['steps']} steps\r\n"
        f"{x['asleepAvg7']} of sleep\r\n"
        f"Total Budget: {x['calories_sum']} calories",
        axis = 1
    )

    df_wa['average'] = df_wa.agg(lambda x:
        f"{x['calories_avg']} calories",
        axis = 1
    )

    return df_wa[['date', 'dsc_average', 'average']]


weekly_average(df, df_health_sleep)

Index(['index', 'date', 'body_weight', 'calories_x', 'carbs', 'exercise',
       'exercise_mins', 'fat', 'fibre', 'protein', 'steps', 'day',
       'calories_y', 'dsc_sleep', 'sleep', 'asleepAvg7'],
      dtype='object')
   index        date  body_weight  calories_avg  carbs  exercise  \
0      1  2022-07-24         73.7          2088    231         1   
1      1  2022-07-31         73.3          1971    207         1   
2      6  2022-08-07         73.1          2004    219         1   

   exercise_mins  fat  fibre  protein  steps  day  calories_sum  \
0             96   57     28      160  15517    6         14619   
1             99   51     20      167  16801    6         13799   
2             96   48     31      170  15642    6         14032   

                                           dsc_sleep     sleep asleepAvg7  
0  Deep sleep: 2 h 48 m \r\nSleep efficiency: 100...  7 h 23 m   7 h 30 m  
1  Deep sleep: 1 h 45 m \r\nSleep efficiency: 100...  6 h 17 m   7 h 30 m  
2        

Unnamed: 0,date,dsc_average,average
0,2022-07-24,7 h 30 m of sleep\r\nTotal Budget: 14619 calories,2088 calories
1,2022-07-31,7 h 30 m of sleep\r\nTotal Budget: 13799 calories,1971 calories
2,2022-08-07,nan of sleep\r\nTotal Budget: 14032 calories,2004 calories


In [113]:
df

Unnamed: 0,date,carbs,protein,sleep_asleep,sleep_in_bed,steps,fat,body_weight,creation_date,filename,exercise_mins,fibre,mindful_mins,calories,sleep_eff,exercise,mindful
0,2022-06-01,279,104,9.0,9.3,10957,61,,2022-07-30 12:59:46.466966,HealthAutoExport-2022-06-01-2022-06-27 Data.csv,,,,2085,96,0,0
1,2022-06-02,308,146,,,10639,53,72.4,2022-07-30 12:59:46.466966,HealthAutoExport-2022-06-01-2022-06-27 Data.csv,,,,2300,0,0,0
2,2022-06-03,282,128,7.6,7.6,13124,52,,2022-07-30 12:59:46.466966,HealthAutoExport-2022-06-01-2022-06-27 Data.csv,,,,2119,100,0,0
3,2022-06-04,289,182,8.7,8.7,9345,86,,2022-07-30 12:59:46.466966,HealthAutoExport-2022-06-01-2022-06-27 Data.csv,,,,2671,100,0,0
4,2022-06-05,260,148,,,8875,78,,2022-07-30 12:59:46.466966,HealthAutoExport-2022-06-01-2022-06-27 Data.csv,,,,2341,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,2022-08-03,209,173,7.4,8.7,13792,59,73.1,2022-08-08 09:37:07.812804,HealthAutoExport-2022-08-01-2022-08-07 Data.csv,153,34,21,2065,85,1,1
3,2022-08-04,221,163,9.4,9.4,18936,44,72.9,2022-08-08 09:37:07.812804,HealthAutoExport-2022-08-01-2022-08-07 Data.csv,63,39,,1942,100,1,0
4,2022-08-05,249,191,,,15085,40,72.8,2022-08-08 09:37:07.812804,HealthAutoExport-2022-08-01-2022-08-07 Data.csv,71,36,,2127,0,1,0
5,2022-08-06,217,185,7.3,8.9,16240,47,73.3,2022-08-08 09:37:07.812804,HealthAutoExport-2022-08-01-2022-08-07 Data.csv,67,24,,2041,81,1,0
