In [525]:
import datetime
import os

import boto3
import numpy as np
import pandas as pd
from ics import Calendar, Event

In [6]:
def ts_to_dt(ts):
    """
    Convert a POSIX timestamp into a naive datetime in the local timezone.
    :param ts: seconds since the epoch as type int or float
    :return: the corresponding datetime.datetime (no tzinfo attached)
    """
    # Import under a private alias instead of relying on the global name
    # `datetime`: another cell in this notebook runs
    # `from datetime import datetime`, which rebinds that global from the
    # module to the class and would make `datetime.datetime` fail here.
    from datetime import datetime as _datetime

    return _datetime.fromtimestamp(ts)

In [176]:
def process_raw_data(file):
    """
    Load one exported CSV and tag every row with where it came from.
    :param file: entry for a CSV exported by Auto Health Export; it must be
        path-like and provide `.stat()` and `.name` (e.g. an `os.DirEntry`)
    :return: dataframe with the CSV contents plus two extra columns:
        `creation_date` (timestamp of the file) and `filename`
    """
    df = pd.read_csv(file, sep = ',')

    # Use the modification time, not the access time: the access time can be
    # updated simply by reading the file (which the line above just did), so it
    # would not reflect when the export was written. dedup_df relies on this
    # column to keep the most recent export for each date.
    df['creation_date'] = ts_to_dt(file.stat().st_mtime)
    df['filename'] = file.name

    return df

In [189]:
def read_raw_files(str_path):
    """
    Read every Auto Health Export file in a directory into one dataframe.
    Only files whose name starts with 'HealthAutoExport' and ends with
    'Data.csv' are read.
    :param str_path: directory path as type string
    :return: dataframe with the rows of all matching files and a fresh
        0..n-1 index; an empty dataframe when no file matches
    """
    print('Reading files..')
    frames = []
    # The context manager closes the directory handle even if a read fails.
    with os.scandir(str_path) as entries:
        for entry in entries:
            if entry.name.startswith('HealthAutoExport') and entry.name.endswith('Data.csv'):
                print(f'Reading: {entry.name}')
                frames.append(process_raw_data(entry))

    if not frames:
        # pd.concat raises on an empty list, so return the empty frame directly.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop;
    # ignore_index replaces the repeated per-file indices with 0..n-1.
    return pd.concat(frames, ignore_index=True)

### Transformations

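Functions to cleanse the data:
- Deduplicate rows, keeping the most recent export for each date
- Convert all numeric values to the closest integer, except for sleep durations and weight, which are kept to one decimal place
- Create the following columns
  - `calories`
  - `sleep_eff`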

In [491]:
def create_numeric_cols(df):
    """
    Add the derived numeric columns `calories` and `sleep_eff` to the frame.
    :param df: dataframe with `carbs`, `fat`, `protein`, `sleep_asleep` and
        `sleep_in_bed` columns; it is modified in place
    :return: the same dataframe with the two new columns
    """
    # energy per gram: 4 for carbs and protein, 9 for fat
    carb_energy = df['carbs'] * 4
    fat_energy = df['fat'] * 9
    protein_energy = df['protein'] * 4
    df['calories'] = carb_energy + fat_energy + protein_energy

    # share of time in bed spent asleep, as a whole percentage;
    # rows without sleep data end up as 0
    efficiency = df['sleep_asleep'] / df['sleep_in_bed'] * 100
    df['sleep_eff'] = efficiency.fillna(0).astype(int)

    return df

In [492]:
def round_df(df):
    """
    Round all float columns to the closest integer, except for the sleep and
    weight columns which are rounded to one decimal place.
    Missing values are kept and returned as None.
    :param df: dataframe to round; it is not modified
    :return: new dataframe with rounded values and None in place of NaN
    """
    one_dp_cols = ['sleep_asleep', 'sleep_in_bed', 'weight']

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    for col in df.columns:
        if df[col].dtypes != 'float64':
            continue

        if col in one_dp_cols:
            df[col] = df[col].round(1)
        elif df[col].isna().any():
            # An integer column cannot hold NaN and astype(int) raises on it,
            # so build an object column of ints with None for the gaps.
            rounded = df[col].round()
            df[col] = pd.Series(
                [None if pd.isna(value) else int(value) for value in rounded],
                index=df.index,
                dtype=object,
            )
        else:
            # Round first: astype(int) alone would truncate towards zero.
            df[col] = df[col].round().astype(int)

    df = df.replace({np.nan: None})
    return df

In [493]:
def convert_column_types(df):
    """
    Normalise the types of the `date` and `weight` columns.
    :param df: dataframe with `date` and `weight` columns; modified in place
    :return: the same dataframe, with `date` holding datetime.date values
        and `weight` stored as float
    """
    # keep only the calendar day of each parsed timestamp
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates.dt.date

    # weight may have been read as text, so coerce it to float explicitly
    weight_as_float = df['weight'].astype(float)
    df['weight'] = weight_as_float

    return df

In [494]:
def rename_columns(df):
    """
    Give the exported columns short names, treat whitespace-only cells as
    missing and normalise the column types.
    Naming style: lowercase, no units, spaces replaced by _
    :param df: raw dataframe; its columns are renamed in place
    :return: dataframe with the short column names, NaN instead of
        whitespace-only cells, and converted `date` / `weight` columns
    """
    short_names = {
        'Date': 'date',
        'Carbohydrates (g)': 'carbs',
        'Protein (g)': 'protein',
        'Total Fat (g)': 'fat',
        'Step Count (count)': 'steps',
        'Sleep Analysis [Asleep] (hr)': 'sleep_asleep',
        'Sleep Analysis [In Bed] (hr)': 'sleep_in_bed',
        'Weight & Body Mass (kg) ': 'weight',
    }
    df.rename(short_names, axis='columns', inplace=True)

    # cells that contain nothing but whitespace count as missing values
    blanks_as_nan = df.replace(r'^\s+$', np.nan, regex=True)

    return convert_column_types(blanks_as_nan)

In [495]:
def dedup_df(df):
    """
    Keep a single row per `date`: the one with the greatest `creation_date`.
    :param df: dataframe with `date` and `creation_date` columns
    :return: dataframe sorted by `date` with one row per date
    """
    return (
        df
        .sort_values(by=['date', 'creation_date'], ascending=True)
        # after sorting, the last row of each date is the latest one
        .drop_duplicates(subset='date', keep='last')
    )

In [506]:
def create_description_cols(df):
    """
    Build the text shown in the calendar events.
    Adds three string columns: `food`, `activity` and `sleep`.
    :param df: dataframe with `calories`, `carbs`, `protein`, `fat`, `steps`,
        `sleep_asleep` and `sleep_eff` columns; it is modified in place
    :return: the same dataframe with the three description columns
    """
    # stringify everything once so the pieces can be glued together
    text = df.astype(str)

    df['food'] = [
        f"{calories} calories ({carbs}C/{protein}P/{fat}F)"
        for calories, carbs, protein, fat in zip(
            text['calories'], text['carbs'], text['protein'], text['fat']
        )
    ]
    df['activity'] = [f"{steps} steps" for steps in text['steps']]

    sleep_labels = [
        f"{asleep} h ({eff}% eff.)"
        for asleep, eff in zip(text['sleep_asleep'], text['sleep_eff'])
    ]
    # a missing night renders as this exact placeholder; swap in a readable note
    df['sleep'] = [
        'No sleep data.' if label == 'None h (0% eff.)' else label
        for label in sleep_labels
    ]

    return df

In [544]:
def generate_calendar(df, **kwargs):
    """
    Generates a CSV and an ICS file from the dataframe.
    Both files are written to `<output_path>/<file_name>` with the extensions
    `.csv` and `.ics`.
    :param df: cleansed dataframe from `create_description_cols`
    :param output_path: (keyword) directory to write to, as type string
    :param file_name: (keyword) file name without extension, as type string
    :return: the input dataframe, unchanged
    :raises KeyError: if `output_path` or `file_name` is not supplied
    """
    event_cols = ['food', 'activity', 'sleep']

    # one row per (date, event type) with its description text
    df_event = df[['date'] + event_cols].melt(
        id_vars = ['date'],
        value_vars = event_cols,
        var_name = 'type',
        value_name = 'description'
    )

    # Shared base path so the .ics lands next to the .csv rather than in the
    # current working directory.
    base_path = kwargs['output_path'] + "/" + kwargs['file_name']
    df_event.to_csv(base_path + '.csv')

    c = Calendar()
    for _, row in df_event.iterrows():
        e = create_event(row['date'], row['type'], row['description'])
        c.events.add(e)

    # Event names contain emoji, so do not depend on the platform's default
    # encoding; the with-block closes the file on exit.
    with open(base_path + '.ics', 'w', encoding='utf-8') as f:
        f.write(str(c))

    return df

In [545]:
def transform(df, **kwargs):
    """
    Run the cleansing steps on the raw data and export the calendar files.
    Steps, in order: rename columns, round values, dedupe by date, add the
    numeric and description columns, write the CSV / ICS files.
    :param df: dataframe from the read_raw_files function
    :param kwargs: passed on to `generate_calendar` (`output_path`, `file_name`)
    :return: the transformed dataframe; an empty input is returned as is and
        no files are written
    """
    if len(df) == 0:
        # Nothing to process: return the empty frame so callers always get a
        # dataframe back instead of None.
        return df

    df = rename_columns(df)
    df = round_df(df)
    df = dedup_df(df)
    df = create_numeric_cols(df)
    df = create_description_cols(df)
    df = generate_calendar(df, **kwargs)
    return df

In [548]:
def etl_raw_data(**kwargs):
    """
    Perform ETL on Apple Health data: read the raw exports, then transform them.
    :param input_path: (keyword) directory to read from as type string;
        defaults to the current working directory
    :param output_path: (keyword) directory to write to as type string;
        defaults to the current working directory
    :param file_name: (keyword) output file name without extension;
        defaults to 'apple-health-calendar'
    :return: whatever `transform` returns for the data that was read
    """
    source_dir = kwargs.get('input_path', os.getcwd())
    target_dir = kwargs.get('output_path', os.getcwd())
    calendar_name = kwargs.get('file_name', 'apple-health-calendar')

    raw = read_raw_files(source_dir)

    return transform(raw, output_path = target_dir, file_name = calendar_name)

In [551]:
def create_event(date, type, description):
    """
    Create an all day event for the given date and type
    :param date: date as type datetime.date
    :param type: type of event as type string; one of 'sleep', 'activity', 'food'
    :param description: description of event as type string
    :return: the populated Event
    :raises ValueError: if `type` is not one of the supported event types
    """
    emoticons = {
        'sleep': "💤",
        'activity': "🔥",
        'food': "🥞",
    }
    # Validate up front: previously an unknown type left the emoticon variable
    # unassigned and failed later with an UnboundLocalError.
    if type not in emoticons:
        raise ValueError(
            f"Unknown event type {type!r}; expected one of {sorted(emoticons)}"
        )

    all_day_date = str(date) + " 00:00:00"
    e = Event()
    e.name = emoticons[type] + " " + description
    e.begin = all_day_date
    e.end = all_day_date
    e.make_all_day()

    return e

In [5]:
import pandas as pd
from datetime import datetime
# %%
def ts_to_dt(ts):
    """
    Convert a POSIX timestamp into a naive datetime in the local timezone.
    :param ts: seconds since the epoch as type int or float
    :return: the corresponding datetime (no tzinfo attached)
    """
    # Import under a private alias instead of relying on the global name
    # `datetime`: the first cell of this notebook runs `import datetime`, and
    # re-running it rebinds that global to the module, on which
    # `datetime.fromtimestamp` does not exist.
    from datetime import datetime as _datetime

    return _datetime.fromtimestamp(ts)

def process_health_data(file):
    """
    Load one exported CSV and tag every row with where it came from.
    :param file: entry for an exported CSV; it must be path-like and provide
        `.stat()` and `.name` (e.g. an `os.DirEntry`)
    :return: dataframe with the CSV contents plus two extra columns:
        `creation_date` (timestamp of the file) and `filename`
    """
    df = pd.read_csv(file, sep = ',')
    print(f'Processing: {file}')

    # Use the modification time, not the access time: the access time can be
    # updated simply by reading the file (which read_csv just did), so it
    # would not reflect when the export was written.
    df['creation_date'] = ts_to_dt(file.stat().st_mtime)
    df['filename'] = file.name

    return df

def read_raw_files(str_path):
    """
    Read the exported CSV files in a directory and return the AutoSleep data.
    Files starting with 'HealthAutoExport' or 'AutoSleep' are parsed; other
    files are skipped without being read.
    :param str_path: directory path as type string
    :return: dataframe with the rows of all AutoSleep files and a fresh
        0..n-1 index; an empty dataframe when there is no AutoSleep file
    """
    health_frames = []
    sleep_frames = []
    print('Reading files..')
    # The context manager closes the directory handle even if a read fails.
    with os.scandir(str_path) as entries:
        for entry in entries:
            if not entry.name.endswith('.csv'):
                continue
            if entry.name.startswith('HealthAutoExport'):
                health_frames.append(process_health_data(entry))
            elif entry.name.startswith('AutoSleep'):
                sleep_frames.append(process_health_data(entry))

    # NOTE(review): the health exports are parsed but not returned yet; only
    # the AutoSleep data is handed back, as before.
    if not sleep_frames:
        # pd.concat raises on an empty list, so return an empty frame directly.
        return pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop;
    # ignore_index replaces the repeated per-file indices with 0..n-1.
    return pd.concat(sleep_frames, ignore_index=True)

In [6]:
# Resolve the iCloud export folder relative to the current user's home
# directory so the notebook is not tied to one machine's user name.
raw_export_dir = os.path.expanduser(
    '~/Library/Mobile Documents/iCloud~com~ifunography~HealthExport/Documents/raw'
)
df_sleep = read_raw_files(raw_export_dir)

Reading files..
Processing: <DirEntry 'HealthAutoExport-2022-06-23-2022-06-29 Data.csv'>
Processing: <DirEntry 'AutoSleep-20220601-to-20220630.csv'>
Processing: <DirEntry 'HealthAutoExport-2022-06-28-2022-06-28 Data.csv'>
Processing: <DirEntry 'HealthAutoExport-2022-07-01-2022-07-01 Data.csv'>
Processing: <DirEntry 'HealthAutoExport-2022-06-01-2022-06-27 Data.csv'>
Processing: <DirEntry 'HealthAutoExport-2022-06-30-2022-06-30 Data.csv'>
Processing: <DirEntry 'HealthAutoExport-2022-06-26-2022-06-26 Data.csv'>
Processing: <DirEntry 'HealthAutoExport-2022-06-27-2022-06-27 Data.csv'>


In [12]:
def convert_time_from_string(time):
    """
    Parse the HH:MM part of a time string into a datetime.
    Anything before the last space (e.g. a date) and the seconds are dropped.
    The result carries strptime's placeholder date 1900-01-01, so only the
    hour and minute are meaningful.
    :param time: string such as '2022-05-31 20:47:00' or '09:19:00'
    :return: datetime.datetime holding the parsed hour and minute
    :raises ValueError: if the trailing part is not in HH:MM format
    """
    # Import under a private alias instead of relying on the global name
    # `datetime`, which is bound to the module by one import cell of this
    # notebook and to the class by another.
    from datetime import datetime as _datetime

    clock_part = time.split(" ")[-1][:5]

    # For a 12-hour label, format the result with strftime("%I:%M %p").
    return _datetime.strptime(clock_part, "%H:%M")

In [13]:
def etl_autosleep_data(df):
    """
    Add parsed versions of the AutoSleep time columns.
    For each of `bedtime`, `waketime`, `inBed` and `deep`, a new column with
    the suffix `_1` holds the value parsed by `convert_time_from_string`.
    :param df: AutoSleep dataframe; it is modified in place
    :return: the same dataframe with the four `_1` columns added
    """
    for source_col in ['bedtime', 'waketime', 'inBed', 'deep']:
        parsed_col = source_col + '_1'
        df[parsed_col] = df[source_col].apply(convert_time_from_string)

    return df

In [14]:
# Inspect the column names of the loaded AutoSleep dataframe.
df_sleep.columns

Index(['ISO8601', 'fromDate', 'toDate', 'bedtime', 'waketime', 'inBed',
       'awake', 'fellAsleepIn', 'sessions', 'asleep', 'asleepAvg7',
       'efficiency', 'efficiencyAvg7', 'quality', 'qualityAvg7', 'deep',
       'deepAvg7', 'sleepBPM', 'sleepBPMAvg7', 'dayBPM', 'dayBPMAvg7',
       'wakingBPM', 'wakingBPMAvg7', 'hrv', 'hrvAvg7', 'SpO2Avg', 'SpO2Min',
       'SpO2Max', 'respAvg', 'respMin', 'respMax', 'tags', 'notes',
       'creation_date', 'filename'],
      dtype='object')

In [15]:
# Preview the parsed time columns.
# Note: etl_autosleep_data adds the `_1` columns to df_sleep itself (in place).
etl_autosleep_data(df_sleep)[['bedtime_1', 'waketime_1', 'inBed_1', 'deep_1']]


Unnamed: 0,bedtime_1,waketime_1,inBed_1,deep_1
0,1900-01-01 20:47:00,1900-01-01 06:06:00,1900-01-01 09:19:00,1900-01-01 00:15:00
1,1900-01-01 22:04:00,1900-01-01 05:11:00,1900-01-01 07:07:00,1900-01-01 02:17:00
2,1900-01-01 22:03:00,1900-01-01 05:36:00,1900-01-01 07:33:00,1900-01-01 01:30:00
3,1900-01-01 21:01:00,1900-01-01 06:35:00,1900-01-01 09:34:00,1900-01-01 01:33:00
4,1900-01-01 22:45:00,1900-01-01 07:29:00,1900-01-01 08:44:00,1900-01-01 03:14:00
5,1900-01-01 21:31:00,1900-01-01 07:36:00,1900-01-01 10:05:00,1900-01-01 03:52:00
6,1900-01-01 21:54:00,1900-01-01 06:12:00,1900-01-01 08:18:00,1900-01-01 03:16:00
7,1900-01-01 22:11:00,1900-01-01 07:22:00,1900-01-01 09:11:00,1900-01-01 02:53:00
8,1900-01-01 23:05:00,1900-01-01 06:02:00,1900-01-01 06:57:00,1900-01-01 02:00:00
9,1900-01-01 22:14:00,1900-01-01 05:54:00,1900-01-01 09:25:00,1900-01-01 01:32:00


In [20]:
# Show a selection of the original columns of df_sleep as they were read from the file.
df_sleep[['ISO8601', 'bedtime', 'waketime', 'inBed', 'deepAvg7', 'efficiency', 'quality']]

Unnamed: 0,ISO8601,bedtime,waketime,inBed,deepAvg7,efficiency,quality
0,2022-06-01T19:59:59+10:00,2022-05-31 20:47:00,2022-06-01 06:06:00,09:19:00,01:37:04,96.8,07:12:16
1,2022-06-02T19:59:59+10:00,2022-06-01 22:04:00,2022-06-02 05:11:00,07:07:00,01:56:40,100.0,05:48:45
2,2022-06-03T19:59:59+10:00,2022-06-02 22:03:00,2022-06-03 05:36:00,07:33:00,01:45:14,100.0,06:22:24
3,2022-06-04T19:59:59+10:00,2022-06-03 21:01:00,2022-06-04 06:35:00,09:34:00,01:36:42,78.6,06:05:48
4,2022-06-05T19:59:59+10:00,2022-06-04 22:45:00,2022-06-05 07:29:00,08:44:00,01:48:55,100.0,07:10:28
5,2022-06-06T19:59:59+10:00,2022-06-05 21:31:00,2022-06-06 07:36:00,10:05:00,02:06:19,94.9,08:29:18
6,2022-06-07T19:59:59+10:00,2022-06-06 21:54:00,2022-06-07 06:12:00,08:18:00,02:17:09,100.0,06:47:58
7,2022-06-08T19:59:59+10:00,2022-06-07 22:11:00,2022-06-08 07:22:00,09:11:00,02:39:44,88.7,07:00:20
8,2022-06-09T19:59:59+10:00,2022-06-08 23:05:00,2022-06-09 06:02:00,06:57:00,02:37:22,90.4,05:21:09
9,2022-06-10T19:59:59+10:00,2022-06-09 22:14:00,2022-06-10 05:54:00,09:25:00,02:37:34,100.0,06:33:31
