In [525]:
import datetime
import os

import boto3
import numpy as np
import pandas as pd
from ics import Calendar, Event

In [6]:
def ts_to_dt(ts):
    """
    Convert a POSIX timestamp into a naive datetime in the local timezone.
    :param ts: seconds since the epoch (int or float)
    """
    from_timestamp = datetime.datetime.fromtimestamp
    return from_timestamp(ts)

In [176]:
def process_raw_data(file):
    """
    Read one exported CSV and tag every row with the file it came from.
    Adds two columns to the parsed data:
      - creation_date: last modification time of the file (local time)
      - filename: name of the file
    :param file: as exported by Auto Health Export; must be readable by
                 pd.read_csv and expose .stat() and .name
                 (e.g. an os.DirEntry or a pathlib.Path)
    """
    df = pd.read_csv(file, sep=',')

    # Use the modification time rather than the access time: st_atime can be
    # bumped simply by reading the file (which read_csv has just done), so it
    # does not reliably say which export is the most recent one.
    df['creation_date'] = ts_to_dt(file.stat().st_mtime)
    df['filename'] = file.name

    return df

In [189]:
def read_raw_files(str_path):
    """
    Read all Auto Health Export files in a directory and return a dataframe.
    Only entries whose name starts with 'HealthAutoExport' and ends with
    'Data.csv' are read. Returns an empty dataframe when nothing matches.
    :param str_path: directory path as type string
    """
    print('Reading files..')
    frames = []

    # scandir holds an open directory handle; the with-block releases it
    # as soon as the listing has been consumed.
    with os.scandir(str_path) as entries:
        for entry in entries:
            if entry.name.startswith('HealthAutoExport') and entry.name.endswith('Data.csv'):
                print(f'Reading: {entry.name}')
                frames.append(process_raw_data(entry))

    if not frames:
        # pd.concat raises on an empty list, so return the empty frame directly
        return pd.DataFrame()

    # one concat at the end avoids re-copying the accumulated frame per file;
    # ignore_index gives a clean 0..n-1 index instead of repeated per-file ones
    return pd.concat(frames, ignore_index=True)

### Transformations

Functions to cleanse the data:
- Deduplicate rows, keeping the most recent export for each date
- Round all numeric values to the closest integer, except for sleep and weight (kept to one decimal place)
- Create the following columns
  - `calories`
  - `sleep_eff`

In [491]:
def create_numeric_cols(df):
    """
    Add the derived numeric columns. The dataframe is modified in place and
    also returned.
      - calories: total calories from the macros (4 kcal per gram of carbs
        and protein, 9 kcal per gram of fat)
      - sleep_eff: time asleep as a percentage of time in bed, truncated to
        an integer; 0 when it cannot be computed
    :param df: dataframe with columns carbs, fat, protein, sleep_asleep
               and sleep_in_bed
    """
    df['calories'] = df['carbs'] * 4 + df['fat'] * 9 + df['protein'] * 4

    # Coerce to float first so missing values (None/NaN) become NaN and the
    # division never runs element-wise on Python objects.
    asleep = pd.to_numeric(df['sleep_asleep'], errors='coerce')
    in_bed = pd.to_numeric(df['sleep_in_bed'], errors='coerce')
    efficiency = asleep / in_bed * 100

    # A zero time-in-bed produces +/-inf, which cannot be cast to int;
    # treat it like missing data.
    efficiency = efficiency.replace([np.inf, -np.inf], np.nan)
    df['sleep_eff'] = efficiency.fillna(0).astype(int)

    return df

In [492]:
def round_df(df):
    """
    Round all float64 columns to the closest integer, except for the
    one d.p. columns (sleep_asleep, sleep_in_bed, weight), which are rounded
    to one decimal place. Finally replaces all NaN with None.
    Rounding follows pandas' Series.round (exact halves go to the even
    neighbour). Columns of the input dataframe are overwritten in place; the
    returned dataframe is the result of the final NaN replacement.
    :param df: dataframe to round
    """
    one_dp_cols = ['sleep_asleep', 'sleep_in_bed', 'weight']
    for col in df.columns:
        if df[col].dtype != 'float64':
            continue

        if col in one_dp_cols:
            df[col] = df[col].round(1)
            continue

        # round before casting: a bare astype(int) truncates (2.7 -> 2)
        rounded = df[col].round()
        if rounded.isna().any():
            # NaN cannot be stored in an int column, so keep the gaps as None
            # next to plain Python ints instead of letting astype(int) raise
            df[col] = np.array(
                [None if pd.isna(value) else int(value) for value in rounded],
                dtype=object,
            )
        else:
            df[col] = rounded.astype(int)

    return df.replace({np.nan: None})

In [493]:
def convert_column_types(df):
    """
    Coerce the 'date' and 'weight' columns to their working types.
    The dataframe is modified in place and also returned.
    :param df: dataframe with 'date' and 'weight' columns
    """
    # parse first, then drop the time-of-day part so only the date remains
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates.dt.date

    # weight must be float regardless of how it was read
    weight_as_float = df['weight'].astype(float)
    df['weight'] = weight_as_float

    return df

In [494]:
def rename_columns(df):
    """
    Give the exported columns short names, blank out whitespace-only cells
    and convert column types.
    Naming style is lowercase, without units, with spaces replaced by _.
    The rename is applied to the passed dataframe in place; the returned
    dataframe is a new object produced by the whitespace replacement.
    :param df: dataframe with the original export headers
    """
    # NOTE(review): the weight key ends with a trailing space - presumably
    # this matches the header in the export; confirm against a real file.
    short_names = {
        'Date': 'date',
        'Carbohydrates (g)': 'carbs',
        'Protein (g)': 'protein',
        'Total Fat (g)': 'fat',
        'Sleep Analysis [In Bed] (hr)': 'sleep_in_bed',
        'Sleep Analysis [Asleep] (hr)': 'sleep_asleep',
        'Step Count (count)': 'steps',
        'Weight & Body Mass (kg) ': 'weight'
    }
    df.rename(columns=short_names, inplace=True)

    # cells that contain nothing but whitespace count as missing
    without_blanks = df.replace(r'^\s+$', np.nan, regex=True)

    return convert_column_types(without_blanks)

In [495]:
def dedup_df(df):
    """
    Keep a single row per 'date': the one with the greatest 'creation_date'.
    Rows come back sorted by 'date' and keep their original index labels.
    :param df: dataframe with 'date' and 'creation_date' columns
    """
    sort_keys = ['date', 'creation_date']
    # after an ascending sort the newest row of each date is the last one
    return (
        df.sort_values(sort_keys, ascending=True)
          .drop_duplicates(subset='date', keep='last')
    )

In [506]:
def create_description_cols(df):
    """
    Build the text columns ('food', 'activity', 'sleep') used as event
    descriptions. The dataframe is modified in place and also returned.
    :param df: dataframe with carbs, protein, fat, calories, steps,
               sleep_asleep and sleep_eff columns
    """
    # string copy of every column so the pieces can simply be concatenated
    as_text = df.astype(str)

    macro_breakdown = (
        "(" + as_text['carbs'] + "C/"
        + as_text['protein'] + "P/"
        + as_text['fat'] + "F" + ")"
    )
    df['food'] = as_text['calories'] + " calories " + macro_breakdown

    df['activity'] = as_text['steps'] + " steps"

    sleep_summary = (
        as_text['sleep_asleep'] + " h" + " ("
        + as_text['sleep_eff'] + "% eff.)"
    )
    # rows without sleep data render as exactly this text; swap in a
    # friendlier message
    df['sleep'] = sleep_summary.replace('None h (0% eff.)', 'No sleep data.')

    return df

In [544]:
def generate_calendar(df, **kwargs):
    """
    Generates a CSV and ICS from the dataframe.
    Both files are written to `output_path` as `<file_name>.csv` and
    `<file_name>.ics`. Each (date, type) pair becomes one all-day event,
    where type is one of 'food', 'activity' or 'sleep'.
    :param df: cleansed dataframe from `create_description_cols`
    :param output_path: (keyword, required) directory to write the files to
    :param file_name: (keyword, required) file name without extension
    :return: the input dataframe, unchanged
    """
    output_path = kwargs['output_path']
    file_name = kwargs['file_name']

    # long format: one row per date and event type
    df_event = df[['date', 'food', 'activity', 'sleep']].melt(
        id_vars=['date'],
        value_vars=['food', 'activity', 'sleep'],
        var_name='type',
        value_name='description'
    )
    df_event.to_csv(os.path.join(output_path, file_name + '.csv'))

    c = Calendar()
    for _, row in df_event.iterrows():
        e = create_event(row['date'], row['type'], row['description'])
        c.events.add(e)

    # Write the ICS next to the CSV instead of into the current directory.
    # utf-8 is required for the emoji in the event names; newline='' keeps
    # the line endings of the serialized calendar untouched on every platform.
    ics_file = os.path.join(output_path, file_name + '.ics')
    with open(ics_file, 'w', encoding='utf-8', newline='') as f:
        f.write(str(c))

    return df

In [545]:
def transform(df, **kwargs):
    """
    Run the full cleansing pipeline and write the calendar files:
    rename columns, round, deduplicate, add numeric and description columns,
    then generate the CSV and ICS.
    An empty dataframe is returned unchanged and no files are written.
    :param df: dataframe from the read_raw_files function
    :param kwargs: passed on to `generate_calendar`
                   (output_path and file_name)
    :return: the transformed dataframe
    """
    if len(df) == 0:
        # nothing to transform; hand back the empty frame instead of None so
        # callers always receive a dataframe
        return df

    df = rename_columns(df)
    df = round_df(df)
    df = dedup_df(df)
    df = create_numeric_cols(df)
    df = create_description_cols(df)
    df = generate_calendar(df, **kwargs)
    return df

In [548]:
def etl_raw_data(**kwargs):
    """
    Perform ETL on Apple Health data: read the raw export files, transform
    them and write the calendar files.
    :param input_path: directory to read from as type string
                       (defaults to the current working directory)
    :param output_path: directory handed to the transform step as type string
                        (defaults to the current working directory)
    :param file_name: output file name without extension
                      (defaults to 'apple-health-calendar')
    """
    cwd = os.getcwd()
    source_dir = kwargs.get('input_path', cwd)
    output_options = {
        'output_path': kwargs.get('output_path', cwd),
        'file_name': kwargs.get('file_name', 'apple-health-calendar'),
    }

    raw = read_raw_files(source_dir)
    return transform(raw, **output_options)

In [551]:
def create_event(date, type, description):
    """
    Create an all day event for the given date and type.
    The event name is the description prefixed with an emoticon for the
    type. A type without a known emoticon gets the plain description as its
    name instead of failing.
    :param date: date as type datetime.date
    :param type: type of event as type string
                 ('sleep', 'activity' or 'food')
    :param description: description of event as type string
    """
    emoticons = {
        'sleep': "💤",
        'activity': "🔥",
        'food': "🥞",
    }
    emoticon = emoticons.get(type)
    if emoticon is None:
        # unknown type: previously the emoticon variable was left unbound here
        name = description
    else:
        name = emoticon + " " + description

    # begin and end share the same midnight timestamp; make_all_day() then
    # turns the event into an all-day entry
    all_day_date = str(date) + " 00:00:00"
    e = Event()
    e.name = name
    e.begin = all_day_date
    e.end = all_day_date
    e.make_all_day()

    return e