In [158]:
import os 
import sys
import pandas as pd
import numpy as np
print(sys.executable)
from datetime import datetime


/Users/aktan/opt/anaconda3/envs/env-analythics/bin/python


In [159]:
# Expected layout, relative to the working directory:
#   <cwd>/data/apple_health_export/export.xml
data_path = os.path.join(os.getcwd(), "data")
apple_data_path = os.path.join(data_path, "apple_health_export")
base_xml_file = os.path.join(data_path, "apple_health_export", "export.xml")

### Extract data from xml into a proper df

In [160]:
from xml.etree import ElementTree as ET

# Parse the whole export file into an in-memory element tree.
tree = ET.parse(base_xml_file)

In [161]:
# Collect the attribute dict of every `Record` element; each dict becomes one row.
root = tree.getroot()
record_list = [record.attrib for record in root.iter('Record')]
record_data = pd.DataFrame(record_list)
print("{0} of rows".format(record_data.shape[0]))

1011790 of rows


### Making this df beautiful

* Format type column 

In [162]:
# Classify every record from its raw HealthKit type identifier *before* the
# identifier prefix is stripped below. "Category" is applied last so it wins if
# both substrings are ever present, matching the original evaluation order.
# Vectorized string methods replace four row-wise Python passes over the frame.
# NOTE: this cell is not idempotent - re-running it after the prefixes have been
# stripped resets every `type_type` to None.
record_data["type_type"] = None
record_data.loc[
    record_data["type"].str.contains("Quantity", regex=False, na=False), "type_type"
] = "continious"
record_data.loc[
    record_data["type"].str.contains("Category", regex=False, na=False), "type_type"
] = "categorical"

# Strip the identifier prefixes so `type` holds only the short measurement name.
record_data["type"] = (
    record_data["type"]
    .str.replace("HKQuantityTypeIdentifier", "", regex=False)
    .str.replace("HKCategoryTypeIdentifier", "", regex=False)
)

* Format device column

In [163]:
def format_device_info(device_info):
    """
    Parse a raw ``device`` attribute string into a flat dict of ``device_*`` fields.

    The string is treated as comma-separated ``key:value`` pairs once the
    ``<<`` and ``>`` wrapper characters are removed. Two parsing problems of a
    plain split are handled:

    * a piece without a colon is a continuation of the previous value, so values
      that themselves contain a comma (e.g. a hardware identifier such as
      ``iPhone10,4``) are kept whole instead of being truncated;
    * only the first colon separates key from value, so a value containing a
      colon is kept instead of the whole field being dropped.

    Keys and values are stripped of surrounding whitespace.

    :param device_info: raw device string, or a missing value (None / NaN)
    :return: dict mapping ``"device_<key>"`` to its string value; for a missing
             input, a dict with the six known keys all set to None
    """
    if pd.isna(device_info):
        return {"device_HKDevice": None, "device_name": None,
                "device_manufacturer": None, "device_model": None,
                "device_hardware": None, "device_software": None}

    cleaned = device_info.replace("<<", "").replace(">", "")
    parsed = {}
    current_key = None
    for piece in cleaned.split(","):
        key, separator, value = piece.partition(":")
        if separator:
            current_key = "device_" + key.strip()
            parsed[current_key] = value
        elif current_key is not None:
            # No colon: this piece belongs to the previous value, which contained a comma.
            parsed[current_key] += "," + piece
    return {key: value.strip() for key, value in parsed.items()}
# Expand the raw device strings into one column per device field, then swap the
# original `device` column for those parsed columns.
device_meta = pd.DataFrame(list(map(format_device_info, record_data["device"])))
if len(record_data) != len(device_meta):
    print("Lenghths missmatch check the ambiguity, something went wrong device string")
apple_health_df = (
    pd.concat([record_data, device_meta], axis="columns")
    .drop(columns=["device"])
)

In [164]:
# Inspect which device names occur (including missing ones) before filling gaps.
pd.unique(apple_health_df["device_name"])

array(['iPhone', None, 'Apple Watch', 'EarPods', 'AirPods'], dtype=object)

In [165]:
# Records without a device name are attributed to the Apple Watch.
# NOTE(review): this is an assumption about the export, not something the data states - confirm.
apple_health_df["device_name"] = apple_health_df["device_name"].fillna("Apple Watch")

* Filter out na valued rows

In [166]:
# Snapshot of the measurement types present before rows with a missing value are dropped.
before_measurements = pd.unique(apple_health_df["type"])

In [167]:
# Keep only the records that actually carry a value.
apple_health_df = apple_health_df[apple_health_df["value"].notna()]

In [168]:
# Measurement types that still have at least one record after the drop.
after_measurements = pd.unique(apple_health_df["type"])

In [169]:
# Measurement types that disappeared entirely because none of their records had a value.
set(before_measurements) - set(after_measurements)

{'AbdominalCramps',
 'AppetiteChanges',
 'Bloating',
 'Fatigue',
 'HandwashingEvent',
 'HotFlashes',
 'Lactation',
 'LowerBackPain',
 'MemoryLapse',
 'MindfulSession',
 'MoodChanges',
 'PelvicPain'}

In [170]:
# Row count after dropping records with a missing value.
print("{0} of rows left removing null values".format(apple_health_df.shape[0]))

1010901 of rows left removing null values


* Taking a look at the measurement types and their record counts so we can set a minimum-records threshold.

In [172]:
# Number of non-null values per measurement type, largest first.
(
    apple_health_df
    .groupby(["type"])["value"]
    .count()
    .sort_values(ascending=False)
)

type
ActiveEnergyBurned                355400
BasalEnergyBurned                 180106
HeartRate                         143818
DistanceWalkingRunning            127215
StepCount                          88419
AppleExerciseTime                  16205
AppleStandTime                     14607
EnvironmentalAudioExposure         10506
RespiratoryRate                     9022
FlightsClimbed                      8352
BodyMass                            8274
BodyMassIndex                       8270
WalkingSpeed                        5715
WalkingStepLength                   5713
AppleStandHour                      5590
WalkingDoubleSupportPercentage      5025
OxygenSaturation                    3602
HeadphoneAudioExposure              3527
SleepAnalysis                       3145
HeartRateVariabilitySDNN            2265
WalkingAsymmetryPercentage          2208
StairDescentSpeed                   1568
StairAscentSpeed                    1110
RestingHeartRate                     302
WalkingHear

In [173]:
# Per-type record counts as a two-column frame (`type`, `value` = count).
data_catalogue = (
    apple_health_df
    .groupby(["type"])["value"]
    .count()
    .sort_values(ascending=False)
    .reset_index()
)
# Measurement types with at least 35 records; rarer types are filtered out later.
columns = data_catalogue.loc[data_catalogue["value"] >= 35, "type"].values

In [174]:
# (rows, columns) before filtering out the rarely recorded measurement types.
apple_health_df.shape

(1010901, 16)

In [175]:
# Keep only the measurement types listed in `columns` (those that passed the
# record-count threshold). `isin` does the membership test in one vectorized call
# instead of scanning `columns` once per row; `.copy()` detaches the result so the
# column assignments in later cells operate on an independent frame.
apple_health_df = apple_health_df[apple_health_df["type"].isin(columns)].copy()
apple_health_df.shape

(1010833, 16)

* Dealing with the time columns 

In [176]:
def get_time_difference(time_1, time_2, in_terms_of="h"):
    """
    Return the number of whole hours or days elapsed from time_2 to time_1.

    :param time_1: str in such a format 2021-10-25 10:21:39 +0300
    :param time_2: str in such a format 2021-10-25 10:21:39 +0300
    :param in_terms_of: "h" for hours "d" for days
    :return: int, the floor of (time_1 - time_2) in the requested unit;
             negative when time_1 is earlier than time_2
    :raises ValueError: if a timestamp does not match the expected format or
                        in_terms_of is neither "h" nor "d"
    """
    seconds_per_unit = {"h": 60 * 60, "d": 60 * 60 * 24}
    if in_terms_of not in seconds_per_unit:
        raise ValueError(
            "in_terms_of must be 'h' or 'd', got {0!r}".format(in_terms_of)
        )

    dt_time_1 = datetime.strptime(time_1, "%Y-%m-%d %H:%M:%S %z")
    dt_time_2 = datetime.strptime(time_2, "%Y-%m-%d %H:%M:%S %z")

    # total_seconds() covers the full span including whole days; timedelta.seconds
    # only holds the sub-day remainder, which made hours wrap at 24 and days always 0.
    elapsed_seconds = (dt_time_1 - dt_time_2).total_seconds()
    return int(elapsed_seconds // seconds_per_unit[in_terms_of])

def get_date(time_1, return_type="str"):
    """
    Extract the calendar date from a timestamp string.

    The date is taken as written in the timestamp, i.e. in the timestamp's own
    UTC offset; no timezone conversion is applied.

    :param time_1: str in such a format 2021-10-25 10:21:39 +0300
    :param return_type: "str" for a "YYYY-MM-DD" string; any other value
                        returns a ``datetime.date`` object
    :return: the date as a str or as a ``datetime.date``
    :raises ValueError: if time_1 does not match the expected format
    """
    parsed_date = datetime.strptime(time_1, "%Y-%m-%d %H:%M:%S %z").date()
    if return_type == "str":
        return parsed_date.strftime("%Y-%m-%d")
    # This branch used to evaluate the date without returning it, so callers got None.
    return parsed_date

### Format dates 

In [177]:
# Derive day-level "YYYY-MM-DD" columns from the raw start/end timestamps.
for raw_col, date_col in (("startDate", "start_date"), ("endDate", "end_date")):
    apple_health_df[date_col] = apple_health_df[raw_col].map(get_date)

### Make value column float 

In [178]:
def _as_float_if_continuous(row):
    """Return the row's value as a float for continuous records, unchanged otherwise."""
    if row["type_type"] == "continious":
        return float(row["value"])
    return row["value"]


# Categorical records keep their string value; only continuous ones are converted.
apple_health_df["value"] = apple_health_df.apply(_as_float_if_continuous, axis=1)

### Group by and concat back 

In [179]:
# One row per (day, measurement type) for the continuous records: values are summed,
# while unit / source / device are collected into sets of the distinct entries seen.
# NOTE(review): a daily sum may not be meaningful for every continuous type
# (e.g. rate- or mass-like measurements) - confirm the intended aggregate.
apple_health_df_agg_cont = (
    apple_health_df
    .loc[lambda frame: frame["type_type"] == "continious"]
    .groupby(["end_date", "type", "type_type"])
    .agg({"value": "sum", "unit": set, "sourceName": set, "device_name": set})
    .reset_index()
)
print(len(apple_health_df_agg_cont))

13417


In [180]:
# One row per (day, measurement type) for the categorical records: the raw values
# are kept as a list, while unit / source / device are collected into sets.
apple_health_df_agg_cat = (
    apple_health_df
    .loc[lambda frame: frame["type_type"] == "categorical"]
    .groupby(["end_date", "type", "type_type"])
    .agg({"value": list, "unit": set, "sourceName": set, "device_name": set})
    .reset_index()
)
print(len(apple_health_df_agg_cat))


723


* Format categorical columns 

In [181]:
def get_freq(a_series):
    """
    Count how often the tracked category value occurs in a row's list of values.

    :param a_series: row-like object with a "type" entry and a "value" entry
                     holding the list of category values
    :return: occurrences of "HKCategoryValueSleepAnalysisInBed" for SleepAnalysis rows,
             occurrences of "HKCategoryValueAppleStandHourStood" for AppleStandHour rows
             (0 when the value is absent), and None for any other type
    """
    labels, counts = np.unique(a_series["value"], return_counts=True)
    occurrences = dict(zip(labels, counts))

    tracked_value_by_type = {
        "SleepAnalysis": "HKCategoryValueSleepAnalysisInBed",
        "AppleStandHour": "HKCategoryValueAppleStandHourStood",
    }
    tracked_value = tracked_value_by_type.get(a_series["type"])
    if tracked_value is None:
        # Types without a tracked value yield None.
        return None
    return occurrences.get(tracked_value, 0)
        

# Collapse each day's list of category values into a single count (see get_freq).
apple_health_df_agg_cat["value"] = apple_health_df_agg_cat.apply(get_freq, axis=1)

In [182]:
# Stack the continuous and categorical daily aggregates into one frame.
# NOTE(review): reset_index() without drop=True keeps each part's old row number
# as an extra "index" column - confirm whether that column is wanted downstream.
apple_health_final_df = pd.concat(
    [apple_health_df_agg_cont, apple_health_df_agg_cat],
    axis=0,
).reset_index(drop=False)
len(apple_health_final_df)

14140

### Order by date 

In [183]:
# `end_date` holds "YYYY-MM-DD" strings, so this string sort is also chronological.
apple_health_final_df = apple_health_final_df.sort_values("end_date", ascending=True)

### Put into a csv

In [184]:
# Write the final daily table to the working directory; the row index is written
# as the first, unnamed CSV column.
apple_health_final_df.to_csv("apple_health_data_ece.csv", index=True)

### References 