In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Simple Apple Health XML to CSV
==============================
:File: convert.py
:Description: Convert Apple Health "export.xml" file into a csv
:Version: 0.0.1
:Created: 2019-10-04
:Authors: Jason Meno (jam)
:Dependencies: An export.xml file from Apple Health
:License: BSD-2-Clause
"""

# %% Imports
import pandas as pd
import xml.etree.ElementTree as ET
import datetime as dt
import re
import sys


# %% Function Definitions

def pre_process(xml_string):
    """
    Clean up the raw text of an Apple Health "export.xml" before parsing.

    Two clean-up passes are applied, in this order:
        1. The DOCTYPE/DTD section is removed (see strip_dtd).
        2. Every vertical-tab character (U+000B) is removed
           (see strip_invisible_character).

    Progress is reported on stdout. The cleaned XML text is returned.
    """

    # flush=True pushes the message out before the (possibly slow) work starts
    print("Pre-processing...", end="", flush=True)

    cleaned = strip_invisible_character(strip_dtd(xml_string))
    print("done!")

    return cleaned


def strip_invisible_character(xml_string):
    """Return xml_string with every vertical-tab character (U+000B) removed."""
    # Mapping a code point to None makes str.translate delete it
    drop_vertical_tab = {0x0B: None}

    return xml_string.translate(drop_vertical_tab)


def strip_dtd(xml_string):
    """
    Remove the DOCTYPE declaration and its internal DTD subset.

    Everything from the first '<!DOCTYPE' up to and including the first ']>'
    that follows it is cut out. If the text has no '<!DOCTYPE', or no ']>'
    after it, the input is returned unchanged instead of raising.
    """
    doctype_open = '<!DOCTYPE'
    dtd_close = ']>'

    start_strip = xml_string.find(doctype_open)
    if start_strip == -1:
        return xml_string

    # Search from the DOCTYPE onwards so an earlier ']>' cannot be picked up
    close_at = xml_string.find(dtd_close, start_strip)
    if close_at == -1:
        return xml_string
    end_strip = close_at + len(dtd_close)

    return xml_string[:start_strip] + xml_string[end_strip:]


def xml_to_csv(xml_string):
    """
    Build a DataFrame from the direct children of the XML root element.

    Each direct child of the root becomes one row whose columns are the
    child's XML attributes. For every sub-element of that child that has
    exactly two attributes, the first attribute value is used as a column
    name and the second as the cell value.

    The long 'HKQuantityTypeIdentifier' / 'HKCategoryTypeIdentifier' prefixes
    are removed from the 'type' values, and 'HKCharacteristicTypeIdentifier'
    is removed from column names. A fixed set of columns is moved to the
    front; all other columns follow in the order they were first seen, so
    the layout is the same on every run. Rows are sorted by 'startDate',
    newest first.

    Progress is reported on stdout. Returns the resulting DataFrame.
    """

    print("Converting XML File to CSV...", end="")
    sys.stdout.flush()

    root = ET.fromstring(xml_string)

    attribute_list = []
    for child in root:
        # Work on a copy so the parsed element tree is left untouched
        row = dict(child.attrib)
        for metadata_entry in child:
            metadata_values = list(metadata_entry.attrib.values())
            if len(metadata_values) == 2:
                row[metadata_values[0]] = metadata_values[1]
        attribute_list.append(row)

    health_df = pd.DataFrame(attribute_list)

    # Every health data type and some columns have a long identifier.
    # Removing these for readability. The guard keeps an export without any
    # 'type' attribute from raising.
    if 'type' in health_df.columns:
        for prefix in ('HKQuantityTypeIdentifier', 'HKCategoryTypeIdentifier'):
            health_df['type'] = health_df['type'].str.replace(
                prefix, "", regex=False)
    health_df = health_df.rename(
        columns=lambda name: name.replace("HKCharacteristicTypeIdentifier", ""))

    # Reorder some of the columns for easier visual data review
    original_cols = list(health_df)
    shifted_cols = ['type',
                    'sourceName',
                    'value',
                    'unit',
                    'startDate',
                    'endDate',
                    'creationDate']

    # Add loop specific column ordering if metadata entries exist
    loop_cols = ['com.loopkit.InsulinKit.MetadataKeyProgrammedTempBasalRate',
                 'com.loopkit.InsulinKit.MetadataKeyScheduledBasalRate',
                 'com.loudnate.CarbKit.HKMetadataKey.AbsorptionTimeMinutes']
    shifted_cols.extend(col for col in loop_cols if col in original_cols)

    # Keep first-seen order for the rest; a set difference would shuffle the
    # columns differently from run to run
    already_placed = set(shifted_cols)
    remaining_cols = [col for col in original_cols
                      if col not in already_placed]
    reordered_cols = shifted_cols + remaining_cols
    health_df = health_df.reindex(columns=reordered_cols)

    # Sort by newest data first
    health_df = health_df.sort_values(by='startDate', ascending=False)

    print("done!")

    return health_df


def save_to_csv(health_df):
    """
    Write health_df to "apple_health_export_<YYYY-MM-DD>.csv" in the current
    working directory, using today's local date and omitting the index.

    Progress is reported on stdout. Returns None.
    """
    print("Saving CSV file...", end="", flush=True)

    date_stamp = dt.datetime.now().strftime('%Y-%m-%d')
    output_name = f"apple_health_export_{date_stamp}.csv"
    health_df.to_csv(output_name, index=False)

    print("done!")


def main():
    """
    Convert "export.xml" in the current working directory into a dated CSV.

    Reads the file, pre-processes the XML text, converts it to a DataFrame
    and saves that DataFrame as CSV. Returns None.
    """
    # The context manager closes the file handle once it has been read.
    # The encoding is given explicitly so the platform default is not used;
    # this file's own header declares utf-8.
    # NOTE(review): assumes export.xml is UTF-8 encoded; confirm.
    with open("export.xml", encoding="utf-8") as export_file:
        xml_string = export_file.read()

    xml_string = pre_process(xml_string)
    health_df = xml_to_csv(xml_string)
    save_to_csv(health_df)


# %% Script entry point
# Run the conversion only when this file is executed directly, not on import.
if __name__ == '__main__':
    main()

In [None]:
# Rebuild the date stamp here: the `today` inside save_to_csv() is local to
# that function and is not visible at this level.
today = dt.datetime.now().strftime('%Y-%m-%d')

# Load the CSV file written by save_to_csv()
data1 = pd.read_csv("apple_health_export_" + today + ".csv")

# Convert the timestamp columns to 'YYYY-MM-DDTHH:MM:SS' strings
for date_col in ('startDate', 'endDate', 'creationDate'):
    data1[date_col] = \
        pd.to_datetime(data1[date_col]).dt.strftime('%Y-%m-%dT%H:%M:%S')

# Save the converted data to a new CSV file
data1.to_csv("apple_health_export_" + today + "_modified.csv", index=False)

In [None]:
import matplotlib.pyplot as plt

# Rebuild the date stamp used in the file names so this cell does not depend
# on a `today` variable defined elsewhere.
today = dt.datetime.now().strftime('%Y-%m-%d')

# Load the CSV file with the converted timestamps
data2 = pd.read_csv("apple_health_export_" + today + "_modified.csv")

# Keep only the rows whose value can be parsed as a number, and store the
# parsed number so the column is numeric from here on
parsed_values = pd.to_numeric(data2['value'], errors='coerce')
is_numeric = parsed_values.notna()
numeric_data2 = data2[is_numeric].copy()
numeric_data2['value'] = parsed_values[is_numeric]

# Group by the type column and plot the distribution of value for each type
for type_value, group_data2 in numeric_data2.groupby('type'):
    if group_data2.empty:
        continue  # Skip groups that contain no data
    plt.figure()
    # Histogram of the values (adjust bins as needed)
    group_data2['value'].plot.hist(bins=10)
    plt.title(f"Distribution of Values - Type: {type_value}")
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# Group by the type column and plot the distribution of value for each type
for type_value, group_data2 in numeric_data2.groupby('type'):
    if group_data2.empty:
        continue  # Skip groups that contain no data
    plt.figure()
    # Convert to numbers in a separate Series instead of assigning back into
    # the group, which is a slice of numeric_data2
    group_values = pd.to_numeric(group_data2['value'])
    # Histogram of the values (adjust bins as needed)
    group_values.plot.hist(bins=10)
    plt.title(f"Distribution of Values - Type: {type_value}")
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.show()

In [None]:
import numpy as np

# Unique values of the type column. Rows without a type are dropped so that
# no empty "nan_train.csv" / "nan_test.csv" pair is written.
types = data2['type'].dropna().unique()

# groupby visits each type once instead of re-filtering data2 per type;
# sort=False keeps the same first-seen order as `types`, and rows with a
# missing type are skipped.
for type_value, type_data in data2.groupby('type', sort=False):
    # Keep only the needed columns; rename() returns a new frame, so the
    # assignment below does not write into a slice of data2
    new_data = type_data[['startDate', 'value']].rename(
        columns={'startDate': 'Timestamp'})

    # Convert value to numbers; anything that is not numeric becomes NaN
    new_data['value'] = pd.to_numeric(new_data['value'], errors='coerce')

    # Sort by timestamp
    new_data = new_data.sort_values('Timestamp')

    # Split into training data (first 90%) and test data (last 10%)
    split_index = int(len(new_data) * 0.9)
    train_data = new_data.iloc[:split_index]
    test_data = new_data.iloc[split_index:]

    # Build the file names
    train_file = f"{type_value}_train.csv"
    test_file = f"{type_value}_test.csv"

    # Save as CSV files
    train_data.to_csv(train_file, index=False)
    test_data.to_csv(test_file, index=False)