In [1]:
import pandas as pd

# Define the base path for your datasets
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# export folder (keep the trailing slash) before running elsewhere.
base_path = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/apple_health/health_data_exported/'

# Every export file name ends with this stamp; only the leading identifier differs.
# NOTE(review): the '2024-04-99' date stamp is copied verbatim from the original
# file names - confirm it matches the files on disk.
export_suffix = '_2024-04-99_18-30-56_SimpleHealthExportCSV.csv'


def load_health_export(identifier):
    """Read one comma-separated export file from ``base_path`` into a DataFrame.

    The first line is skipped only when it is a ``sep=...`` delimiter hint, so a
    file that starts directly with its header row keeps that header instead of
    losing it to an unconditional ``skiprows=1``.

    Parameters
    ----------
    identifier : str
        Leading part of the file name, e.g. ``'HKQuantityTypeIdentifierBodyMass'``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.

    Raises
    ------
    FileNotFoundError
        If the file does not exist under ``base_path``.
    """
    path = base_path + identifier + export_suffix
    # 'utf-8-sig' drops a leading byte-order mark so it cannot hide the hint.
    with open(path, encoding='utf-8-sig') as handle:
        first_line = handle.readline()
    has_sep_hint = first_line.strip().strip('"').lower().startswith('sep=')
    return pd.read_csv(path, sep=',', skiprows=1 if has_sep_hint else 0)


# Load each dataset, skipping the first row only if it contains 'sep=...'
bmi_data = load_health_export('HKQuantityTypeIdentifierBodyMassIndex')
body_fat_data = load_health_export('HKQuantityTypeIdentifierBodyFatPercentage')
body_mass_data = load_health_export('HKQuantityTypeIdentifierBodyMass')
lean_body_mass_data = load_health_export('HKQuantityTypeIdentifierLeanBodyMass')

# Print the first few rows of each dataset to confirm they're loaded correctly.
# The label goes on its own line so it does not push the frame's header row
# out of alignment with the columns below it.
for preview_label, preview_frame in (
    ("BMI Data", bmi_data),
    ("Body Fat Data", body_fat_data),
    ("Body Mass Data", body_mass_data),
    ("Lean Body Mass Data", lean_body_mass_data),
):
    print(f"{preview_label}:\n{preview_frame.head()}\n")

# Now find out the correct date column name from one of the dataframes
print("BMI Data Columns:", bmi_data.columns)


BMI Data:                                     type sourceName  sourceVersion  \
0  HKQuantityTypeIdentifierBodyMassIndex     Renpho              1   
1  HKQuantityTypeIdentifierBodyMassIndex     Renpho              1   
2  HKQuantityTypeIdentifierBodyMassIndex     Renpho              2   
3  HKQuantityTypeIdentifierBodyMassIndex     Renpho              2   
4  HKQuantityTypeIdentifierBodyMassIndex     Renpho              2   

  productType  device                  startDate                    endDate  \
0  iPhone11,8     NaN  2022-08-17 01:04:50 +0000  2022-08-17 01:04:50 +0000   
1  iPhone11,8     NaN  2022-08-17 01:06:10 +0000  2022-08-17 01:06:10 +0000   
2  iPhone11,8     NaN  2022-08-29 22:32:46 +0000  2022-08-29 22:32:46 +0000   
3  iPhone11,8     NaN  2022-09-14 22:34:17 +0000  2022-09-14 22:34:17 +0000   
4  iPhone11,8     NaN  2022-09-16 02:01:59 +0000  2022-09-16 02:01:59 +0000   

    unit      value  
0  count  24.900000  
1  count  24.900000  
2  count  24.700001  
3  cou

In [2]:
# Parse the start/end timestamp columns of every dataset into pandas datetimes.
# Frames and columns are visited in the same order as the original statements.
for health_frame in (bmi_data, body_fat_data, body_mass_data, lean_body_mass_data):
    for date_column in ('startDate', 'endDate'):
        health_frame[date_column] = pd.to_datetime(health_frame[date_column])

# Function to print basic EDA
def basic_eda(data, name):
    """Print a three-part overview of ``data`` under the label ``name``.

    The sections, in order, are the result of ``data.head()``, ``data.describe()``
    and ``data.isnull().sum()``. Each section is preceded by a blank line and a
    ``"<name> - <section>:"`` heading. Nothing is returned.
    """
    # Each entry pairs a heading with a deferred computation, so every section
    # is computed only after its heading has been printed.
    sections = (
        (f"\n{name} - First 5 rows:", lambda: data.head()),
        (f"\n{name} - Summary Statistics:", lambda: data.describe()),
        (f"\n{name} - Missing Values:", lambda: data.isnull().sum()),
    )
    for heading, compute in sections:
        print(heading)
        print(compute())

# Run the EDA report once per dataset, pairing each frame with its display label
for eda_frame, eda_label in (
    (bmi_data, "BMI Data"),
    (body_fat_data, "Body Fat Data"),
    (body_mass_data, "Body Mass Data"),
    (lean_body_mass_data, "Lean Body Mass Data"),
):
    basic_eda(eda_frame, eda_label)


BMI Data - First 5 rows:
                                    type sourceName  sourceVersion  \
0  HKQuantityTypeIdentifierBodyMassIndex     Renpho              1   
1  HKQuantityTypeIdentifierBodyMassIndex     Renpho              1   
2  HKQuantityTypeIdentifierBodyMassIndex     Renpho              2   
3  HKQuantityTypeIdentifierBodyMassIndex     Renpho              2   
4  HKQuantityTypeIdentifierBodyMassIndex     Renpho              2   

  productType  device                 startDate                   endDate  \
0  iPhone11,8     NaN 2022-08-17 01:04:50+00:00 2022-08-17 01:04:50+00:00   
1  iPhone11,8     NaN 2022-08-17 01:06:10+00:00 2022-08-17 01:06:10+00:00   
2  iPhone11,8     NaN 2022-08-29 22:32:46+00:00 2022-08-29 22:32:46+00:00   
3  iPhone11,8     NaN 2022-09-14 22:34:17+00:00 2022-09-14 22:34:17+00:00   
4  iPhone11,8     NaN 2022-09-16 02:01:59+00:00 2022-09-16 02:01:59+00:00   

    unit      value  
0  count  24.900000  
1  count  24.900000  
2  count  24.700001  
3 