In [1]:
import logging 
import pandas as pd
import boto3
import json 

In [2]:
def get_s3_keys(bucket):
    """Get a list of keys in an S3 bucket."""
    keys = []
    personal = boto3.Session(profile_name='personal')
    s3 = personal.client('s3')
    resp = s3.list_objects(Bucket=bucket)
    print(resp)
    for obj in resp['Contents']:
        keys.append(obj['Key'])
    print(keys)
    return keys

In [3]:
get_s3_keys(bucket='ntonthat-apple-health-data')

{'ResponseMetadata': {'RequestId': 'KXNYY54PHZR7N8MK', 'HostId': 'kKOa74SJgTfLEcL9yI1SXD5Aj5opOL9DtQKiHcNrAuJHNuqnMXRBfyPuR2NQvjRbyX7zrkjgKu/aCTz2pjN3tg==', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'kKOa74SJgTfLEcL9yI1SXD5Aj5opOL9DtQKiHcNrAuJHNuqnMXRBfyPuR2NQvjRbyX7zrkjgKu/aCTz2pjN3tg==', 'x-amz-request-id': 'KXNYY54PHZR7N8MK', 'date': 'Thu, 26 Jan 2023 02:43:19 GMT', 'x-amz-bucket-region': 'ap-southeast-2', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 1}, 'IsTruncated': False, 'Marker': '', 'Contents': [{'Key': 'syncs/2023-01-18T09:39:59.479199.json', 'LastModified': datetime.datetime(2023, 1, 18, 9, 40, tzinfo=tzutc()), 'ETag': '"0d0f1154d6f64884df537e9b8a766cea"', 'Size': 73299, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'n.nam.tonthat', 'ID': '549df8ab0220b17c8be5925e3d728862c3523e77344f2b4e32078a6e2c4477ee'}}, {'Key': 'workouts/2023-01-18T09:39:59.828797.json', 'LastModified': datetime.datetime(202

['syncs/2023-01-18T09:39:59.479199.json',
 'workouts/2023-01-18T09:39:59.828797.json']

In [6]:
def load_json_from_s3(bucket, key):
    personal = boto3.Session(profile_name='personal')
    s3 = personal.resource('s3')
    
    obj = s3.Object(bucket, key)
    body = obj.get()['Body']
    body = body.read().decode('utf-8')
    return body

In [9]:
def create_parquets(event): 
    # s3 = boto3.resource('s3')
    personal = boto3.Session(profile_name='personal')
    s3 = personal.resource('s3')
    
    logging.info('Reading s3 event trigger')
    event = json.loads(open(f'{event}', 'r').read())
    
    key = event.get("Records")[0].get('s3').get('object').get('key')
    bucket = event.get("Records")[0].get('s3').get('bucket').get('name')
    # get_s3_keys(bucket)
    # convert contents to native python string
    json_data = load_json_from_s3(bucket, key)
    
    source_data = []
    
    for line in json_data.splitlines(): 
        source_data.append(json.loads(line))
        
    logging.info('Converting to dataframe')
    df = pd.DataFrame.from_records(source_data)
    
    # force conversion types
    df['qty'] = df['qty'].astype(str)
    
    logging.info('Converting to parquet')
    
    # write to parquet
    df.to_parquet('ahc.parquet')
    
    s3.meta.client.upload_file('ahc.parquet', 'ntonthat-apple-health-data', 'parquets/ahc.parquet')
    return 

In [10]:
create_parquets('flask/healthlake/s3-trigger-event.json')

In [67]:
import json 
import os
from dataclasses import dataclass
from datetime import datetime
from typing import Optional

In [74]:
event_trigger_path = 'flask/healthlake/s3-trigger-event.json'

In [75]:
event_trigger = json.load(open(event_trigger_path, 'r'))

In [64]:
@dataclass
class AppleHealthData:
    """Dataclass for Apple Health Data"""
    date: str
    date_updated: str
    name: str
    qty: float
    units: str
    source: Optional[str] = None
    
    def __post_init__(self): 
        self.date = datetime.strptime(self.date, '%Y-%m-%d %H:%M:%S %z')
        self.date_updated = datetime.strptime(self.date_updated, '%Y-%m-%d %H:%M:%S.%f')

In [38]:
json_path = 'flask/healthlake/2023-01-18T09:39:59.479199.json'

In [62]:
source_data = []
with open(json_path, 'r') as file: 
    for line in file: 
        source_data.append(json.loads(line))
        
pd.DataFrame.from_records(source_data)

In [66]:
for item in source_data: 
    AppleHealthData(**item)

In [95]:
df = pd.DataFrame.from_records(source_data)

In [101]:
df['qty'] = df['qty'].astype(str)

In [102]:
df.to_parquet('ahc.parquet')