In [39]:
import os, sys, time, random
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
from matplotlib import pyplot as plt
import glob
import re
print(os.getcwd())
import zipfile
import gzip

/Users/jan/git/nudgebg/egvinsulin/notebooks


In [27]:
def read_all_zips(data_dir):
    """Return the paths of all purely numerically named zip archives in a directory.

    Args:
        data_dir: Directory to search (str or path-like). Only entries directly
            inside the directory are considered.

    Returns:
        list[str]: Paths whose base name is ``<digits>.zip`` (for example
        ``66836068.zip``), in the order reported by ``glob.glob``.
    """
    # Escape the directory part so that characters such as "[" or "*" in the
    # directory name are matched literally instead of acting as glob wildcards.
    file_paths = glob.glob(glob.escape(str(data_dir)) + "/*.zip")
    # Raw string: "\d" and "\." are regex escapes, not Python string escapes.
    numeric_zip_name = re.compile(r"^\d+\.zip$")
    return [path for path in file_paths if numeric_zip_name.search(os.path.basename(path))]

def getBasePath(archive, origin_id):
    """Return the ``direct-sharing-<origin_id>/`` folder entry of an archive, if present.

    Two layouts are recognised: the folder sits at the archive root, or it is
    nested below a top-level folder named like the archive file itself
    (``<patient_id>.zip`` containing ``<patient_id>/direct-sharing-<origin_id>/``).
    The nested form is tried when the first archive member starts with the
    patient id.

    Args:
        archive (zipfile.ZipFile): An opened zip archive.
        origin_id: Identifier inserted into the ``direct-sharing-<origin_id>/`` name.

    Returns:
        str | None: The folder path (with trailing slash) if that exact name is
        listed among the archive members, otherwise ``None``. An empty archive
        yields ``None``.
    """
    members = archive.namelist()
    if not members:
        # Nothing to inspect; the original code raised IndexError here.
        return None

    base_path = f"direct-sharing-{origin_id}/"
    # ``filename`` is None when the archive was opened from a nameless file object.
    patient_id = os.path.basename(archive.filename or "").split('.')[0]
    if patient_id and patient_id == members[0].split('/')[0]:
        base_path = f"{patient_id}/{base_path}"
    return base_path if base_path in members else None

def getGZs(archive, dataType='treatments', origin_id=31):
    """List the ``.gz`` members of one data type directly inside the base folder.

    Args:
        archive (zipfile.ZipFile): An opened zip archive.
        dataType (str): Prefix of the member file name, e.g. ``'treatments'``.
            It is interpolated into a regular expression unchanged.
        origin_id: Identifier passed to ``getBasePath`` to locate the base
            folder. Defaults to 31, the value this function used before the
            parameter existed.

    Returns:
        list[str] | None: Member paths of the form
        ``<base_path><dataType>*.gz`` (no further sub-folders), or ``None`` when
        the base folder is missing or no member matches.
    """
    base_path = getBasePath(archive, origin_id)
    if base_path is None:
        # Without a base folder there is nothing to match; previously the text
        # "None" was silently interpolated into the pattern.
        return None

    # The base path is literal text, so escape it; raw string keeps "\." a regex escape.
    pattern = re.compile(rf"^{re.escape(base_path)}({dataType})([^/]*)\.gz$")
    files = [path for path in archive.namelist() if pattern.search(path)]
    return files or None

def getNightscoutDataGZipPaths(archive):
    """Collect the ``.gz`` member paths of the four Nightscout collections.

    Args:
        archive (zipfile.ZipFile): An opened zip archive.

    Returns:
        dict: Maps ``'treatments'``, ``'entries'``, ``'profile'`` and
        ``'devicestatus'`` to whatever ``getGZs`` returns for that name.
    """
    collection_names = ('treatments', 'entries', 'profile', 'devicestatus')
    return {name: getGZs(archive, name) for name in collection_names}
def has_none_values(data, keys):
    """Check whether at least one of the given keys has no value in a dictionary.

    Args:
        data (dict): The dictionary to check.
        keys (list): The keys to look up.

    Returns:
        bool: True if ANY of the keys maps to ``None`` or is absent from
        ``data``; False if every key has a non-None value (also False when
        ``keys`` is empty).
    """
    # dict.get returns None for absent keys, so missing keys count as None.
    return any(data.get(key) is None for key in keys)

from functools import reduce
import operator
import gzip
import json

def read_gz_json_to_dict(file_handle):
    """Read a gzip-compressed JSON document and return the parsed data.

    Args:
        file_handle: A path or a binary file object of a ``.gz`` file, i.e.
            anything ``gzip.open`` accepts.

    Returns:
        The decoded JSON value. Despite the function name this is a dict or a
        list, depending on the top-level element of the document.

    Raises:
        FileNotFoundError: If ``file_handle`` is a path that does not exist.
        json.JSONDecodeError: If the decompressed text is not valid JSON. The
            message names the source and the original error is chained.
    """
    # Decompress fully, then decode; the gzip wrapper is closed on exit.
    with gzip.open(file_handle, "rb") as gz_file:
        text = gz_file.read().decode("utf-8")

    try:
        return json.loads(text)
    except json.JSONDecodeError as e:
        # File objects usually expose ``name``; fall back to the argument itself
        # (e.g. when a path string was passed).
        source = getattr(file_handle, "name", file_handle)
        # JSONDecodeError requires (msg, doc, pos); reuse those of the original error.
        raise json.JSONDecodeError(f"Error decoding JSON from {source}: {e.msg}", e.doc, e.pos) from e

def convertToKeyPath(array):
    """Join key names into a dot-separated key path.

    Args:
        array: Iterable of strings, e.g. ``['store', 'Default', 'basal']``.

    Returns:
        str: The keys joined by ``'.'`` (``'store.Default.basal'``). An empty
        iterable yields ``''``.
    """
    # NOTE(review): a key that itself contains "." cannot be told apart from a
    # separator in the resulting path — confirm such keys cannot occur.
    return '.'.join(array)

# Load all zip paths, get base paths 

In [271]:
# Locate the raw archives relative to the notebook's working directory.
data_path = os.path.join(os.getcwd(),'..','data','raw','open human')
zip_paths = read_all_zips(data_path)
df = pd.DataFrame({'archive':[os.path.basename(path) for path in zip_paths],
                   'zip_paths':zip_paths})

def _base_path_in_zip(zip_path, origin_id):
    """Open ``zip_path`` only long enough to look up its base path for ``origin_id``."""
    # The context manager closes the archive again; previously every lookup
    # left an open file handle behind.
    with zipfile.ZipFile(zip_path) as archive:
        return getBasePath(archive, origin_id)

# Origin id 396 fills the openAPS column, origin id 31 the nightscout column.
df["openAPS"] = df.zip_paths.apply(lambda x: _base_path_in_zip(x, 396))
df["nightscout"] = df.zip_paths.apply(lambda x: _base_path_in_zip(x, 31))
display(df.head(10))

# Keep separate copies of the archives that contain each kind of data.
df_ns = df.loc[~df.nightscout.isna()].copy()
df_aps = df.loc[~df.openAPS.isna()].copy()

display(df_ns.head(3))
display(df_aps.head(3))

Unnamed: 0,archive,zip_paths,openAPS,nightscout
0,66836068.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,,direct-sharing-31/
1,85078542.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,direct-sharing-396/,
2,51622121.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,,
3,20777653.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,direct-sharing-396/,
4,27553507.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,direct-sharing-396/,
5,24587372.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,,direct-sharing-31/
6,28756888.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,,direct-sharing-31/
7,86298207.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,direct-sharing-396/,
8,97099623.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,direct-sharing-396/,
9,07613176.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,07613176/direct-sharing-396/,07613176/direct-sharing-31/


Unnamed: 0,archive,zip_paths,openAPS,nightscout
0,66836068.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,,direct-sharing-31/
5,24587372.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,,direct-sharing-31/
6,28756888.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,,direct-sharing-31/


Unnamed: 0,archive,zip_paths,openAPS,nightscout
1,85078542.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,direct-sharing-396/,
3,20777653.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,direct-sharing-396/,
4,27553507.zip,/Users/jan/git/nudgebg/egvinsulin/notebooks/.....,direct-sharing-396/,


## Check if .gz archives exist for all nightscout collections 

In [272]:
Collections = ['treatments','entries','profile','devicestatus']

def _gz_paths_in_zip(zip_path):
    """Open ``zip_path`` only long enough to collect its Nightscout .gz member paths."""
    # Close each archive after inspection instead of leaking one handle per row.
    with zipfile.ZipFile(zip_path) as archive:
        return getNightscoutDataGZipPaths(archive)

df_ns['gzPaths'] = [_gz_paths_in_zip(zp) for zp in df_ns.zip_paths]
# An archive counts as complete only if every collection has at least one .gz file.
df_ns['allGZExist'] = [not has_none_values(d, Collections) for d in df_ns.gzPaths]
print(f"{(~df_ns.allGZExist).sum()} out of {df_ns.shape[0]} archives miss .gz archives for at least one one nightscout collection")


1 out of 33 archives miss .gz archives for at least one one nightscout collection


## Check for Loop and OpenAPS data

We know, for example, that `99296581_devicestatus.json` contains both loop and openaps data:

```{[...],"device": "91SCH7",[...], "timestamp": "2017-04-29T03:43:13.000Z"}, [...], "openaps": {"enacted": {"IOB": 2.795, [...]```
Shortly after, the next entry contains loop data:
```{"device": "CA29IY", "loop": {"version": "1.3.2dev", "iob": [...], "created_at": "2017-04-29T03:45:24Z"}```
The different device id suggests that the user might have used different devices simultaneously 

In [304]:
i_user = 0
zip_path = df_ns.zip_paths.iloc[i_user]
print(df_ns.archive.iloc[i_user])
# The archive stays open on purpose: a later cell reads the profile from `ziparchive`.
ziparchive= zipfile.ZipFile(zip_path)
device_status_path = getGZs(ziparchive, 'devicestatus')[0]
# Close the member stream once its content has been parsed.
with ziparchive.open(device_status_path, mode="r") as f:
    device_status = read_gz_json_to_dict(f)
# NOTE(review): `benedict` is not imported in any visible cell — presumably
# python-benedict's `benedict`; confirm and add the import to the imports cell.
device_status = [benedict(d) for d in device_status]

66836068.zip


In [305]:
# Boolean masks: which device status entries carry a 'loop' / an 'openaps' key.
bLoop = np.array(['loop' in d for d in device_status])
bOpenAPS = np.array(['openaps' in d for d in device_status])

# Report the share and the first example per system. The label is taken from the
# pair so the openaps line is no longer mislabelled as "loop data".
for label, mask in (('loop', bLoop), ('openaps', bOpenAPS)):
    if np.sum(mask):
        print(f"{100*np.sum(mask)/len(mask):.0f}% device_satuses contain {label} data. For example:\n {device_status[np.argwhere(mask).flatten()[0]]}\n")


21% device_satuses contain loop data. For example:
 {'_id': '5ab443027bca660013280a8f', 'created_at': '2018-03-22T23:57:54Z', 'device': '91V6GE', 'loop': {'iob': {'timestamp': '2018-03-22T23:55:00Z', 'iob': 1.8159685708363793}, 'cob': {'timestamp': '2018-03-22T23:55:00Z', 'cob': 0}, 'recommendedBolus': 0, 'version': '1.5.3dev', 'predicted': {'startDate': '2018-03-22T23:49:25Z', 'values': [146, 142, 139, 136, 132, 129, 127, 124, 121, 119, 116, 114, 112, 110, 108, 106, 104, 103, 101, 100, 99, 97, 96, 95, 94, 93, 93, 92, 91, 91, 90, 90, 89, 89, 89, 89, 88, 88, 88, 88, 88, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 87, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86]}, 'name': 'BGTargets (5.2:5.2) | EvBG 4.8 | Loop2', 'enacted': {'timestamp': '2018-03-22T23:57:54Z', 'rate': 0.85, 'received': True, 'duration': 30}, 'timestamp': '2018-03-22T23:57:54Z'}}

59% device_satuses contain loop data. For example:
 {'uploader': {'batteryVoltage': 381

## Check for data availability
Let's check whether all profile entries have the `defaultProfile`, `store.Default.basal` and `loopSettings` keys.

In [302]:
# Show the archive name of the first row of df_ns.
df_ns["archive"].iloc[0]

'66836068.zip'

In [306]:
# Uses `ziparchive` as opened by the device status cell above.
profile_path = getGZs(ziparchive, 'profile')[0]
# Close the member stream once its content has been parsed.
with ziparchive.open(profile_path, mode="r") as f:
    profile = read_gz_json_to_dict(f)
# NOTE(review): `benedict` is not imported in any visible cell — presumably
# python-benedict's `benedict`, whose key paths make 'store.Default.basal' work with `in`; confirm.
profile = [benedict(d) for d in profile]

# Boolean masks: True where a profile entry lacks the respective key.
bNoProfile = np.array(['defaultProfile' not in d for d in profile])
bNoDefaultBasal = np.array(['store.Default.basal' not in d for d in profile])
bNoLoopSettings = np.array(['loopSettings' not in d for d in profile])

# Each report is guarded by its OWN mask. Previously all three tested bNoProfile,
# which hid the other two reports or indexed into an empty match list.
for key, mask in (('defaultProfile', bNoProfile),
                  ('store.Default.basal', bNoDefaultBasal),
                  ('loopSettings', bNoLoopSettings)):
    if np.sum(mask):
        print(f"{100*np.sum(mask)/len(mask):.0f}% profile entries miss the {key} string. For example:\n {profile[np.argwhere(mask).flatten()[0]]}\n")


## Let's check for another user
This user has profile entries that don't have the same structure

In [313]:
i_user = 9
zip_path = df_ns.zip_paths.iloc[i_user]
print(df_ns.archive.iloc[i_user])
# Left open: `ziparchive` is a notebook-level name that other cells may read.
ziparchive= zipfile.ZipFile(zip_path)
profile_path = getGZs(ziparchive, 'profile')[0]
# Close the member stream once its content has been parsed.
with ziparchive.open(profile_path, mode="r") as f:
    profile = read_gz_json_to_dict(f)
# NOTE(review): `benedict` is not imported in any visible cell — presumably
# python-benedict's `benedict`; confirm and add the import to the imports cell.
profile = [benedict(d) for d in profile]

# Boolean masks: True where a profile entry lacks the respective key.
bNoProfile = np.array(['defaultProfile' not in d for d in profile])
bNoDefaultBasal = np.array(['store.Default.basal' not in d for d in profile])
bNoLoopSettings = np.array(['loopSettings' not in d for d in profile])

# Each report is guarded by its OWN mask. Previously all three tested bNoProfile,
# which hid the other two reports or indexed into an empty match list.
for key, mask in (('defaultProfile', bNoProfile),
                  ('store.Default.basal', bNoDefaultBasal),
                  ('loopSettings', bNoLoopSettings)):
    if np.sum(mask):
        print(f"{100*np.sum(mask)/len(mask):.0f}% profile entries miss the {key} string. For example:\n {profile[np.argwhere(mask).flatten()[0]]}\n")


00897741.zip
11% profile entries miss the defaultProfile string. For example:
 {'delay': '20', '_id': '572aab4066cf0b1c50bc76fb', 'created_at': '2016-05-05T02:09:04.076Z', 'sens': [{'value': '50', 'time': '00:00'}, {'value': '40', 'time': '07:00'}], 'dia': '2.5', 'startDate': '2016-05-05T01:44:00.000Z', 'timezone': 'US/Pacific', 'basal': [{'value': '0.35', 'time': '00:00'}, {'value': '0.45', 'time': '02:30'}, {'value': '0.35', 'time': '09:30'}, {'value': '0.35', 'time': '11:30'}, {'value': '0.35', 'time': '19:00'}], 'units': 'mg/dl', 'target_high': [{'value': '130', 'time': '00:00'}], 'target_low': [{'value': '70', 'time': '00:00'}], 'carbs_hr': 'NaN', 'carbratio': [{'value': '15', 'time': '00:00'}, {'value': '8', 'time': '07:30'}, {'value': '13', 'time': '09:30'}]}

78% profile entries miss the store.Default.basal string. For example:
 {'defaultProfile': 'Autosense 2nd run in December ', '_id': '589164177283250c00e43b60', 'created_at': '2017-02-01T04:29:25.600Z', 'mills': '14859231000

## Looking at Basal Rates

In [320]:
i_user = 0
zip_path = df_ns.zip_paths.iloc[i_user]
print(df_ns.archive.iloc[i_user])
# Left open: `ziparchive` is a notebook-level name that other cells may read.
ziparchive= zipfile.ZipFile(zip_path)
profile_path = getGZs(ziparchive, 'profile')[0]
# Close the member stream once its content has been parsed.
with ziparchive.open(profile_path, mode="r") as f:
    profile = read_gz_json_to_dict(f)
# NOTE(review): `benedict` is not imported in any visible cell — presumably
# python-benedict's `benedict`; confirm and add the import to the imports cell.
profile = [benedict(d) for d in profile]

basal_start_dates = [p['startDate'] for p in profile]
# For each entry, look up the basal list of the profile named by its own
# 'defaultProfile' value, via the key path 'store.<defaultProfile>.basal'.
basal_rates = [p[convertToKeyPath(['store',p['defaultProfile'],'basal'])] for p in profile]
basal_rates

66836068.zip


[[{'value': 1.2, 'time': '00:00', 'timeAsSeconds': 0},
  {'value': 1.1, 'time': '03:00', 'timeAsSeconds': 10800},
  {'value': 1.2, 'time': '10:00', 'timeAsSeconds': 36000},
  {'value': 1.2, 'time': '13:00', 'timeAsSeconds': 46800},
  {'value': 1.2, 'time': '18:00', 'timeAsSeconds': 64800},
  {'value': 1.35, 'time': '19:00', 'timeAsSeconds': 68400}],
 [{'value': 1.2, 'time': '00:00', 'timeAsSeconds': 0},
  {'value': 1.1, 'time': '03:00', 'timeAsSeconds': 10800},
  {'value': 1.2, 'time': '10:00', 'timeAsSeconds': 36000},
  {'value': 1.2, 'time': '13:00', 'timeAsSeconds': 46800},
  {'value': 1.2, 'time': '18:00', 'timeAsSeconds': 64800},
  {'value': 1.35, 'time': '19:00', 'timeAsSeconds': 68400}],
 [{'value': 1.2, 'time': '00:00', 'timeAsSeconds': 0},
  {'value': 1.1, 'time': '03:00', 'timeAsSeconds': 10800},
  {'value': 1.2, 'time': '10:00', 'timeAsSeconds': 36000},
  {'value': 1.2, 'time': '13:00', 'timeAsSeconds': 46800},
  {'value': 1.2, 'time': '18:00', 'timeAsSeconds': 64800},
  {'v