# MongoDB - data exploration, part 2

This time I'm just starting to organize things: getting time into pandas datetime format so I can work with it more easily, and starting to chip away at pulling carb ratios out of MongoDB at a requested time.

In [224]:
import sys
sys.path.append("../")

from mdb_tools import load_data as ld
from mdb_tools import stats 
from mdb_tools import schemas
from pymongoarrow.api import Schema
import pyarrow as pa

import numpy as np
import pandas as pd

In [2]:
# Path (relative to this notebook) of the YAML file with the database connection secrets
yml_secrets_file = '../../secrets/mdb_secrets.yml'

# Access the database using the yml secrets file, and get a specific set of "collections":
# entries, treatments, profile and devicestatus (in that order)
col_entries, col_treatments, col_profile, col_devicestatus = ld.get_collections(yml_secrets_file)

# Grab the schemas that are passed to find_pandas_all below.
# NOTE: there is no schema for the profile collection; it is read with plain `find` later on.
entries_schema, treatments_schema, devicestatus_schema = schemas.mdb_schemas() 

# Load each collection into a DataFrame. The empty filter `{}` applies no query restriction.
# NOTE(review): `find_pandas_all` is presumably the pymongoarrow API patched onto the
# collections inside mdb_tools -- TODO confirm.
df_entries = col_entries.find_pandas_all({}, schema=entries_schema)
df_treatments = col_treatments.find_pandas_all({}, schema=treatments_schema)
df_devicestatus = col_devicestatus.find_pandas_all({}, schema=devicestatus_schema)

In [6]:
# Convert the ISO-8601 "dateString" column to pandas datetimes.
# The strings end in "Z", so the resulting "time" column is timezone-aware (UTC).
# NOTE: this adds columns to df_entries in place.
df_entries["time"] = pd.to_datetime(df_entries["dateString"])

# Make a new column that has year-day as strings (e.g. "2023-072" for 2023-03-13)
df_entries["yearday"] = stats.get_yeardays(df_entries["time"])

# Display the frame with the two new columns
df_entries

Unnamed: 0,sgv,dateString,time,yearday
0,182.0,2023-03-13T16:43:43.000Z,2023-03-13 16:43:43+00:00,2023-072
1,160.0,2023-03-13T16:13:43.000Z,2023-03-13 16:13:43+00:00,2023-072
2,172.0,2023-03-13T16:23:43.000Z,2023-03-13 16:23:43+00:00,2023-072
3,154.0,2023-03-13T15:58:43.000Z,2023-03-13 15:58:43+00:00,2023-072
4,157.0,2023-03-13T16:03:43.000Z,2023-03-13 16:03:43+00:00,2023-072
...,...,...,...,...
53787,229.0,2023-10-22T15:54:30.000Z,2023-10-22 15:54:30+00:00,2023-295
53788,222.0,2023-10-22T15:59:31.000Z,2023-10-22 15:59:31+00:00,2023-295
53789,222.0,2023-10-22T15:59:30.000Z,2023-10-22 15:59:30+00:00,2023-295
53790,225.0,2023-10-22T16:04:32.000Z,2023-10-22 16:04:32+00:00,2023-295


## Getting carb ratios

Extracting data about carb ratios is a little more complicated because any given document in the profile data collection has one time stamp but one or more carb ratios. Next I'm creating a function that accepts any time stamp and returns the active carb ratio at that time. 

To do this, I'll need to find the most recent previous profile entry and then look at the carb ratio that would be relevant for that time of day. 


In [484]:
# Convert the timestamps to unix time (integer nanoseconds since the epoch) so that
# date/time comparisons work on integers rather than strings.
# `astype("int64")` is used instead of `Series.view("int64")`, which is deprecated
# in pandas >= 2.2; both give the same UTC nanosecond values.
df_entries["time_unix"] = df_entries["time"].astype("int64")

# Requested times used for experiments: every entry from positional row 53687 onward
in_times_2 = df_entries["time"].iloc[53687:]


In [479]:
def get_setting_at_times(in_times, col_prof, req_profile = "Default"):
    """
    Return the carb ratio from a profile collection at each requested time.

    For every requested time, the most recent profile document (by its "mills"
    timestamp) that contains `req_profile` and is not later than the requested
    time is selected. Within that profile, the "carbratio" entry with the latest
    "timeAsSeconds" at or before the requested time of day is returned.

    Args:
        in_times (array-like): requested times in pandas timestamp format
        col_prof (mongodb collection): profile collection that includes the carb
            ratios. Only `col_prof.find({})` is called on it.
        req_profile (str): requested profile name. Default is "Default".

    Returns:
        list: the carb ratio for each entry of in_times, in the same order
        (an empty list when in_times is empty).

    Raises:
        ValueError: if no document contains `req_profile`, if a requested time is
            earlier than every such document, or if the selected carb ratio
            schedule has no entry at or before the requested time of day.

    NOTE(review): the time of day is computed in the timezone of in_times (UTC for
    the entries loaded in this notebook), while profile documents carry their own
    "timezone" field -- confirm whether the schedule should be read in local time.
    """
    # ##### Prep time variables #####

    # Convert in_times array to series (if it isn't already)
    in_times = pd.Series(in_times)

    # Nothing requested: return early (an empty Series has no datetime accessor)
    if in_times.empty:
        return []

    # Convert requested input times to unix time (integer nanoseconds).
    # astype is used because Series.view is deprecated in pandas >= 2.2.
    in_times_unix = in_times.astype("int64").to_numpy()

    # Compute seconds elapsed in current day (to get at the current carb ratio)
    seconds_in_day = in_times.dt.hour*3600 + in_times.dt.minute*60 + in_times.dt.second

    # ##### Prep profile documents #####

    # Keep (timestamp, profile settings) for every document that contains the
    # requested profile, sorted by time. "mills" is scaled to nanoseconds with
    # integer arithmetic so no float rounding occurs at this magnitude.
    candidates = sorted(
        (
            (int(doc["mills"]) * 1_000_000, doc["store"][req_profile])
            for doc in col_prof.find({})
            if req_profile in doc.get("store", {})
        ),
        key=lambda pair: pair[0],
    )
    if not candidates:
        raise ValueError(f"No profile document contains the profile '{req_profile}'")

    # ##### Next, figure out which document is appropriate for each requested time #####

    # Position of the last candidate whose timestamp is <= the requested time
    # (-1 when the requested time precedes every candidate)
    doc_times = np.array([doc_time for doc_time, _ in candidates], dtype="int64")
    doc_pos = np.searchsorted(doc_times, in_times_unix, side="right") - 1

    val_req = []
    for req_time, req_seconds, pos in zip(in_times, seconds_in_day, doc_pos):
        if pos < 0:
            raise ValueError(
                f"No '{req_profile}' profile document at or before {req_time}"
            )

        carb_ratio = candidates[pos][1]["carbratio"]

        # Schedule entries that have already started at this time of day. `<=` so a
        # request exactly on a boundary (e.g. midnight) uses the entry starting then.
        started = [cr for cr in carb_ratio if int(cr["timeAsSeconds"]) <= req_seconds]
        if not started:
            raise ValueError(
                f"Carb ratio schedule has no entry at or before {req_seconds} s into the day"
            )

        # Latest started entry, independent of the order of the stored list
        val_req.append(max(started, key=lambda cr: int(cr["timeAsSeconds"]))["value"])

    return val_req

In [475]:
# Scratch check: pd.Series(...) accepts in_times both as-is and as a numpy array.
# Only the last expression of a cell is displayed, so the output below belongs to the
# second line (it keeps the timezone-aware datetime dtype).
pd.Series(in_times)
pd.Series(np.asarray(in_times))

0   2023-10-13 16:43:54+00:00
1   2023-10-13 16:48:54+00:00
2   2023-10-13 16:48:53+00:00
dtype: datetime64[ns, UTC]

In [225]:
# Alternative based on the "startDate" field, kept for reference:
# prof_time = [int(pd.Timestamp(prof["startDate"]).timestamp())*1e9 for prof in col_profile.find({})]

# Profile document timestamps: "mills" scaled by 1e6 to integer nanoseconds.
# Integer arithmetic is used because floats cannot represent every value at this magnitude.
prof_ms_time = [int(prof["mills"]) * 1_000_000 for prof in col_profile.find({})]

# Time elapsed between each profile document and the requested time
# (negative when the document is later than in_time)
t_diff = [in_time-t for t in prof_ms_time]
if not any(d >= 0 for d in t_diff):
    raise ValueError("in_time is earlier than every profile document")

# Get the index of the most "recent" document from the collection: the smallest
# NON-NEGATIVE elapsed time. A plain argmin over the signed differences would pick
# the newest document overall, even one later than in_time.
current_prof_doc = np.argmin([d if d >= 0 else np.inf for d in t_diff])




113

In [383]:
# Name of the profile to look up. Defined here so this cell runs on a fresh kernel;
# "Default" matches the default argument of get_setting_at_times.
req_profile = "Default"

# Dump every profile document into a list
prof_docs_all = list(col_profile.find({}))

# Get all the time stamps and convert to unix time (nanoseconds).
# Integer arithmetic is used because floats cannot represent every value at this magnitude.
prof_time_unix = [int(prof["mills"]) * 1_000_000 for prof in prof_docs_all]

# Get all the "store" items in each document (mapping of profile name -> profile settings)
all_store = [doc["store"] for doc in prof_docs_all]

# For each document, keep JUST the settings of the requested profile
# (an empty list when the document does not contain that profile)
all_store_vals = [[settings for name, settings in store.items() if name == req_profile]
                  for store in all_store]

# For each document, the matching profile name (again empty when the profile is absent)
all_store_profiles = [[name for name in store if name == req_profile] for store in all_store]




In [401]:
# Spot check on one arbitrarily chosen document (index 50): take its timestamp and
# test whether the requested profile name is listed for that document.
t_idx = 50
t = prof_time_unix[t_idx]
req_profile in all_store_profiles[t_idx]

True

In [431]:

# ##### Next, figure out which profile is appropriate for each requested time

# For each requested time, get the index (into prof_time_unix / all_store) of the most
# "recent" document: the one with the latest timestamp that is not later than the
# requested time and that contains the requested profile name.
# The index has to refer to the unfiltered document list, so the latest qualifying
# timestamp is selected explicitly instead of counting the qualifying documents.
doc_idx = [
    max(
        (t_idx for t_idx, t in enumerate(prof_time_unix)
         if in_time - t >= 0 and req_profile in all_store_profiles[t_idx]),
        key=lambda t_idx: prof_time_unix[t_idx],
        default=None,
    )
    for in_time in in_times_unix
]

# Fail loudly instead of silently indexing the wrong document
if any(this_doc is None for this_doc in doc_idx):
    raise ValueError(f"No '{req_profile}' profile document at or before a requested time")

# Profile info for each requested time
requested_prof_info = [[all_store[this_doc][prof] for prof in all_store_profiles[this_doc]]
                       for this_doc in doc_idx]

    

In [439]:
# Requested times: a small sample of entry timestamps (stands in for the function argument)
in_times = df_entries["time"].iloc[50000:50003]

# Integer nanoseconds since the unix epoch.
# `astype("int64")` is used instead of `Series.view("int64")`, which is deprecated in pandas >= 2.2.
in_times_unix = in_times.astype("int64")

# Seconds elapsed since midnight, in the timezone of in_times
seconds_in_day = in_times.dt.hour*3600 + in_times.dt.minute*60 + in_times.dt.second
seconds_in_day

50000    60234
50001    60534
50002    60533
Name: time, dtype: int32

In [462]:
# Settings of the requested profile for each requested time
requested_prof_info = [[all_store[this_doc][prof] for prof in all_store_profiles[this_doc]
                        if prof == req_profile] for this_doc in doc_idx]

# Carb ratio active at each requested time of day
val_req = []
for idx, prof_info in enumerate(requested_prof_info):
    carb_ratio = prof_info[0]["carbratio"]
    this_req_time = seconds_in_day.iloc[idx]

    # Schedule entries that have already started at the requested time of day.
    # `<=` so that a request exactly on a boundary (e.g. midnight, 0 s) uses the
    # entry that starts at that moment instead of finding no entry at all.
    started = [cr for cr in carb_ratio if cr["timeAsSeconds"] <= this_req_time]
    if not started:
        raise ValueError(f"Carb ratio schedule has no entry at or before {this_req_time} s")

    # Latest started entry, independent of the order of the stored list
    val_req.append(max(started, key=lambda cr: cr["timeAsSeconds"])["value"])


In [None]:
def get_daily_setting(req_prof_info, req_prof_setting, req_time):
    """
    Get either the carb ratio, insulin sensitivity, or basal rate at a particular time

    NOTE(review): not implemented yet -- the body consists of this docstring only,
    so calling the function returns None.

    Args (presumed from the parameter names -- TODO confirm once implemented):
        req_prof_info: settings of one profile, presumably a dict such as the
            entries of requested_prof_info built elsewhere in this notebook
        req_prof_setting: which setting to read, presumably a key such as
            "carbratio", "sens" or "basal"
        req_time: the requested time; the expected type/unit is not established
    """

In [463]:
# Display the carb ratio found for each requested time
val_req

[10, 10, 10]

In [434]:
# For each requested time, get the index (into prof_time_unix / all_store) of the most
# "recent" document: the one with the latest timestamp that is not later than the
# requested time and that contains the requested profile.
# The index has to refer to the unfiltered document list, so the latest qualifying
# timestamp is selected explicitly instead of counting the qualifying documents.
doc_idx = [
    max(
        (t_idx for t_idx, t in enumerate(prof_time_unix)
         if in_time - t >= 0 and req_profile in all_store_profiles[t_idx]),
        key=lambda t_idx: prof_time_unix[t_idx],
        default=None,
    )
    for in_time in in_times_unix
]

# Fail loudly instead of silently indexing the wrong document
if any(this_doc is None for this_doc in doc_idx):
    raise ValueError(f"No '{req_profile}' profile document at or before a requested time")

# Profile info for each requested time, restricted to the requested profile
requested_prof_info = [[all_store[this_doc][prof] for prof in all_store_profiles[this_doc]
                        if prof == req_profile] for this_doc in doc_idx]

requested_prof_info
    

[[{'carbratio': [{'time': '00:00', 'timeAsSeconds': 0, 'value': 15},
    {'value': 12, 'timeAsSeconds': 19800, 'time': '05:30'},
    {'time': '11:00', 'timeAsSeconds': 39600, 'value': 15},
    {'timeAsSeconds': 57600, 'value': 10, 'time': '16:00'},
    {'value': 15, 'timeAsSeconds': 72000, 'time': '20:00'}],
   'carbs_hr': '0',
   'target_high': [{'time': '00:00', 'timeAsSeconds': 0, 'value': 115}],
   'units': 'mg/dL',
   'timezone': 'ETC/GMT+4',
   'sens': [{'value': 190, 'time': '00:00', 'timeAsSeconds': 0},
    {'time': '06:00', 'value': 150, 'timeAsSeconds': 21600},
    {'time': '12:00', 'timeAsSeconds': 43200, 'value': 215},
    {'time': '18:00', 'value': 165, 'timeAsSeconds': 64800}],
   'delay': '0',
   'target_low': [{'value': 100, 'time': '00:00', 'timeAsSeconds': 0}],
   'dia': 6,
   'basal': [{'time': '00:00', 'value': 0.35, 'timeAsSeconds': 0},
    {'value': 0.25, 'time': '02:30', 'timeAsSeconds': 9000},
    {'time': '06:00', 'timeAsSeconds': 21600, 'value': 0.35},
    {'v

In [433]:
# Display the requested times as integer nanoseconds since the unix epoch
in_times_unix


50000    1697215434000000000
50001    1697215734000000000
50002    1697215733000000000
Name: time, dtype: int64