In [1]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import os

In [2]:
# Load the data
df = pd.read_csv(os.path.join('..', 'data', 'raw', 'train.csv'), na_values=np.nan, low_memory=False)

In [3]:
def get_time() -> str:
    return datetime.now().strftime("%H:%M:%S")

In [4]:
def parse_time_diff(time_diff_str: str) -> timedelta:
    # get a string in the format of -HH:MM and return a timedelta object
    is_negative = time_diff_str[0] == '-'

    if time_diff_str[0] in ['+', '-']:
        time_diff_str = time_diff_str[1:]

    hours, minutes = time_diff_str.split(':')
    if is_negative:
        hours = -int(hours)
        minutes = -int(minutes)

    return timedelta(hours=int(hours), minutes=int(minutes))

In [5]:
df["time"] = pd.to_datetime(df["time"], format="%H:%M:%S").dt.time

In [6]:
p_nums = df['p_num'].unique()
parameters = ['bg', 'insulin', 'carbs', 'hr', 'steps', 'cals', 'activity']
time_diffs = [
    '-0:00',
    '-0:05',
    '-0:10',
    '-0:15',
    '-0:20',
    '-0:25',
    '-0:30',
    '-0:35',
    '-0:40',
    '-0:45',
    '-0:50',
    '-0:55',
    '-1:00',
    '-1:05',
    '-1:10',
    '-1:15',
    '-1:20',
    '-1:25',
    '-1:30',
    '-1:35',
    '-1:40',
    '-1:45',
    '-1:50',
    '-1:55',
    '-2:00',
    '-2:05',
    '-2:10',
    '-2:15',
    '-2:20',
    '-2:25',
    '-2:30',
    '-2:35',
    '-2:40',
    '-2:45',
    '-2:50',
    '-2:55',
    '-3:00',
    '-3:05',
    '-3:10',
    '-3:15',
    '-3:20',
    '-3:25',
    '-3:30',
    '-3:35',
    '-3:40',
    '-3:45',
    '-3:50',
    '-3:55',
    '-4:00',
    '-4:05',
    '-4:10',
    '-4:15',
    '-4:20',
    '-4:25',
    '-4:30',
    '-4:35',
    '-4:40',
    '-4:45',
    '-4:50',
    '-4:55',
    '-5:00',
    '-5:05',
    '-5:10',
    '-5:15',
    '-5:20',
    '-5:25',
    '-5:30',
    '-5:35',
    '-5:40',
    '-5:45',
    '-5:50',
    '-5:55'
]
start_date = datetime(2000, 1, 1)

In [7]:
# add date_time index
date_times = []
last_processed_time = None
last_processed_p_num = None
current_date = None

for i, row in df.iterrows():
    # reset when we see a new patient
    if last_processed_p_num is not None and last_processed_p_num != row["p_num"]:
        last_processed_p_num = None
        last_processed_time = None
        current_date = None

    if last_processed_p_num is None:
        last_processed_p_num = row["p_num"]
    if last_processed_time is None:
        last_processed_time = row["time"]
    if current_date is None:
        current_date = start_date

    if row["time"] < last_processed_time:
        current_date = current_date + timedelta(days=1)

    date_times.append(datetime.combine(current_date, row["time"]))
    last_processed_time = row["time"]

df = df.copy()
df.loc[:, "datetime"] = date_times
df = df.set_index("datetime")
df = df.drop(columns=["time"])

In [8]:
df.head()

Unnamed: 0_level_0,id,p_num,bg-5:55,bg-5:50,bg-5:45,bg-5:40,bg-5:35,bg-5:30,bg-5:25,bg-5:20,...,activity-0:40,activity-0:35,activity-0:30,activity-0:25,activity-0:20,activity-0:15,activity-0:10,activity-0:05,activity-0:00,bg+1:00
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01 06:10:00,p01_0,p01,,,9.6,,,9.7,,,...,,,,,,,,,,13.4
2000-01-01 06:25:00,p01_1,p01,,,9.7,,,9.2,,,...,,,,,,,,,,12.8
2000-01-01 06:40:00,p01_2,p01,,,9.2,,,8.7,,,...,,,,,,,,,,15.5
2000-01-01 06:55:00,p01_3,p01,,,8.7,,,8.4,,,...,,,,,,,,,,14.8
2000-01-01 07:10:00,p01_4,p01,,,8.4,,,8.1,,,...,,,,,,,,,,12.7


In [9]:
result_dict = {}
for p_num in p_nums:
    result_dict[p_num] = {}
    for parameter in parameters:
        df_patient = df[df['p_num'] == p_num]
        new_df = df_patient[['id', 'p_num', f'{parameter}-0:00']].copy()
        for time_diff_id, time_diff_str in enumerate(time_diffs):
            if time_diff_str == '-0:00':
                continue

            if not df_patient.columns.str.contains(f"{parameter}{time_diff_str}").any():
                continue

            time_diff = parse_time_diff(time_diff_str)
            values = df_patient[f"{parameter}{time_diff_str}"].copy()
            values.index = values.index + time_diff
            values.index.name = "datetime"

            # add rows that are not in df
            new_df = new_df.reindex(df_patient.index.union(values.index))

            # join the values to the new_df
            new_df = new_df.join(values.rename(f"{parameter}{time_diff_str}+{time_diff_str}"), on="datetime")

        # analyse the data
        # check for unique values in the columns except for the id and p_num
        new_df = new_df.set_index("id", drop=True)
        unique_values = new_df.drop(columns=["p_num"]).nunique(axis=1)
        unique_values = unique_values[unique_values > 1]

        result_dict[p_num][parameter] = {
            'consistent': new_df.shape[0] - unique_values.shape[0],
            'inconsistent': unique_values.shape[0]
        }

In [10]:
result = pd.DataFrame.from_dict(result_dict, columns=parameters, orient='index')
result

Unnamed: 0,bg,insulin,carbs,hr,steps,cals,activity
p01,"{'consistent': 16865, 'inconsistent': 0}","{'consistent': 16865, 'inconsistent': 0}","{'consistent': 16865, 'inconsistent': 0}","{'consistent': 16865, 'inconsistent': 0}","{'consistent': 16865, 'inconsistent': 0}","{'consistent': 16865, 'inconsistent': 0}","{'consistent': 16865, 'inconsistent': 0}"
p02,"{'consistent': 26335, 'inconsistent': 0}","{'consistent': 26335, 'inconsistent': 0}","{'consistent': 26335, 'inconsistent': 0}","{'consistent': 26335, 'inconsistent': 0}","{'consistent': 26335, 'inconsistent': 0}","{'consistent': 26335, 'inconsistent': 0}","{'consistent': 26335, 'inconsistent': 0}"
p03,"{'consistent': 26427, 'inconsistent': 0}","{'consistent': 26427, 'inconsistent': 0}","{'consistent': 26427, 'inconsistent': 0}","{'consistent': 26427, 'inconsistent': 0}","{'consistent': 26427, 'inconsistent': 0}","{'consistent': 26427, 'inconsistent': 0}","{'consistent': 26427, 'inconsistent': 0}"
p04,"{'consistent': 25047, 'inconsistent': 0}","{'consistent': 25047, 'inconsistent': 0}","{'consistent': 25047, 'inconsistent': 0}","{'consistent': 25047, 'inconsistent': 0}","{'consistent': 25047, 'inconsistent': 0}","{'consistent': 25047, 'inconsistent': 0}","{'consistent': 25047, 'inconsistent': 0}"
p05,"{'consistent': 16248, 'inconsistent': 0}","{'consistent': 16248, 'inconsistent': 0}","{'consistent': 16248, 'inconsistent': 0}","{'consistent': 16248, 'inconsistent': 0}","{'consistent': 16248, 'inconsistent': 0}","{'consistent': 16248, 'inconsistent': 0}","{'consistent': 16248, 'inconsistent': 0}"
p06,"{'consistent': 16674, 'inconsistent': 0}","{'consistent': 16674, 'inconsistent': 0}","{'consistent': 16674, 'inconsistent': 0}","{'consistent': 16674, 'inconsistent': 0}","{'consistent': 16674, 'inconsistent': 0}","{'consistent': 16674, 'inconsistent': 0}","{'consistent': 16674, 'inconsistent': 0}"
p10,"{'consistent': 25874, 'inconsistent': 0}","{'consistent': 25874, 'inconsistent': 0}","{'consistent': 25874, 'inconsistent': 0}","{'consistent': 25874, 'inconsistent': 0}","{'consistent': 25874, 'inconsistent': 0}","{'consistent': 25874, 'inconsistent': 0}","{'consistent': 25874, 'inconsistent': 0}"
p11,"{'consistent': 25205, 'inconsistent': 0}","{'consistent': 25205, 'inconsistent': 0}","{'consistent': 25205, 'inconsistent': 0}","{'consistent': 25137, 'inconsistent': 68}","{'consistent': 25179, 'inconsistent': 26}","{'consistent': 25160, 'inconsistent': 45}","{'consistent': 25205, 'inconsistent': 0}"
p12,"{'consistent': 26048, 'inconsistent': 0}","{'consistent': 26048, 'inconsistent': 0}","{'consistent': 26048, 'inconsistent': 0}","{'consistent': 26048, 'inconsistent': 0}","{'consistent': 26048, 'inconsistent': 0}","{'consistent': 26048, 'inconsistent': 0}","{'consistent': 26048, 'inconsistent': 0}"
