In [None]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import os

In [2]:
# Load the data
df = pd.read_csv(os.path.join('..', 'data', 'raw', 'train.csv'), na_values=np.nan, low_memory=False)

In [3]:
def parse_time_diff(time_diff_str: str) -> timedelta:
    # get a string in the format of -HH:MM and return a timedelta object
    is_negative = time_diff_str[0] == '-'

    if time_diff_str[0] in ['+', '-']:
        time_diff_str = time_diff_str[1:]

    hours, minutes = time_diff_str.split(':')
    if is_negative:
        hours = -int(hours)
        minutes = -int(minutes)

    return timedelta(hours=int(hours), minutes=int(minutes))

In [None]:
# parameters 
start_date = datetime(2000, 1, 1)
ids_with_date_change = []  # p11_4307 is the datapoint that changes the date

# convert the time column to a datetime object
df["time"] = pd.to_datetime(df["time"], format="%H:%M:%S").dt.time

# define temporary variables
date_times = []
last_processed_time = None
last_processed_p_num = None
current_date = None

for i, row in df.iterrows():
    # reset when we see a new patient
    if last_processed_p_num is not None and last_processed_p_num != row["p_num"]:
        last_processed_p_num = None
        last_processed_time = None
        current_date = None

    if last_processed_p_num is None:
        last_processed_p_num = row["p_num"]
    if last_processed_time is None:
        last_processed_time = row["time"]
    if current_date is None:
        current_date = start_date

    if row["time"] < last_processed_time or row["id"] in ids_with_date_change:
        current_date = current_date + timedelta(days=1)

    date_times.append(datetime.combine(current_date, row["time"]))
    last_processed_time = row["time"]

df = df.copy()
df.loc[:, "datetime"] = date_times
df = df.set_index("datetime")
df = df.drop(columns=["time"])

In [6]:
p_nums = df['p_num'].unique()
parameters = ['bg', 'insulin', 'carbs', 'hr', 'steps', 'cals', 'activity', 'target']
time_diffs = [
    '-0:00',
    '-0:05',
    '-0:10',
    '-0:15',
    '-0:20',
    '-0:25',
    '-0:30',
    '-0:35',
    '-0:40',
    '-0:45',
    '-0:50',
    '-0:55',
    '-1:00',
    '-1:05',
    '-1:10',
    '-1:15',
    '-1:20',
    '-1:25',
    '-1:30',
    '-1:35',
    '-1:40',
    '-1:45',
    '-1:50',
    '-1:55',
    '-2:00',
    '-2:05',
    '-2:10',
    '-2:15',
    '-2:20',
    '-2:25',
    '-2:30',
    '-2:35',
    '-2:40',
    '-2:45',
    '-2:50',
    '-2:55',
    '-3:00',
    '-3:05',
    '-3:10',
    '-3:15',
    '-3:20',
    '-3:25',
    '-3:30',
    '-3:35',
    '-3:40',
    '-3:45',
    '-3:50',
    '-3:55',
    '-4:00',
    '-4:05',
    '-4:10',
    '-4:15',
    '-4:20',
    '-4:25',
    '-4:30',
    '-4:35',
    '-4:40',
    '-4:45',
    '-4:50',
    '-4:55',
    '-5:00',
    '-5:05',
    '-5:10',
    '-5:15',
    '-5:20',
    '-5:25',
    '-5:30',
    '-5:35',
    '-5:40',
    '-5:45',
    '-5:50',
    '-5:55'
]

result_dict = {}
for p_num in p_nums:
    result_dict[p_num] = {}
    for parameter in parameters:
        df_patient = df[df['p_num'] == p_num]

        new_df = None

        # This is a special case for the target parameter bg+1:00
        if parameter == 'target':
            time_diff_str = '+1:00'
            new_df = df_patient[['id', 'p_num', f'bg-0:00']].copy()
            time_diff = parse_time_diff(time_diff_str)
            values = df_patient['bg+1:00'].copy()
            values.index = values.index + time_diff
            values.index.name = "datetime"

            # add rows that are not in df
            new_df = new_df.reindex(df_patient.index.union(values.index))
            # join the values to the new_df
            new_df = new_df.join(values.rename('bg+1:00-1:00'), on="datetime")

        # This is the general case        
        if parameter != 'target':
            new_df = df_patient[['id', 'p_num', f'{parameter}-0:00']].copy()
            for time_diff_id, time_diff_str in enumerate(time_diffs):
                if time_diff_str == '-0:00':
                    continue

                if not df_patient.columns.str.contains(f"{parameter}{time_diff_str}").any():
                    continue

                time_diff = parse_time_diff(time_diff_str)
                values = df_patient[f"{parameter}{time_diff_str}"].copy()
                values.index = values.index + time_diff
                values.index.name = "datetime"

                # add rows that are not in df
                new_df = new_df.reindex(df_patient.index.union(values.index))

                # join the values to the new_df
                new_df = new_df.join(values.rename(f"{parameter}{time_diff_str}+{time_diff_str}"), on="datetime")

        # analyse the data
        # check for unique values in the columns except for the id and p_num columns
        new_df = new_df.set_index("id", drop=True)
        unique_values = new_df.drop(columns=["p_num"]).nunique(axis=1)
        unique_values = unique_values[unique_values > 1]

        result_dict[p_num][parameter] = unique_values.shape[0]
        result_dict[p_num]['total'] = new_df.shape[0]

In [7]:
result = pd.DataFrame.from_dict(result_dict, columns=parameters + ['total'], orient='index')
result

Unnamed: 0,bg,insulin,carbs,hr,steps,cals,activity,target,total
p01,0,0,0,0,0,0,0,0,9210
p02,0,0,0,0,0,0,0,0,26011
p03,0,0,0,0,0,0,0,0,26150
p04,0,0,0,0,0,0,0,0,24852
p05,0,0,0,0,0,0,0,0,8989
p06,0,0,0,0,0,0,0,0,8941
p10,0,0,0,0,0,0,0,0,25596
p11,0,0,0,68,26,45,0,8,24709
p12,0,0,0,0,0,0,0,0,25497
