In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline

In [6]:
def preprocess_file(file_path: str) -> "pd.DataFrame | None":
    """Load one semicolon-delimited CSV and validate it as a gap-free 5-minute series.

    Parameters
    ----------
    file_path : str
        Path to a CSV file that uses ``;`` as delimiter and has a ``time`` column.

    Returns
    -------
    pd.DataFrame or None
        The rows sorted by ``time`` (parsed to datetime) with the index
        renumbered from 0. ``None`` is returned, after printing the reason,
        when the file contains any missing value or when two consecutive
        timestamps are not exactly 5 minutes apart.
    """
    raw = pd.read_csv(file_path, delimiter=';')

    # Any missing cell disqualifies the whole file.
    if raw.isna().any().any():
        print(f"Null values found in: {file_path}")
        return None

    # Build a new, time-ordered frame instead of mutating in place; the index
    # is renumbered so that row labels follow chronological order.
    ordered = (
        raw.assign(time=pd.to_datetime(raw["time"]))
        .sort_values("time")
        .reset_index(drop=True)
    )

    # The first diff is NaT (no predecessor), hence the dropna().
    time_deltas = ordered["time"].diff().dropna()
    if time_deltas.ne(pd.Timedelta(minutes=5)).any():
        print(f"The samples were not in the 5 min interval here: {file_path}")
        return None

    return ordered

In [7]:
def preprocess_data(dir_path: str) -> pd.DataFrame:
    """Validate every ``.csv`` file in a directory and stack the accepted ones.

    Files are visited in sorted filename order and each one is passed to
    ``preprocess_file``. A file for which that call returns ``None`` is
    skipped, but it still consumes its number: ``user_id`` is the 1-based
    position of the file among all ``.csv`` files in the directory.

    Parameters
    ----------
    dir_path : str
        Directory to scan (not recursive) for files ending in ``.csv``.

    Returns
    -------
    pd.DataFrame
        All accepted frames concatenated with a fresh 0..n-1 index and an
        added ``user_id`` column.

    Raises
    ------
    ValueError
        If no file in ``dir_path`` passes validation.
    """
    csv_names = sorted(name for name in os.listdir(dir_path) if name.endswith(".csv"))

    accepted = []
    for position, filename in enumerate(csv_names, start=1):
        frame = preprocess_file(os.path.join(dir_path, filename))
        if frame is not None:
            accepted.append(frame.assign(user_id=position))

    if not accepted:
        # pd.concat([]) would raise a bare "No objects to concatenate";
        # raise the same exception type but say which directory was empty.
        raise ValueError(f"No valid CSV files found in: {dir_path}")

    return pd.concat(accepted, ignore_index=True)

In [8]:
# Build the combined frame from every accepted CSV file under "data".
db = preprocess_data("data")
# Preview the first rows of the combined frame.
db.head()

Unnamed: 0,time,glucose,calories,heart_rate,steps,basal_rate,bolus_volume_delivered,carb_input,user_id
0,2018-06-13 18:40:00,332.0,6.3595,82.322835,34.0,0.091667,0.0,0.0,1
1,2018-06-13 18:45:00,326.0,7.728,83.740157,0.0,0.091667,0.0,0.0,1
2,2018-06-13 18:50:00,330.0,4.7495,80.52518,0.0,0.091667,0.0,0.0,1
3,2018-06-13 18:55:00,324.0,6.3595,89.129032,20.0,0.091667,0.0,0.0,1
4,2018-06-13 19:00:00,306.0,5.152,92.495652,0.0,0.075,0.0,0.0,1


In [9]:
# Show column dtypes and non-null counts of the combined frame.
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309392 entries, 0 to 309391
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   time                    309392 non-null  datetime64[ns]
 1   glucose                 309392 non-null  float64       
 2   calories                309392 non-null  float64       
 3   heart_rate              309392 non-null  float64       
 4   steps                   309392 non-null  float64       
 5   basal_rate              309392 non-null  float64       
 6   bolus_volume_delivered  309392 non-null  float64       
 7   carb_input              309392 non-null  float64       
 8   user_id                 309392 non-null  int64         
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 21.2 MB


In [10]:
# Summary statistics for each column of the combined frame.
db.describe()

Unnamed: 0,time,glucose,calories,heart_rate,steps,basal_rate,bolus_volume_delivered,carb_input,user_id
count,309392,309392.0,309392.0,309392.0,309392.0,309392.0,309392.0,309392.0,309392.0
mean,2020-09-24 13:06:07.680159232,141.425051,8.813568,76.990001,30.825005,0.041324,0.066058,0.052718,20.813066
min,2018-06-13 18:40:00,40.0,0.0,32.407773,0.0,0.0,-3.0,0.0,1.0
25%,2020-01-30 11:00:00,99.666667,5.8461,64.930233,0.0,0.0,0.0,0.0,22.0
50%,2020-10-31 16:52:30,132.0,6.2781,75.418726,0.0,0.056,0.0,0.0,24.0
75%,2021-07-27 06:31:15,173.0,9.19306,85.612685,11.0,0.066,0.0,0.0,24.0
max,2022-05-18 12:15:00,444.0,106.35,195.615385,842.0,0.25,19.8,130.0,25.0
std,,57.085587,6.930449,15.546699,84.981109,0.036106,0.755075,1.505433,6.283139


In [11]:
# Persist the combined frame as CSV (to_csv defaults: comma separator, index written).
# NOTE(review): this file lands in "data", the same directory that
# preprocess_data("data") scans for *.csv, so a later re-run of the notebook
# will pick db.csv up as an input file -- consider writing it elsewhere.
db.to_csv("data/db.csv")

In [13]:
# Load the synthetic series; the first column holds the label of each row.
synthetic = pd.read_csv("synthetic/synt.csv")
labels = synthetic.iloc[:, 0]
# Relabel in one pass: first column becomes "type", the rest t_1..t_n.
synthetic = synthetic.set_axis(
    ["type"] + [f"t_{i}" for i in range(1, synthetic.shape[1])],
    axis=1,
)
# Export is currently disabled:
#synthetic.to_csv("synthetic/prepr_synt.csv")

In [14]:
def make_smaller_by_type(df: pd.DataFrame, typ: int, num: int) -> pd.DataFrame:
    """Return at most the first ``num`` rows of ``df`` whose ``type`` equals ``typ``.

    Rows keep their original order and index labels.
    """
    return df.loc[df["type"] == typ].head(num)