# `load-data.ipynb`

# Sharpic Dataset

## Load raw data from CSVs

In [9]:
from pathlib import Path
import duckdb

# Locate the raw-data tree relative to the notebook's working directory.
cwd = Path.cwd()
root_dir = cwd.joinpath('..', '_raw_data')

# The Sharpic glucose CSVs sit in a nested folder under the raw-data root.
raw_data_dir = root_dir.joinpath('sharpicManU', 'Glucose Data')

# Read every CSV matching the glob into one DuckDB relation.
# filename=True adds a `filename` column holding each row's source file.
csv_paths = str(raw_data_dir.joinpath('*.csv'))
df_all = duckdb.from_csv_auto(csv_paths, filename=True)

### Show full dataset

In [10]:
# Preview the combined relation: one row per glucose reading, plus the
# `filename` column requested via filename=True when the CSVs were loaded.
df_all

┌──────────────────┬────────┬────────────────────────────────────────────────────────────────────────────────────────────┐
│      bg_ts       │ value  │                                          filename                                          │
│     varchar      │ double │                                          varchar                                           │
├──────────────────┼────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│ 01/10/2023 00:04 │    7.5 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:09 │    8.0 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:14 │    8.6 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:19 │    9.2 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:

In [11]:
# Register the relation under the name `glucose_data` so that SQL statements
# (e.g. the COPY export further down) can refer to it by name.
df_view = df_all.create_view('glucose_data')
# Summary statistics. Note that `bg_ts` is typed as varchar here (it was not
# parsed as a timestamp), so its min/max are string comparisons and not
# chronological bounds.
df_view.describe()

┌─────────┬──────────────────┬────────────────────┐
│  aggr   │      bg_ts       │       value        │
│ varchar │     varchar      │       double       │
├─────────┼──────────────────┼────────────────────┤
│ count   │ 356146           │           356146.0 │
│ mean    │ NULL             │  8.190111663007146 │
│ stddev  │ NULL             │ 3.1559837370187727 │
│ min     │ 01/01/2024 00:00 │                0.1 │
│ max     │ 31/12/2023 23:59 │               27.8 │
│ median  │ NULL             │                7.5 │
└─────────┴──────────────────┴────────────────────┘

In [12]:
# Preview the registered view (the relation returned by create_view above).
df_view

┌──────────────────┬────────┬────────────────────────────────────────────────────────────────────────────────────────────┐
│      bg_ts       │ value  │                                          filename                                          │
│     varchar      │ double │                                          varchar                                           │
├──────────────────┼────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│ 01/10/2023 00:04 │    7.5 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:09 │    8.0 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:14 │    8.6 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:19 │    9.2 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:

## Export to parquet

In [13]:
# Export the `glucose_data` view to Parquet with DuckDB's COPY command.
# NOTE(review): because of PARTITION_BY (filename), DuckDB is expected to
# write a directory tree under this path rather than a single file — confirm.
parquet_path = str(root_dir / 'sharpicManU' / 'GlucoseData.parquet')
# Double any single quotes so a path containing an apostrophe cannot end the
# SQL string literal early and break the COPY statement.
parquet_path_sql = parquet_path.replace("'", "''")
duckdb.sql(f"""
    COPY glucose_data TO '{parquet_path_sql}'
    (FORMAT PARQUET, COMPRESSION 'SNAPPY', PARTITION_BY (filename), OVERWRITE_OR_IGNORE TRUE)
""")
print(f"Data exported to {parquet_path}")

Data exported to d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\GlucoseData.parquet


## Export to `postgres`

In [14]:
from getpass import getpass
from urllib.parse import quote

# Prompt for the password interactively so it is never saved in the notebook.
pg_password = getpass("Enter PostgreSQL password: ")
# PostgreSQL connection string.
# The password is percent-encoded (safe="" also encodes '/') because
# characters such as '@', ':', '/', '#' or '%' would otherwise be parsed as
# URL delimiters and corrupt the connection string.
pg_conn_str = f"postgresql://postgres:{quote(pg_password, safe='')}@192.168.7.221:5432/pfun"

In [15]:
# Push the Sharpic view into PostgreSQL via pandas.
import pandas as pd

# Materialise the DuckDB view in memory as a pandas DataFrame.
df = df_view.to_df()

# Write the frame to Postgres, replacing any existing table of the same name
# and omitting the pandas index.
df.to_sql(
    name='sharpic_glucose_data',
    con=pg_conn_str,
    if_exists='replace',
    index=False,
)

146

### Check that the data exists in postgres

In [16]:
# Read back a handful of rows to confirm the table was written.
df_smol = pd.read_sql_query(
    sql="SELECT * FROM sharpic_glucose_data LIMIT 5;",
    con=pg_conn_str,
)

df_smol

Unnamed: 0,bg_ts,value,filename
0,01/10/2023 00:04,7.5,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
1,01/10/2023 00:09,8.0,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
2,01/10/2023 00:14,8.6,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
3,01/10/2023 00:19,9.2,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
4,01/10/2023 00:24,9.7,d:\RobbieDocuments\Data\notebooks\..\_raw_data...


---

# Dexcom Dataset

## Load Dexcom data from CSV

In [17]:
from pathlib import Path
import duckdb

# Point at the Dexcom folder under the shared raw-data root.
# (This rebinds `raw_data_dir`, `csv_paths` and `df_all` from the Sharpic section.)
raw_data_dir = root_dir.joinpath('dexcom')

# Read every Dexcom CSV into one DuckDB relation;
# filename=True adds a `filename` column holding each row's source file.
csv_paths = str(raw_data_dir.joinpath('*.csv'))
df_all = duckdb.from_csv_auto(csv_paths, filename=True)

In [18]:
# Preview the Dexcom relation (`df_all` now refers to the Dexcom CSVs).
df_all

┌──────────┬─────────────────────┬─────────────────────┬───────┬────────┬───────┬──────────┬──────────────┬────────────────┬─────────────────┬────────────────┬───────────────┬──────────────────────────────────────────────────────────────────────┐
│ user_id  │       ts_utc        │      ts_local       │ is_sg │   sg   │ is_fl │ meal_tag │ tag_bef_meal │ tag_after_meal │ tag_after_snack │ tag_seems_high │ tag_seems_low │                               filename                               │
│  int64   │      timestamp      │      timestamp      │ int64 │ double │ int64 │ varchar  │   boolean    │    boolean     │     boolean     │    boolean     │    boolean    │                               varchar                                │
├──────────┼─────────────────────┼─────────────────────┼───────┼────────┼───────┼──────────┼──────────────┼────────────────┼─────────────────┼────────────────┼───────────────┼──────────────────────────────────────────────────────────────────────┤
│ 10130489 │

In [19]:
# Register the Dexcom relation under the name `dexcom_glucose_data`; SQL that
# should read the Dexcom rows must use this name (`glucose_data` is the
# Sharpic view created earlier).
df_view = df_all.create_view('dexcom_glucose_data')
# Summary statistics for the Dexcom columns.
df_view.describe()

┌─────────┬────────────┬─────────────────────┬─────────────────────┬────────────────────┬────────────────────┬──────────────────────┬───────────┬──────────────┬────────────────┬─────────────────┬────────────────┬───────────────┐
│  aggr   │  user_id   │       ts_utc        │      ts_local       │       is_sg        │         sg         │        is_fl         │ meal_tag  │ tag_bef_meal │ tag_after_meal │ tag_after_snack │ tag_seems_high │ tag_seems_low │
│ varchar │   double   │       varchar       │       varchar       │       double       │       double       │        double        │  varchar  │   varchar    │    varchar     │     varchar     │    varchar     │    varchar    │
├─────────┼────────────┼─────────────────────┼─────────────────────┼────────────────────┼────────────────────┼──────────────────────┼───────────┼──────────────┼────────────────┼─────────────────┼────────────────┼───────────────┤
│ count   │     2879.0 │ 2879                │ 2879                │             287

In [20]:
# Preview the registered Dexcom view.
df_view

┌──────────┬─────────────────────┬─────────────────────┬───────┬────────┬───────┬──────────┬──────────────┬────────────────┬─────────────────┬────────────────┬───────────────┬──────────────────────────────────────────────────────────────────────┐
│ user_id  │       ts_utc        │      ts_local       │ is_sg │   sg   │ is_fl │ meal_tag │ tag_bef_meal │ tag_after_meal │ tag_after_snack │ tag_seems_high │ tag_seems_low │                               filename                               │
│  int64   │      timestamp      │      timestamp      │ int64 │ double │ int64 │ varchar  │   boolean    │    boolean     │     boolean     │    boolean     │    boolean    │                               varchar                                │
├──────────┼─────────────────────┼─────────────────────┼───────┼────────┼───────┼──────────┼──────────────┼────────────────┼─────────────────┼────────────────┼───────────────┼──────────────────────────────────────────────────────────────────────┤
│ 10130489 │

## Export to parquet

In [21]:
# Export the Dexcom view to Parquet with DuckDB's COPY command.
# The source must be `dexcom_glucose_data` (the view registered for the
# Dexcom relation); `glucose_data` is the Sharpic view and would write the
# wrong dataset into the Dexcom folder.
# NOTE(review): because of PARTITION_BY (filename), DuckDB is expected to
# write a directory tree under this path rather than a single file — confirm.
parquet_path = str(root_dir / 'dexcom' / 'valid_data.parquet')
# Double any single quotes so a path containing an apostrophe cannot end the
# SQL string literal early and break the COPY statement.
parquet_path_sql = parquet_path.replace("'", "''")
duckdb.sql(f"""
    COPY dexcom_glucose_data TO '{parquet_path_sql}'
    (FORMAT PARQUET, COMPRESSION 'SNAPPY', PARTITION_BY (filename), OVERWRITE_OR_IGNORE TRUE)
""")
print(f"Data exported to {parquet_path}")

Data exported to d:\RobbieDocuments\Data\notebooks\..\_raw_data\dexcom\valid_data.parquet


## Export to `postgres`

In [22]:
# This cell is inert (everything below is commented out): the Dexcom export
# reuses the `pg_conn_str` built in the Sharpic "Export to `postgres`" section.
# Uncomment the lines below to prompt for the password again if that earlier
# cell has not been run in the current kernel session.

# from getpass import getpass

# # PostgreSQL connection string
# pg_password = getpass("Enter PostgreSQL password: ")
# pg_conn_str = f"postgresql://postgres:{pg_password}@192.168.7.221:5432/pfun"

In [23]:
# Push the Dexcom view into PostgreSQL via pandas.
import pandas as pd

# Materialise the DuckDB view in memory as a pandas DataFrame.
df = df_view.to_df()

# Write the frame to Postgres, replacing any existing table of the same name
# and omitting the pandas index.
df.to_sql(
    name='dexcom_glucose_data',
    con=pg_conn_str,
    if_exists='replace',
    index=False,
)

879

### Check that the dexcom data exists in postgres

In [24]:
# Read back a handful of rows to confirm the Dexcom table was written.
df_smol = pd.read_sql_query(
    sql="SELECT * FROM dexcom_glucose_data LIMIT 5;",
    con=pg_conn_str,
)

df_smol

Unnamed: 0,user_id,ts_utc,ts_local,is_sg,sg,is_fl,meal_tag,tag_bef_meal,tag_after_meal,tag_after_snack,tag_seems_high,tag_seems_low,filename
0,10130489,2021-01-18 09:48:09,2021-01-18 01:48:09,1,112.0,0,,True,False,False,False,False,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
1,10130489,2021-01-18 09:53:08,2021-01-18 01:53:08,1,102.0,0,,True,False,False,False,True,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
2,10130489,2021-01-18 09:58:08,2021-01-18 01:58:08,1,99.0,0,,True,False,False,False,True,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
3,10130489,2021-01-18 10:03:09,2021-01-18 02:03:09,1,101.0,0,,True,False,False,False,True,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
4,10130489,2021-01-18 10:08:08,2021-01-18 02:08:08,1,102.0,0,,True,False,False,False,True,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
