# `load-data.ipynb`

## Load data from CSVs

In [2]:
from pathlib import Path
import duckdb

# Locate the raw glucose CSVs relative to the notebook's working directory.
cwd = Path.cwd()
root_dir = cwd.joinpath('..', '_raw_data', 'sharpicManU')
raw_data_dir = root_dir.joinpath('Glucose Data')

# Read every CSV in the folder into a single DuckDB relation. filename=True
# adds a `filename` column recording which file each row was read from.
# NOTE(review): `bg_ts` comes back as varchar (see the schema displayed below),
# so ordering/min/max on it are lexicographic, not chronological. Consider an
# explicit timestamp format once it is confirmed to hold for every file.
csv_paths = str(raw_data_dir / '*.csv')
df_all = duckdb.from_csv_auto(csv_paths, filename=True)

### Show full dataset

In [3]:
# Bare expression as the cell's last line: the notebook displays the relation
# as a table, with column names and inferred types in the header.
df_all

┌──────────────────┬────────┬────────────────────────────────────────────────────────────────────────────────────────────┐
│      bg_ts       │ value  │                                          filename                                          │
│     varchar      │ double │                                          varchar                                           │
├──────────────────┼────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│ 01/10/2023 00:04 │    7.5 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:09 │    8.0 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:14 │    8.6 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:19 │    9.2 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:

In [4]:
# Register the relation under the name `glucose_data` so later SQL statements
# can refer to it, and keep the returned relation for the following cells.
df_view = df_all.create_view('glucose_data')

# Per-column summary statistics. `bg_ts` is varchar at this point, so its
# min/max are string comparisons rather than the earliest/latest timestamps.
glucose_summary = df_view.describe()
glucose_summary

┌─────────┬──────────────────┬────────────────────┐
│  aggr   │      bg_ts       │       value        │
│ varchar │     varchar      │       double       │
├─────────┼──────────────────┼────────────────────┤
│ count   │ 356146           │           356146.0 │
│ mean    │ NULL             │  8.190111663007132 │
│ stddev  │ NULL             │ 3.1559837370187713 │
│ min     │ 01/01/2024 00:00 │                0.1 │
│ max     │ 31/12/2023 23:59 │               27.8 │
│ median  │ NULL             │                7.5 │
└─────────┴──────────────────┴────────────────────┘

In [5]:
# Display the relation returned by create_view; it shows the same columns and
# rows as df_all above.
df_view

┌──────────────────┬────────┬────────────────────────────────────────────────────────────────────────────────────────────┐
│      bg_ts       │ value  │                                          filename                                          │
│     varchar      │ double │                                          varchar                                           │
├──────────────────┼────────┼────────────────────────────────────────────────────────────────────────────────────────────┤
│ 01/10/2023 00:04 │    7.5 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:09 │    8.0 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:14 │    8.6 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:19 │    9.2 │ d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:

## Export to parquet

In [6]:
# Export the `glucose_data` view to Parquet with DuckDB's COPY statement.
# With PARTITION_BY, DuckDB writes the target as a directory of partitions
# (one per distinct `filename` value) rather than a single file.
parquet_path = str(root_dir / 'GlucoseData.parquet')

# COPY ... TO takes the destination as a SQL string literal, so double any
# single quotes in the path. Without this, a directory name containing "'"
# would end the literal early and break (or alter) the statement.
parquet_path_sql = parquet_path.replace("'", "''")

duckdb.sql(f"""
    COPY glucose_data TO '{parquet_path_sql}'
    (FORMAT PARQUET, COMPRESSION 'SNAPPY', PARTITION_BY (filename), OVERWRITE_OR_IGNORE TRUE)
""")
print(f"Data exported to {parquet_path}")

Data exported to d:\RobbieDocuments\Data\notebooks\..\_raw_data\sharpicManU\GlucoseData.parquet


## Export to `postgres`

In [13]:
from getpass import getpass
from urllib.parse import quote

# Prompt for the password interactively so it is never stored in the notebook.
pg_password = getpass("Enter PostgreSQL password: ")

# PostgreSQL connection string.
# Percent-encode the password before embedding it in the URL: characters such
# as '@', ':', '/', '%' or '#' would otherwise be read as URL delimiters and
# corrupt the credentials/host portion. safe='' encodes every reserved
# character (including '/' and '+'), so the value round-trips exactly.
# NOTE: pg_conn_str contains the password; do not print or display it.
pg_conn_str = f"postgresql://postgres:{quote(pg_password, safe='')}@192.168.7.221:5432/pfun"

In [16]:
# Copy the DuckDB view into PostgreSQL by way of pandas.
import pandas as pd

# Materialise the view as an in-memory pandas DataFrame.
df = df_view.to_df()

# Write the frame to the `glucose_data` table, replacing any existing table of
# that name and omitting the DataFrame index. The call is the cell's last
# expression, so its return value is what the notebook displays.
df.to_sql(
    name='glucose_data',
    con=pg_conn_str,
    if_exists='replace',
    index=False,
)

146

### Check that the data exists in postgres

In [17]:
# Read a handful of rows back from PostgreSQL to confirm the table was written.
df_smol = pd.read_sql_query(
    sql="SELECT * FROM glucose_data LIMIT 5;",
    con=pg_conn_str,
)

# Last expression of the cell: render the sample.
df_smol

Unnamed: 0,bg_ts,value,filename
0,01/10/2023 00:04,7.5,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
1,01/10/2023 00:09,8.0,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
2,01/10/2023 00:14,8.6,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
3,01/10/2023 00:19,9.2,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
4,01/10/2023 00:24,9.7,d:\RobbieDocuments\Data\notebooks\..\_raw_data...
