# `load-data.ipynb`

## Load data from CSVs

In [2]:
from pathlib import Path
import duckdb

# set the root directory where the CSV files are located
cwd = Path().cwd()
root_dir = cwd / '_raw_data' / 'sharpicManU'

raw_data_dir = root_dir / 'Glucose Data'

# load data from csvs:
csv_paths = str(raw_data_dir / '*.csv')
df_all = duckdb.from_csv_auto(csv_paths, filename=True)

### Show full dataset

In [3]:
df_all

┌──────────────────┬────────┬───────────────────────────────────────────────────────────────────────────────┐
│      bg_ts       │ value  │                                   filename                                    │
│     varchar      │ double │                                    varchar                                    │
├──────────────────┼────────┼───────────────────────────────────────────────────────────────────────────────┤
│ 01/10/2023 00:04 │    7.5 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:09 │    8.0 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:14 │    8.6 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:19 │    9.2 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:24 │    9.7 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/20

In [8]:
df_view = df_all.create_view('glucose_data')
df_view.describe()

┌─────────┬──────────────────┬───────────────────┐
│  aggr   │      bg_ts       │       value       │
│ varchar │     varchar      │      double       │
├─────────┼──────────────────┼───────────────────┤
│ count   │ 356146           │          356146.0 │
│ mean    │ NULL             │ 8.190111663007146 │
│ stddev  │ NULL             │ 3.155983737018771 │
│ min     │ 01/01/2024 00:00 │               0.1 │
│ max     │ 31/12/2023 23:59 │              27.8 │
│ median  │ NULL             │               7.5 │
└─────────┴──────────────────┴───────────────────┘

In [11]:
df_view

┌──────────────────┬────────┬───────────────────────────────────────────────────────────────────────────────┐
│      bg_ts       │ value  │                                   filename                                    │
│     varchar      │ double │                                    varchar                                    │
├──────────────────┼────────┼───────────────────────────────────────────────────────────────────────────────┤
│ 01/10/2023 00:04 │    7.5 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:09 │    8.0 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:14 │    8.6 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:19 │    9.2 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/2023 00:24 │    9.7 │ d:\RobbieDocuments\Data\_raw_data\sharpicManU\Glucose Data\UoMGlucose2301.csv │
│ 01/10/20

## Export to parquet

In [14]:
# Use DuckDB's COPY command for efficient Parquet export
parquet_path = str(root_dir / 'GlucoseData.parquet')
duckdb.sql(f"""
    COPY glucose_data TO '{parquet_path}'
    (FORMAT PARQUET, COMPRESSION 'SNAPPY', PARTITION_BY (filename), OVERWRITE_OR_IGNORE TRUE)
""")