# Stage 05 — Data Storage (CSV + Parquet, env-driven paths)


Imports & env paths

In [1]:
from __future__ import annotations
import os, sys, glob
from pathlib import Path
from datetime import datetime

import pandas as pd
from dotenv import load_dotenv

# Locate the project root: step one level up when the kernel was started inside a
# directory named "notebooks", otherwise use the working directory as-is.
project_root = Path.cwd().resolve()
if Path.cwd().name == "notebooks":
    project_root = project_root.parents[0]

# Make the project's src/ folder importable so the local `storage` module resolves.
sys.path.append(str(project_root / "src"))

# Read environment variables from the project-level .env file.
load_dotenv(dotenv_path=project_root / ".env")

# Must come after the sys.path tweak above.
from storage import env_paths, write_df, read_df

# env_paths returns the (raw, processed) data directories for this project.
RAW_DIR, PROC_DIR = env_paths(project_root)
print("RAW_DIR:", RAW_DIR)
print("PROC_DIR:", PROC_DIR)

def stamp(now: datetime | None = None, fmt: str = "%Y%m%d-%H%M") -> str:
    """Return a timestamp string for embedding in file names.

    Parameters
    ----------
    now : datetime, optional
        Moment to format. When omitted, the current local time
        (``datetime.now()``) is used.
    fmt : str, optional
        ``strftime`` pattern. The default has minute resolution
        (e.g. ``"20250816-0041"``), so two calls within the same minute
        return the same string; pass a pattern containing ``%S`` when
        finer resolution is needed.

    Returns
    -------
    str
        ``now`` rendered with ``fmt``.
    """
    moment = datetime.now() if now is None else now
    return moment.strftime(fmt)


RAW_DIR: C:\Users\melin\OneDrive\Desktop\nyu\python\bootcamp_panagiotis_housos\project\data\raw
PROC_DIR: C:\Users\melin\OneDrive\Desktop\nyu\python\bootcamp_panagiotis_housos\project\data\processed


Pick a DataFrame to save

In [2]:
# Try to load the latest API CSV saved in Stage 04.
# A plain name sort compares the source prefix first (api_<source>_...), so once
# more than one source has been saved the last entry is not necessarily the newest
# file. Order by the trailing timestamp token instead (e.g. "20250816-0017"),
# using the file name only to break ties.
# NOTE(review): assumes the stamp is the last "_"-separated token of the file
# stem, as in api_yfinance_ASML_20250816-0017.csv — confirm against Stage 04 naming.
candidates = sorted(
    RAW_DIR.glob("api_*ASML_*.csv"),
    key=lambda p: (p.stem.rsplit("_", 1)[-1], p.name),
)
if candidates:
    latest_raw = candidates[-1]
    print("Using existing raw:", latest_raw.name)
    # CSV stores dates as text; parse them back so 'date' is a datetime column.
    df = pd.read_csv(latest_raw, parse_dates=["date"])
else:
    # Fallback sample (if no API file found) with the same column layout.
    print("No ASML raw CSV found; using a small sample DF")
    df = pd.DataFrame({
        "date": pd.date_range("2024-01-01", periods=5, freq="D"),
        "open": [1,2,3,4,5],
        "high": [1.1,2.2,3.3,4.4,5.5],
        "low":  [0.9,1.8,2.7,3.6,4.5],
        "close":[1.0,2.1,3.2,4.3,5.4],
        "adjusted_close":[1.0,2.1,3.2,4.3,5.4],
        "volume":[100,120,110,90,80],
    })
print(df.dtypes)
df.head()


Using existing raw: api_yfinance_ASML_20250816-0017.csv
date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
adjusted_close           float64
volume                     int64
dtype: object


Unnamed: 0,date,open,high,low,close,adjusted_close,volume
0,2020-08-17,376.660004,380.5,376.25,378.51001,364.91925,428200
1,2020-08-18,383.609985,383.980011,376.98999,377.220001,363.675568,466500
2,2020-08-19,380.390015,380.429993,373.959991,374.940002,361.477417,354400
3,2020-08-20,369.540009,373.380005,368.380005,372.170013,358.806976,498200
4,2020-08-21,366.149994,372.130005,365.540009,372.119995,358.758667,605400


Save to CSV (raw) and Parquet (processed)

In [3]:
# Take the timestamp once so both artifacts share the same suffix. Calling
# stamp() separately for each path can straddle a minute boundary and produce
# a CSV and a Parquet file whose names no longer match.
run_stamp = stamp()
csv_path = RAW_DIR / f"sample_{run_stamp}.csv"
parq_path = PROC_DIR / f"sample_{run_stamp}.parquet"

# Save the same frame twice: CSV under the raw dir, Parquet under the processed dir.
# NOTE(review): write_df presumably picks the format from the file suffix — see src/storage.
write_df(df, csv_path)
write_df(df, parq_path)

csv_path, parq_path


(WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/raw/sample_20250816-0041.csv'),
 WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/processed/sample_20250816-0041.parquet'))

Reload both & validate

In [4]:
# Round-trip read of both artifacts. 'date' is passed to parse_dates for the CSV
# so the column comes back as a datetime rather than text.
df_csv = read_df(csv_path, parse_dates=["date"])
df_parq = read_df(parq_path)

# Shapes in the order: original, CSV reload, Parquet reload.
print("shapes:", *(frame.shape for frame in (df, df_csv, df_parq)))

# Column dtypes of each reloaded frame, for a visual comparison.
print("dtypes (CSV):\n", df_csv.dtypes)
print("dtypes (PARQ):\n", df_parq.dtypes)

# Simple validation summary
def validate_reload(original: pd.DataFrame, re_csv: pd.DataFrame, re_parq: pd.DataFrame, date_col: str="date"):
    """Compare two reloaded frames against the frame they were written from.

    Parameters
    ----------
    original : pd.DataFrame
        Frame that was written to disk.
    re_csv, re_parq : pd.DataFrame
        Frames read back from the CSV and Parquet files respectively.
    date_col : str
        Name of the column expected to hold datetimes.

    Returns
    -------
    dict
        Six entries, in this order:
        ``shape_match_csv`` / ``shape_match_parq`` — reloaded shape equals the original's;
        ``date_is_datetime_csv`` / ``date_is_datetime_parq`` — dtype name of ``date_col``
        starts with "datetime", or ``None`` when the column is absent;
        ``numeric_cols_csv_ok`` / ``numeric_cols_parq_ok`` — every numeric column of
        ``original`` that is present in the reloaded frame is still numeric
        (columns missing from the reloaded frame are skipped).
    """
    def _date_flag(frame):
        # None signals "column not present" rather than a failed check.
        if date_col not in frame.columns:
            return None
        return str(frame[date_col].dtype).startswith("datetime")

    numeric_names = list(original.select_dtypes("number").columns)

    def _numeric_flag(frame):
        for name in numeric_names:
            if name not in frame.columns:
                continue
            if not pd.api.types.is_numeric_dtype(frame[name]):
                return False
        return True

    return {
        "shape_match_csv": original.shape == re_csv.shape,
        "shape_match_parq": original.shape == re_parq.shape,
        "date_is_datetime_csv": _date_flag(re_csv),
        "date_is_datetime_parq": _date_flag(re_parq),
        "numeric_cols_csv_ok": _numeric_flag(re_csv),
        "numeric_cols_parq_ok": _numeric_flag(re_parq),
    }

# Run the round-trip checks on the original frame and both reloaded copies.
validation = validate_reload(df, df_csv, df_parq)

# Stop a Restart & Run All if any check explicitly failed; a None entry means the
# check did not apply (column absent) and is not treated as a failure.
failed_checks = [name for name, ok in validation.items() if ok is False]
assert not failed_checks, f"Reload validation failed: {failed_checks}"

validation


shapes: (1256, 7) (1256, 7) (1256, 7)
dtypes (CSV):
 date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
adjusted_close           float64
volume                     int64
dtype: object
dtypes (PARQ):
 date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
adjusted_close           float64
volume                     int64
dtype: object


{'shape_match_csv': True,
 'shape_match_parq': True,
 'date_is_datetime_csv': True,
 'date_is_datetime_parq': True,
 'numeric_cols_csv_ok': True,
 'numeric_cols_parq_ok': True}