# Stage 05 — Data Storage Homework Notebook

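This notebook saves a sample DataFrame to CSV and Parquet using environment-driven paths, reloads both files, validates that the shape and critical column dtypes survive the round trip, and then repeats the same steps through reusable I/O utilities.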

In [None]:
# Env + setup
from pathlib import Path
import os
import pandas as pd

# Load .env if present. python-dotenv is an optional dependency, so a missing
# package is ignored. A failure while loading the file is reported instead of
# being silently discarded; the defaults below still apply in that case.
try:
    from dotenv import load_dotenv
except ImportError:
    pass
else:
    try:
        load_dotenv()
    except Exception as exc:  # best-effort: keep going with the defaults
        print("Warning: could not load .env file. Error:", exc)

# Environment-driven directories; the fallbacks are relative to the working directory
DATA_DIR_RAW = Path(os.getenv("DATA_DIR_RAW", "data/raw"))
DATA_DIR_PROCESSED = Path(os.getenv("DATA_DIR_PROCESSED", "data/processed"))

# Create both directories up front (including parents); existing ones are left alone
DATA_DIR_RAW.mkdir(parents=True, exist_ok=True)
DATA_DIR_PROCESSED.mkdir(parents=True, exist_ok=True)

# Show the resolved paths as the cell output
DATA_DIR_RAW, DATA_DIR_PROCESSED

In [None]:
# Sample DataFrame 
import numpy as np
import time

# Five-row demo frame; the numeric columns have their dtypes pinned explicitly.
df = pd.DataFrame(
    {
        "id": np.array([1, 2, 3, 4, 5], dtype=np.int64),
        "city": ["NYC", "Boston", "Miami", "Houston", "LA"],
        "value": np.asarray([10.5, 11.0, 9.75, 13.2, 12.6], dtype=np.float64),
    }
)

# Minute-resolution stamp shared by both output file names.
timestamp = time.strftime("%Y%m%d-%H%M")
csv_path = DATA_DIR_RAW.joinpath(f"sample_{timestamp}.csv")
pq_path = DATA_DIR_PROCESSED.joinpath(f"sample_{timestamp}.parquet")

# Preview the frame as the cell output.
df.head()

In [None]:
# 1) Save in two formats
df.to_csv(csv_path, index=False)

# Parquet is optional and best-effort: this cell never raises on a Parquet
# problem. A missing pyarrow install and a failed write are reported with
# different messages so the "install pyarrow" hint only appears when it applies.
try:
    import pyarrow  # noqa: F401
    df.to_parquet(pq_path, index=False)
except ImportError as e:
    print("Parquet write skipped — install pyarrow to enable Parquet writes. Error:", e)
except Exception as e:
    print("Parquet write failed. Error:", e)

# Show both target paths as the cell output
csv_path, pq_path

In [None]:
# 2) Reload and Validate
# Reload CSV (always available)
csv_df = pd.read_csv(csv_path)

# Reload Parquet if available. Any failure (pyarrow not installed, file never
# written, unreadable file) leaves pq_df as None so later steps can skip it,
# but the reason is printed rather than silently discarded.
try:
    import pyarrow  # noqa: F401
    pq_df = pd.read_parquet(pq_path)
except Exception as e:
    print("Parquet reload skipped. Error:", e)
    pq_df = None

# Expected dtype name for each column that must survive a save/reload round trip.
# NOTE(review): "object" assumes string columns come back as object dtype —
# confirm against the installed pandas version.
critical_dtypes = dict(id="int64", city="object", value="float64")

def validate_df(original: pd.DataFrame, reloaded: pd.DataFrame, critical_dtypes: dict) -> dict:
    """Compare a reloaded frame against the original by shape and column dtypes.

    Args:
        original: The frame that was written out.
        reloaded: The frame read back from storage.
        critical_dtypes: Mapping of column name to the expected dtype name,
            compared against ``str(reloaded[col].dtype)``.

    Returns:
        A dict with three keys: ``shape_match`` (whether both frames have the
        same shape), ``dtype_issues`` (column name -> description for every
        column that is missing or has an unexpected dtype), and ``passed``
        (True only when the shapes match and there are no dtype issues).
    """
    same_shape = original.shape == reloaded.shape

    problems = {}
    for name, want in critical_dtypes.items():
        if name in reloaded.columns:
            actual = str(reloaded[name].dtype)
            if actual != want:
                problems[name] = f"{actual} (expected {want})"
            continue
        # Column did not survive the round trip at all.
        problems[name] = f"missing (expected {want})"

    return {
        "shape_match": same_shape,
        "dtype_issues": problems,
        "passed": same_shape and not problems,
    }

# The CSV round trip is always checked; the Parquet one only if a frame was reloaded.
csv_validation = validate_df(df, csv_df, critical_dtypes)
if pq_df is None:
    pq_validation = {"skipped": True}
else:
    pq_validation = validate_df(df, pq_df, critical_dtypes)

# Show both results as the cell output.
csv_validation, pq_validation

In [None]:
# 3) Refactor to utilities
from pathlib import Path
import importlib.util

# Import our local utilities
import sys
# Project root: one level up when the notebook runs from notebooks/, otherwise the cwd
PROJECT_ROOT = Path.cwd().parent if (Path.cwd().name == "notebooks") else Path.cwd()
# Make src/ importable without stacking duplicate entries when this cell is re-run
_src_dir = str(PROJECT_ROOT / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)
from utils import write_df, read_df, validate_df as validate_utils

# Separate file names so the utility-based outputs do not overwrite the earlier ones
csv2_path = DATA_DIR_RAW / f"sample_util_{timestamp}.csv"
pq2_path = DATA_DIR_PROCESSED / f"sample_util_{timestamp}.parquet"

# Write using utilities
_ = write_df(df, csv2_path)
# NOTE(review): presumably write_df raises RuntimeError when Parquet support is
# unavailable — confirm against src/utils. The message is printed and the run continues.
try:
    _ = write_df(df, pq2_path)
except RuntimeError as e:
    print(e)

# Read using utilities
csv2_df = read_df(csv2_path)
# A failed Parquet read leaves pq2_df as None so validation can be skipped,
# but the reason is printed rather than silently discarded.
try:
    pq2_df = read_df(pq2_path)
except Exception as e:
    print("Parquet reload via utilities skipped. Error:", e)
    pq2_df = None

# Validate
csv2_validation = validate_utils(df, csv2_df, critical_dtypes)
pq2_validation = validate_utils(df, pq2_df, critical_dtypes) if pq2_df is not None else {"skipped": True}

# Show both results as the cell output
csv2_validation, pq2_validation

In [None]:
# 4) Clear output summary
# A Parquet path is reported only when the matching reload produced a frame;
# otherwise the entry reads "skipped".
summary = dict(
    csv_saved=str(csv_path),
    parquet_saved="skipped" if pq_df is None else str(pq_path),
    csv_validation=csv_validation,
    parquet_validation=pq_validation,
    csv2_saved=str(csv2_path),
    parquet2_saved="skipped" if pq2_df is None else str(pq2_path),
    csv2_validation=csv2_validation,
    parquet2_validation=pq2_validation,
)
summary