## imports + env paths

In [9]:
# make repo-root modules importable from inside /notebooks
from pathlib import Path
import sys

# Resolve the working directory once; step up one level when the kernel was
# started inside the "notebooks" folder, otherwise treat it as the repo root.
_cwd = Path.cwd()
ROOT = _cwd if _cwd.name != "notebooks" else _cwd.parent

# Put the repo root at the front of sys.path (only once) so it is searched first.
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

print("Repo root:", ROOT)  # sanity check


Repo root: /Users/rajpawar/bootcamp_Rajvardhan_Pawar


In [11]:
from pathlib import Path
import os, pandas as pd, numpy as np
from datetime import datetime

# Tiny env helper (works with your earlier pattern)
from src.config import load_env, get
load_env()

# Data directories come from the environment when set, otherwise fall back to
# the relative defaults below.
RAW_DIR = Path(get("DATA_DIR_RAW") or "data/raw")
PROC_DIR = Path(get("DATA_DIR_PROCESSED") or "data/processed")

# Create both directories (and any missing parents); no error if they exist.
for _data_dir in (RAW_DIR, PROC_DIR):
    _data_dir.mkdir(parents=True, exist_ok=True)

# Minute-resolution timestamp used as a suffix for files written by this run.
TS = f"{datetime.now():%Y%m%d-%H%M}"
RAW_DIR, PROC_DIR, TS


(PosixPath('data/raw'), PosixPath('data/processed'), '20250818-1609')

In [13]:
# Choose the AAPL extract deterministically. Directory globbing yields matches
# in filesystem order, so taking "the first one" can load a different file from
# run to run when several AAPL*.csv exist. max() picks the lexicographically
# greatest path instead (presumably the newest if names carry a sortable
# timestamp suffix -- TODO confirm the naming convention).
aapl_csv = max(RAW_DIR.glob("AAPL*.csv"), default=None)
if aapl_csv is not None and aapl_csv.exists():
    df = pd.read_csv(aapl_csv, parse_dates=["date"], low_memory=False)
else:
    # Small synthetic demo frame used when no AAPL file is present.
    df = pd.DataFrame({
        "date": pd.date_range("2024-01-01", periods=5, freq="D"),
        "value": np.arange(5, dtype=float),
        "flag": ["A","B","A","B","A"]
    })
df.head(), df.dtypes


(   level_0  index       date       close        high         low        open  \
 0        0      0 2024-08-19  224.843582  224.943125  222.006778  224.674371   
 1        1      1 2024-08-20  225.460693  226.117640  224.405606  224.724131   
 2        2      2 2024-08-21  225.351212  226.923894  224.007474  225.470666   
 3        3      3 2024-08-22  223.489868  227.282216  222.862782  226.734761   
 4        4      4 2024-08-23  225.789154  227.162766  223.290787  224.614628   
 
      volume  
 0  40687800  
 1  30299000  
 2  34765500  
 3  43695300  
 4  38677300  ,
 level_0             int64
 index               int64
 date       datetime64[ns]
 close             float64
 high              float64
 low               float64
 open              float64
 volume              int64
 dtype: object)

In [15]:
# Timestamped targets: CSV under the raw directory, Parquet under processed.
csv_path = RAW_DIR.joinpath(f"sample_{TS}.csv")
pq_path = PROC_DIR.joinpath(f"sample_{TS}.parquet")

# Write the same frame in both formats, leaving the index out of each file.
df.to_csv(csv_path, index=False)
df.to_parquet(pq_path, index=False)  # uses pyarrow if installed

csv_path, pq_path


(PosixPath('data/raw/sample_20250818-1609.csv'),
 PosixPath('data/processed/sample_20250818-1609.parquet'))

In [17]:
# Read the CSV back, asking pandas to parse the "date" column explicitly.
df_csv = pd.read_csv(
    csv_path,
    parse_dates=["date"],
    low_memory=False,
)

# The Parquet copy is loaded as-is, with no extra read options.
df_pq = pd.read_parquet(pq_path)

def validate_reload(original: pd.DataFrame, csv_df: pd.DataFrame, pq_df: pd.DataFrame, critical=None):
    """Compare two reloaded frames against the frame they were written from.

    Parameters
    ----------
    original : pd.DataFrame
        The frame that was written to disk.
    csv_df : pd.DataFrame
        The frame read back from the CSV copy.
    pq_df : pd.DataFrame
        The frame read back from the Parquet copy.
    critical : iterable of column labels, optional
        Columns whose dtypes must match. Defaults to the first three columns
        of ``original``.

    Returns
    -------
    dict
        ``shape_csv`` / ``shape_pq``: bool, whether each reloaded frame has the
        same shape as ``original``.
        ``dtypes_csv`` / ``dtypes_pq``: dict mapping every critical column to a
        bool that is True only when the column exists in the reloaded frame
        and has the same dtype as in ``original``.

    Raises
    ------
    KeyError
        If a requested critical column is not present in ``original``.
    """
    if critical is None:
        critical = list(original.columns[:3])  # first 3 as example
    else:
        critical = list(critical)

    def _dtype_report(reloaded: pd.DataFrame) -> dict:
        # A column missing after reload is reported as a failed check (False)
        # instead of being left out, so a dropped or renamed column cannot
        # make the report look clean.
        return {
            col: (col in reloaded.columns
                  and bool(original[col].dtype == reloaded[col].dtype))
            for col in critical
        }

    return {
        "shape_csv": original.shape == csv_df.shape,
        "shape_pq": original.shape == pq_df.shape,
        "dtypes_csv": _dtype_report(csv_df),
        "dtypes_pq": _dtype_report(pq_df),
    }

# Report shape and dtype agreement for the direct pandas round-trip.
validate_reload(original=df, csv_df=df_csv, pq_df=df_pq)


{'shape_csv': True,
 'shape_pq': True,
 'dtypes_csv': {'level_0': True, 'index': True, 'date': True},
 'dtypes_pq': {'level_0': True, 'index': True, 'date': True}}

In [20]:
from src.io_utils import write_df, read_df

# Same round-trip as before, but routed through the project's I/O helpers.
csv2 = RAW_DIR.joinpath(f"sample_util_{TS}.csv")
pq2 = PROC_DIR.joinpath(f"sample_util_{TS}.parquet")

# Write the frame to both targets (CSV first, then Parquet).
for _target in (csv2, pq2):
    write_df(df, _target)

# Read both files back in the same order and run the reload checks.
# NOTE(review): the recorded output reports the "date" dtype as not preserved
# for the CSV path read via read_df -- confirm whether read_df is expected to
# parse dates.
df_csv2, df_pq2 = (read_df(_source) for _source in (csv2, pq2))
validate_reload(df, df_csv2, df_pq2)


{'shape_csv': True,
 'shape_pq': True,
 'dtypes_csv': {'level_0': True, 'index': True, 'date': False},
 'dtypes_pq': {'level_0': True, 'index': True, 'date': True}}

In [22]:
from src.config import load_env, get
load_env()
# Re-resolve the data directories with the same fallbacks used when they were
# first set up in this notebook. Path(None) raises TypeError, so without the
# defaults this cell would crash when a variable is unset (assuming get()
# returns None or an empty value in that case -- TODO confirm).
RAW_DIR  = Path(get("DATA_DIR_RAW") or "data/raw")
PROC_DIR = Path(get("DATA_DIR_PROCESSED") or "data/processed")
