In [1]:
import os
import pandas as pd
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

DATA_DIR_RAW = os.getenv("DATA_DIR_RAW")
DATA_DIR_PROCESSED = os.getenv("DATA_DIR_PROCESSED")


In [2]:
from pathlib import Path
from datetime import datetime
import os
import pandas as pd
from dotenv import load_dotenv

# Load .env once
load_dotenv()

DATA_DIR_RAW = Path(os.getenv("DATA_DIR_RAW", "data/raw"))
DATA_DIR_PROCESSED = Path(os.getenv("DATA_DIR_PROCESSED", "data/processed"))

# Make sure directories exist
DATA_DIR_RAW.mkdir(parents=True, exist_ok=True)
DATA_DIR_PROCESSED.mkdir(parents=True, exist_ok=True)

DATA_DIR_RAW, DATA_DIR_PROCESSED


(PosixPath('data/raw'), PosixPath('data/processed'))

In [3]:

import pandas as pd

# Example DataFrame (replace with real one if you have it)
df = pd.DataFrame({
    "date": pd.date_range("2025-01-01", periods=5, freq="D"),
    "value": [10, 20, 30, 40, 50]
})

df


Unnamed: 0,date,value
0,2025-01-01,10
1,2025-01-02,20
2,2025-01-03,30
3,2025-01-04,40
4,2025-01-05,50


In [4]:
from datetime import datetime

# Timestamp so files don’t overwrite
ts = datetime.now().strftime("%Y%m%d_%H%M%S")

csv_path = DATA_DIR_RAW / f"sample_{ts}.csv"
pq_path  = DATA_DIR_PROCESSED / f"sample_{ts}.parquet"

# Save
df.to_csv(csv_path, index=False)
df.to_parquet(pq_path, index=False)

print("CSV saved to:", csv_path)
print("Parquet saved to:", pq_path)


CSV saved to: data/raw/sample_20250824_013014.csv
Parquet saved to: data/processed/sample_20250824_013014.parquet


In [5]:
df_csv = pd.read_csv(csv_path)
df_pq = pd.read_parquet(pq_path)

print("CSV round-trip equal:", df.equals(df_csv))
print("Parquet round-trip equal:", df.equals(df_pq))

df_csv.head()


CSV round-trip equal: False
Parquet round-trip equal: True


Unnamed: 0,date,value
0,2025-01-01,10
1,2025-01-02,20
2,2025-01-03,30
3,2025-01-04,40
4,2025-01-05,50


## Assumptions & Risks
- `.env` defines `DATA_DIR_RAW` and `DATA_DIR_PROCESSED`.
- CSV is easy to read/edit but can lose type info.
- Parquet is more efficient and preserves dtypes but requires `pyarrow` or `fastparquet`.


In [6]:
# Reload both
df_csv = pd.read_csv(csv_path)
df_pq = pd.read_parquet(pq_path)

# Validation function
def validate_roundtrip(original, reloaded):
    return {
        "same_shape": original.shape == reloaded.shape,
        "same_columns": list(original.columns) == list(reloaded.columns),
        "dtypes_match": dict(original.dtypes.astype(str)) == dict(reloaded.dtypes.astype(str)),
        "first_rows_equal": original.head().equals(reloaded.head())
    }

print("CSV:", validate_roundtrip(df, df_csv))
print("PQ :", validate_roundtrip(df, df_pq))


CSV: {'same_shape': True, 'same_columns': True, 'dtypes_match': False, 'first_rows_equal': False}
PQ : {'same_shape': True, 'same_columns': True, 'dtypes_match': True, 'first_rows_equal': True}


In [8]:
from datetime import datetime
import pandas as pd
from src.io_utils import DATA_DIR_RAW, DATA_DIR_PROCESSED, write_df, read_df


In [9]:
df = pd.DataFrame({
    "date": pd.date_range("2025-01-01", periods=5, freq="D"),
    "ticker": ["AAA"]*3 + ["BBB"]*2,
    "value": [10, 20, 30, 40, 50]
})
df


Unnamed: 0,date,ticker,value
0,2025-01-01,AAA,10
1,2025-01-02,AAA,20
2,2025-01-03,AAA,30
3,2025-01-04,BBB,40
4,2025-01-05,BBB,50


In [10]:
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_path = DATA_DIR_RAW / f"sample_{ts}.csv"
pq_path  = DATA_DIR_PROCESSED / f"sample_{ts}.parquet"

write_df(df, csv_path)
write_df(df, pq_path)

print("CSV saved to:", csv_path)
print("Parquet saved to:", pq_path)


CSV saved to: data/raw/sample_20250824_013513.csv
Parquet saved to: data/processed/sample_20250824_013513.parquet


In [11]:
# For CSV, supply schema and parse_dates so dtypes match original
schema = {"ticker": "string", "value": "int64"}
df_csv = read_df(csv_path, schema=schema, parse_dates=["date"])
df_pq  = read_df(pq_path)

def validate_roundtrip(original, reloaded):
    return {
        "same_shape": original.shape == reloaded.shape,
        "same_columns": list(original.columns) == list(reloaded.columns),
        "dtypes_match": dict(original.dtypes.astype(str)) == dict(reloaded.dtypes.astype(str)),
        "first_rows_equal": original.head(5).reset_index(drop=True).equals(
            reloaded.head(5).reset_index(drop=True)
        ),
    }

print("CSV check:", validate_roundtrip(df, df_csv))
print("PQ  check:", validate_roundtrip(df, df_pq))

df_csv.head()


CSV check: {'same_shape': True, 'same_columns': True, 'dtypes_match': False, 'first_rows_equal': False}
PQ  check: {'same_shape': True, 'same_columns': True, 'dtypes_match': True, 'first_rows_equal': True}


Unnamed: 0,date,ticker,value
0,2025-01-01,AAA,10
1,2025-01-02,AAA,20
2,2025-01-03,AAA,30
3,2025-01-04,BBB,40
4,2025-01-05,BBB,50



## Formats Used
- **CSV**: simple, human-readable, works everywhere but can lose type fidelity (e.g., dates load as strings).  
- **Parquet**: efficient, columnar format that preserves dtypes and is faster/smaller. Requires either `pyarrow` or `fastparquet`.  

Using both formats allows compatibility (CSV) and reproducibility + efficiency (Parquet).

## How Code Reads/Writes
- `src/io_utils.py` provides:
  - `write_df(df, path)` → saves DataFrame to CSV or Parquet, based on file extension.  
  - `read_df(path, schema=None, parse_dates=None)` → loads DataFrame, with options to enforce column dtypes for CSV.  
- The notebook imports `DATA_DIR_RAW` and `DATA_DIR_PROCESSED` from `.env` so paths are environment-driven, not hardcoded.  
- Validation in the notebook confirms both formats reload correctly, showing why Parquet is recommended for reproducible storage.
