# Stage 06 — Data Preprocessing (ASML)
Load raw → clean (fill/drop/normalize) → compare → save cleaned dataset to `project/data/processed/`.


Imports, paths, env

In [1]:
from __future__ import annotations
import sys
from pathlib import Path
from datetime import datetime

import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Resolve project root when running from project/notebooks
project_root = Path.cwd().resolve().parents[0] if Path.cwd().name == "notebooks" else Path.cwd().resolve()
sys.path.append(str(project_root / "src"))

load_dotenv(project_root / ".env")

from storage import env_paths, write_df, read_df
from cleaning import fill_missing_median, drop_missing, normalize_data

RAW_DIR, PROC_DIR = env_paths(project_root)
RAW_DIR, PROC_DIR


(WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/raw'),
 WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/processed'))

Load the latest raw ASML CSV (from Stage 04)

In [2]:
candidates = sorted(RAW_DIR.glob("api_*ASML_*.csv"))
if not candidates:
    raise FileNotFoundError("No ASML raw CSV found in project/data/raw/. Run Stage 04 first.")
raw_path = candidates[-1]
print("Using raw file:", raw_path.name)

df_raw = pd.read_csv(raw_path, parse_dates=["date"])
print(df_raw.shape)
df_raw.head()


Using raw file: api_yfinance_ASML_20250816-0017.csv
(1256, 7)


Unnamed: 0,date,open,high,low,close,adjusted_close,volume
0,2020-08-17,376.660004,380.5,376.25,378.51001,364.91925,428200
1,2020-08-18,383.609985,383.980011,376.98999,377.220001,363.675568,466500
2,2020-08-19,380.390015,380.429993,373.959991,374.940002,361.477417,354400
3,2020-08-20,369.540009,373.380005,368.380005,372.170013,358.806976,498200
4,2020-08-21,366.149994,372.130005,365.540009,372.119995,358.758667,605400


Quick inspection (missingness, dtypes)

In [3]:
print("Dtypes:\n", df_raw.dtypes)
print("\nMissing values per column:\n", df_raw.isna().sum())


Dtypes:
 date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
adjusted_close           float64
volume                     int64
dtype: object

Missing values per column:
 date              0
open              0
high              0
low               0
close             0
adjusted_close    0
volume            0
dtype: int64


Cleaning step 1: drop rows missing critical keys (e.g., date, close)

In [4]:
critical = ["date", "close"]
df1 = drop_missing(df_raw, cols=critical)
print("After drop_missing on critical:", df1.shape)


After drop_missing on critical: (1256, 7)


Cleaning step 2: fill remaining numeric gaps with median

In [5]:
df2, medians = fill_missing_median(df1)  # defaults to all numeric cols
print("After fill_missing_median:", df2.shape)
print("Medians used:", medians)
print("Remaining missing values:\n", df2.isna().sum())

After fill_missing_median: (1256, 7)
Medians used: {'open': 676.375, 'high': 683.5450134277344, 'low': 669.6549987792969, 'close': 678.2850036621094, 'adjusted_close': 666.6448364257812, 'volume': 989600.0}
Remaining missing values:
 date              0
open              0
high              0
low               0
close             0
adjusted_close    0
volume            0
dtype: int64


Cleaning step 3: normalize numeric columns (minimax)

In [7]:
df3, zstats = normalize_data(df2, method="minimax")  # defaults to all numeric cols
print("Normalization stats (first few):", dict(list(zstats.items())[:3]))
df3.head()

Normalization stats (first few): {}


Unnamed: 0,date,open,high,low,close,adjusted_close,volume
0,2020-08-17,376.660004,380.5,376.25,378.51001,364.91925,428200
1,2020-08-18,383.609985,383.980011,376.98999,377.220001,363.675568,466500
2,2020-08-19,380.390015,380.429993,373.959991,374.940002,361.477417,354400
3,2020-08-20,369.540009,373.380005,368.380005,372.170013,358.806976,498200
4,2020-08-21,366.149994,372.130005,365.540009,372.119995,358.758667,605400


Compare original vs cleaned

In [8]:
def quick_compare(orig: pd.DataFrame, clean: pd.DataFrame):
    print("Shapes:", orig.shape, "→", clean.shape)
    orig_na = orig.isna().sum()
    clean_na = clean.isna().sum()
    comp = pd.DataFrame({"orig_na": orig_na, "clean_na": clean_na})
    return comp

compare_na = quick_compare(df_raw, df3)
compare_na

Shapes: (1256, 7) → (1256, 7)


Unnamed: 0,orig_na,clean_na
date,0,0
open,0,0
high,0,0
low,0,0
close,0,0
adjusted_close,0,0
volume,0,0


Save the cleaned dataset into project/data/processed/

In [9]:
PROC_DIR.mkdir(parents=True, exist_ok=True)

stamp = datetime.now().strftime("%Y%m%d-%H%M")
clean_csv  = PROC_DIR / f"asml_cleaned_{stamp}.csv"
clean_parq = PROC_DIR / f"asml_cleaned_{stamp}.parquet"

write_df(df3, clean_csv)
write_df(df3, clean_parq)

clean_csv, clean_parq


(WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/processed/asml_cleaned_20250818-2218.csv'),
 WindowsPath('C:/Users/melin/OneDrive/Desktop/nyu/python/bootcamp_panagiotis_housos/project/data/processed/asml_cleaned_20250818-2218.parquet'))

Reload & validate

In [10]:
df_csv = read_df(clean_csv, parse_dates=["date"])
df_parq = read_df(clean_parq)

print("Reloaded shapes:", df_csv.shape, df_parq.shape)
print("CSV dtypes:\n", df_csv.dtypes)
print("PARQ dtypes:\n", df_parq.dtypes)

assert df_csv.shape == df3.shape, "CSV shape mismatch"
assert df_parq.shape == df3.shape, "Parquet shape mismatch"
print("Validation OK.")

Reloaded shapes: (1256, 7) (1256, 7)
CSV dtypes:
 date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
adjusted_close           float64
volume                     int64
dtype: object
PARQ dtypes:
 date              datetime64[ns]
open                     float64
high                     float64
low                      float64
close                    float64
adjusted_close           float64
volume                     int64
dtype: object
Validation OK.


In [None]:
Assumptions & notes

In [12]:
## Assumptions & Cleaning Notes
- **Critical columns:** `date` and `close` are required; rows missing either are dropped.
- **Fill strategy:** Remaining numeric columns are filled with **median** (robust to outliers).
- **Scaling:** Applied **minimax normalization** to numeric columns for modeling comparability; store stats for reproducibility.
- **Non-numeric columns:** Left unchanged.
- **Reproducibility:** All paths are environment-driven; outputs saved under `project/data/processed/` with timestamps.
- **Risks:** Over-normalization can hinder interpretability; medians may shift over time—consider refit cadence.


SyntaxError: invalid character '—' (U+2014) (3578465075.py, line 7)