# Dataset exploration
This notebook runs the project pipeline steps in order:
1. fetch raw data
2. extract structure
3. compute features
4. build dataset. 

It shows mocked process of dataset extraction.

In [3]:
import pandas as pd

from config_recommendation_ml.data.build_dataset import build_dataset
from config_recommendation_ml.data.compute_features import compute_features
from config_recommendation_ml.data.extract_structure import extract_structure
from config_recommendation_ml.data.fetch_raw import fetch_raw
from config_recommendation_ml.utils.config import load_config
from config_recommendation_ml.utils.paths import DATA_DIR, PROJECT_ROOT

print("[Notebook] Project root:", PROJECT_ROOT)
print("[Notebook] Data directory:", DATA_DIR)

data_cfg = load_config("data")
seeds_cfg = load_config("seeds")

print("[Notebook] Configs loaded:")
print(data_cfg)
print(seeds_cfg)

print("[Notebook] Fetching raw data...")
fetch_raw()

print("[Notebook] Extracting structure...")
extract_structure()

print("[Notebook] Computing features...")
compute_features()

print("[Notebook] Building dataset...")
build_dataset()

print("[Notebook] Pipeline completed (or attempted).")

[Notebook] Project root: /workspaces/config-recommendation-ml
[Notebook] Data directory: /workspaces/config-recommendation-ml/data
[Notebook] Configs loaded:
{'dataset_filename': 'dataset.csv', 'overwrite': False}
{'data_seed': 90, 'sampling_seed': 90, 'training_seed': 90, 'evaluation_seed': 90, 'deterministic': {'force_hash_sorting': True, 'enforce_timestamp_free_ops': True}}
[Notebook] Fetching raw data...
[fetch_raw] Saved mock raw metadata to /workspaces/config-recommendation-ml/data/raw/raw_metadata.json
[Notebook] Extracting structure...
[extract_structure] Saved mock structure to /workspaces/config-recommendation-ml/data/interim/structure.json
[Notebook] Computing features...
[compute_features] Saved mock features to /workspaces/config-recommendation-ml/data/interim/features.json
[Notebook] Building dataset...
[build_dataset] Mock dataset saved to /workspaces/config-recommendation-ml/data/processed/dataset.csv
[Notebook] Pipeline completed (or attempted).


In [4]:
dataset_file = DATA_DIR / data_cfg.get("dataset_filename", "dataset.csv")
dataset_file.parent.mkdir(parents=True, exist_ok=True)

if not dataset_file.exists():
    print(
        "[Notebook] Dataset not found after pipeline. Check previous cells for errors."
    )
else:
    print(f"[Notebook] Dataset exists at {dataset_file}")

    df = pd.read_csv(dataset_file)
    display(df.head())
    print(df.describe())

    if "stars" in df.columns:
        df["stars_scaled"] = df["stars"] / df["stars"].max()
        df.hist(column="stars_scaled")

[Notebook] Dataset not found after pipeline. Check previous cells for errors.
