# Team EDA Baseline (Full Data + R2 Download)

This notebook starts implementation for shared team EDA with the full Stage-1 dataset.

Scope:
- Validate `scripts/r2.env`
- Run `scripts/download_from_r2.sh`
- Verify downloaded files against manifest
- Load sample parquet partitions for sanity checks before full EDA

## 1) Load Project Dependencies and Paths

In [None]:
from pathlib import Path
import os
import re
import json
import subprocess
from typing import Dict, List, Tuple

import pandas as pd

NOTEBOOK_DIR = Path.cwd()
if NOTEBOOK_DIR.name != "Project_Information":
    for candidate in [Path.cwd(), *Path.cwd().parents]:
        if (candidate / "scripts").exists() and (candidate / "datasets").exists():
            NOTEBOOK_DIR = candidate / "Project_Information" if (candidate / "Project_Information").exists() else candidate
            break

REPO_ROOT = NOTEBOOK_DIR.parent if NOTEBOOK_DIR.name == "Project_Information" else NOTEBOOK_DIR
SCRIPTS_DIR = REPO_ROOT / "scripts"
DEFAULT_DATASET_VERSION = "v1"
DEFAULT_DOWNLOAD_ROOT = REPO_ROOT / "datasets" / DEFAULT_DATASET_VERSION
R2_ENV_PATH = SCRIPTS_DIR / "r2.env"
R2_ENV_EXAMPLE_PATH = SCRIPTS_DIR / "r2.env.example"
OUTPUT_DIR = NOTEBOOK_DIR / "outputs" / "team_eda_baseline"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Repo root: {REPO_ROOT}")
print(f"Scripts dir: {SCRIPTS_DIR}")
print(f"Default download root: {DEFAULT_DOWNLOAD_ROOT}")
print(f"R2 env file: {R2_ENV_PATH}")
print(f"Output dir: {OUTPUT_DIR}")

Repo root: /Users/leonschmidt/Projekte/Machine_Learning_Spotify/Git_Project/ML_Group_AB
Scripts dir: /Users/leonschmidt/Projekte/Machine_Learning_Spotify/Git_Project/ML_Group_AB/scripts
Default download root: /Users/leonschmidt/Projekte/Machine_Learning_Spotify/Git_Project/ML_Group_AB/dataset_downloads/v1
R2 env file: /Users/leonschmidt/Projekte/Machine_Learning_Spotify/Git_Project/ML_Group_AB/scripts/r2.env
Output dir: /Users/leonschmidt/Projekte/Machine_Learning_Spotify/Git_Project/ML_Group_AB/Project_Information/outputs/team_eda_baseline


## 2) Parse and Validate `scripts/r2.env` Configuration

In [2]:
REQUIRED_R2_KEYS = [
    "R2_ENDPOINT",
    "R2_BUCKET",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "DATASET_VERSION",
]


def _strip_balanced_quotes(value: str) -> str:
    if len(value) >= 2 and ((value[0] == '"' and value[-1] == '"') or (value[0] == "'" and value[-1] == "'")):
        return value[1:-1]
    return value


def parse_env_file(env_path: Path) -> Tuple[Dict[str, str], List[str]]:
    parsed: Dict[str, str] = {}
    errors: List[str] = []

    if not env_path.exists():
        errors.append(f"Missing env file: {env_path}")
        return parsed, errors

    for line_no, raw_line in enumerate(env_path.read_text(encoding="utf-8").splitlines(), start=1):
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        if "=" not in line:
            errors.append(f"Line {line_no}: missing '=' separator")
            continue

        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip()

        if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", key):
            errors.append(f"Line {line_no}: invalid key '{key}'")
            continue

        dbl_quote_count = value.count('"')
        sgl_quote_count = value.count("'")
        if dbl_quote_count % 2 != 0 or sgl_quote_count % 2 != 0:
            errors.append(
                f"Line {line_no}: malformed quotes for key '{key}' -> {value!r}"
            )
            continue

        normalized = _strip_balanced_quotes(value).strip()

        if normalized.endswith('"') or normalized.endswith("'"):
            errors.append(
                f"Line {line_no}: value for '{key}' appears to have a trailing quote -> {value!r}"
            )

        parsed[key] = normalized

    missing = [k for k in REQUIRED_R2_KEYS if not parsed.get(k)]
    if missing:
        errors.append(f"Missing required keys: {missing}")

    return parsed, errors


r2_config, r2_config_errors = parse_env_file(R2_ENV_PATH)

if r2_config_errors:
    print("❌ r2.env validation failed:")
    for err in r2_config_errors:
        print(f"  - {err}")
    if R2_ENV_EXAMPLE_PATH.exists():
        print(f"\nTemplate available at: {R2_ENV_EXAMPLE_PATH}")
else:
    print("✅ r2.env validation passed")
    display(pd.DataFrame([r2_config]).T.rename(columns={0: "value"}))

✅ r2.env validation passed


Unnamed: 0,value
R2_ENDPOINT,https://a96b93c5d97cddb48fc674255fb687c7.r2.cl...
R2_BUCKET,ml-group-ab-datasets
AWS_ACCESS_KEY_ID,fa162d7d8a7a374608e16c3858dfb6f6
AWS_SECRET_ACCESS_KEY,c810c10fdc699b06aa6d561f1d052387ec79771ea972db...
DATASET_VERSION,v1


## 3) Implement Safe Environment Variable Export for Shell Commands

In [3]:
def build_sanitized_env(base_env: Dict[str, str], overrides: Dict[str, str] | None = None) -> Dict[str, str]:
    if r2_config_errors:
        raise ValueError("Cannot build environment: r2.env is invalid.")

    env = dict(os.environ)
    for key in REQUIRED_R2_KEYS:
        env[key] = str(base_env[key]).strip()

    if overrides:
        for k, v in overrides.items():
            if v is not None:
                env[k] = str(v)

    return env


def run_bash_script(script_path: Path, env: Dict[str, str], cwd: Path, timeout: int = 3600) -> subprocess.CompletedProcess:
    if not script_path.exists():
        raise FileNotFoundError(f"Missing script: {script_path}")

    cmd = ["bash", str(script_path)]
    result = subprocess.run(
        cmd,
        cwd=str(cwd),
        env=env,
        capture_output=True,
        text=True,
        timeout=timeout,
    )

    print("--- stdout ---")
    print(result.stdout[-4000:] if result.stdout else "<empty>")
    print("--- stderr ---")
    print(result.stderr[-4000:] if result.stderr else "<empty>")
    print(f"Exit code: {result.returncode}")

    if result.returncode != 0:
        raise RuntimeError(
            "Download script failed. Check stdout/stderr above. "
            "For permission-related preflight errors, try SKIP_R2_PREFLIGHT=1."
        )

    return result

print("✅ Shell helper ready")

✅ Shell helper ready


## 4) Run Dataset Download Script with Version Overrides

This cell is safe by default (`RUN_DOWNLOAD = False`). Set it to `True` when ready.

In [None]:
DOWNLOAD_SCRIPT = SCRIPTS_DIR / "download_from_r2.sh"
DATASET_VERSION = r2_config.get("DATASET_VERSION", DEFAULT_DATASET_VERSION) if not r2_config_errors else DEFAULT_DATASET_VERSION
DOWNLOAD_ROOT = REPO_ROOT / "datasets" / DATASET_VERSION

RUN_DOWNLOAD = False
RUN_OVERRIDE_EXAMPLE = False

print(f"Default DATASET_VERSION: {DATASET_VERSION}")
print(f"Default DOWNLOAD_ROOT: {DOWNLOAD_ROOT}")

if RUN_DOWNLOAD:
    env_default = build_sanitized_env(
        r2_config,
        overrides={
            "DATASET_VERSION": DATASET_VERSION,
            "DOWNLOAD_ROOT": str(DOWNLOAD_ROOT),
        },
    )
    run_bash_script(DOWNLOAD_SCRIPT, env=env_default, cwd=REPO_ROOT)
else:
    print("Skipped default run. Set RUN_DOWNLOAD=True to execute.")

if RUN_OVERRIDE_EXAMPLE:
    override_version = "v1"
    override_root = REPO_ROOT / "datasets" / f"{override_version}_override_demo"
    env_override = build_sanitized_env(
        r2_config,
        overrides={
            "DATASET_VERSION": override_version,
            "DOWNLOAD_ROOT": str(override_root),
        },
    )
    run_bash_script(DOWNLOAD_SCRIPT, env=env_override, cwd=REPO_ROOT)
else:
    print("Skipped override example. Set RUN_OVERRIDE_EXAMPLE=True to execute.")

## 5) Load Sample Parquet Partitions for Sanity Checks

In [4]:
full_root = active_download_root / "full"
if not full_root.exists():
    raise FileNotFoundError(f"Full dataset folder not found: {full_root}")

parquet_files = sorted(full_root.rglob("*.parquet"))
if not parquet_files:
    raise FileNotFoundError(f"No parquet files found under: {full_root}")

MAX_SAMPLE_FILES = 4
sample_files = parquet_files[:MAX_SAMPLE_FILES]

print(f"Total parquet files in full dataset: {len(parquet_files)}")
print(f"Sampling first {len(sample_files)} files for sanity checks")
for f in sample_files:
    print(f"  - {f.relative_to(active_download_root)}")

sample_df = pd.concat([pd.read_parquet(fp) for fp in sample_files], ignore_index=True)

print("\nSample dataframe shape:", sample_df.shape)
print("Sample dataframe memory usage (MB):", round(sample_df.memory_usage(deep=True).sum() / 1_000_000, 2))

null_ratio = (
    sample_df.isna()
    .mean()
    .sort_values(ascending=False)
    .rename("null_ratio")
    .reset_index(names="column")
)

display(sample_df.head(10))
display(sample_df.dtypes.rename("dtype").reset_index(names="column").head(40))
display(null_ratio.head(20))

sample_df.head(200).to_csv(OUTPUT_DIR / "sample_preview_200_rows.csv", index=False)
null_ratio.to_csv(OUTPUT_DIR / "sample_null_ratio.csv", index=False)

print(f"Saved sample preview to: {OUTPUT_DIR / 'sample_preview_200_rows.csv'}")
print(f"Saved null ratio summary to: {OUTPUT_DIR / 'sample_null_ratio.csv'}")

NameError: name 'active_download_root' is not defined

## 6) Initial EDA Starter (Basic Team Baseline)

This uses the sampled dataframe (`sample_df`) to provide quick baseline insights before full-scale EDA.

In [None]:
candidate_numeric = [c for c in ["streams", "rank", "af_danceability", "af_energy", "af_valence", "af_tempo"] if c in sample_df.columns]
candidate_categorical = [c for c in ["region", "chart", "artist", "title", "year"] if c in sample_df.columns]

print("Numeric columns used:", candidate_numeric)
print("Categorical columns used:", candidate_categorical)

if candidate_numeric:
    display(sample_df[candidate_numeric].describe().T)

for col in [c for c in ["region", "chart", "year"] if c in sample_df.columns]:
    top_counts = sample_df[col].value_counts(dropna=False).head(15).rename("count").reset_index(names=col)
    print(f"\nTop values for {col}:")
    display(top_counts)
    top_counts.to_csv(OUTPUT_DIR / f"sample_top_{col}.csv", index=False)

if {"year", "streams"}.issubset(sample_df.columns):
    yearly_streams = sample_df.groupby("year", dropna=False)["streams"].agg(["count", "mean", "median", "sum"]).reset_index()
    display(yearly_streams)
    yearly_streams.to_csv(OUTPUT_DIR / "sample_yearly_streams_summary.csv", index=False)

try:
    import matplotlib.pyplot as plt

    if "streams" in sample_df.columns:
        ax = sample_df["streams"].dropna().clip(upper=sample_df["streams"].quantile(0.99)).plot(
            kind="hist", bins=50, figsize=(8, 4), title="Streams distribution (clipped at 99th percentile)"
        )
        ax.set_xlabel("streams")
        plt.tight_layout()
        plt.show()
except Exception as exc:
    print(f"Plot skipped (matplotlib unavailable or plotting issue): {exc}")

print(f"Saved starter EDA outputs to: {OUTPUT_DIR}")