In [1]:
from pathlib import Path
import polars as pl
import numpy as np
import matplotlib.pyplot as plt

# seaborn is an optional dependency: remember whether the import worked so the
# plotting cell can fall back to plain matplotlib when it did not.
try:
    import seaborn as sns
    _HAS_SNS = True
except Exception:
    _HAS_SNS = False


def find_project_root(start=None, marker="data"):
    """Locate the project root by walking upwards until ``marker`` is found.

    Parameters
    ----------
    start : str | os.PathLike | None
        Directory to start the search from. Defaults to the current working
        directory. It is always converted to an absolute, resolved ``Path``
        first: a relative path such as ``Path(".")`` exposes no ``.parents``,
        so without resolving it the upward search would stop immediately.
    marker : str
        Name of the entry whose presence identifies the root (default ``"data"``).

    Returns
    -------
    pathlib.Path
        The closest directory (``start`` itself or one of its ancestors) that
        contains ``marker``; the resolved ``start`` when no ancestor does.
    """
    origin = (Path.cwd() if start is None else Path(start)).resolve()
    for candidate in (origin, *origin.parents):
        if (candidate / marker).exists():
            return candidate
    # Nothing matched: fall back to the starting directory, as before.
    return origin


# Resolve the project layout once; the closing tuple is echoed as the cell output.
PROJECT_ROOT = find_project_root()
DATA_INTERIM = PROJECT_ROOT.joinpath("data", "interim")
(PROJECT_ROOT, DATA_INTERIM)


(PosixPath('/home/lluis/master-thesis/CSISD'),
 PosixPath('/home/lluis/master-thesis/CSISD/data/interim'))

In [2]:

# Glob pattern used to discover the per-recording svara feature parquet files.
FEATURE_GLOB = "*/features/*svara*features*.parquet"

# Identifier / bookkeeping columns; they are excluded from the numeric feature list.
ID_COLS = {
    # where the segment comes from
    "recording_id",
    "piece_id",
    "file",
    "svara_id",
    # segment extent
    "seg_start_sec",
    "seg_end_sec",
    "seg_duration_sec",
    "seg_n_rows",
}

# Label-like columns that may or may not be present; also excluded from the features.
LABEL_COLS_CANDIDATES = {
    "svara",
    "prev_svara",
    "next_svara",
    "section_label",
    "svara_lower",
    "svara_upper",
}

# Collect every matching parquet in a stable (sorted) order and preview the first few.
feature_paths = list(DATA_INTERIM.glob(FEATURE_GLOB))
feature_paths.sort()
(len(feature_paths), feature_paths[:5])

(1,
 [PosixPath('/home/lluis/master-thesis/CSISD/data/interim/srs_v1_bdn_sav/features/srs_v1_bdn_sav_svara_features.parquet')])

In [None]:
# CELL BREAK
# Load every feature parquet lazily (scan_parquet) and concatenate them into one frame.

if not feature_paths:
    # Fail early with a message that names the pattern and directory that were searched.
    raise FileNotFoundError(
        f"No he trobat parquets amb el patró {FEATURE_GLOB} dins {DATA_INTERIM}. "
        "Revisa el glob o el directori."
    )

scans = list(map(pl.scan_parquet, feature_paths))
# Stack the lazy frames row-wise and materialise the result in a single collect().
df = pl.concat(scans, how="vertical_relaxed").collect()
(df.shape, df.columns[:20])


In [None]:
# CELL BREAK
# Guarantee a recording_id column (derived from each file's location when it is missing).

if "recording_id" not in df.columns:
    # feature_paths come from DATA_INTERIM.glob("*/features/..."), i.e. every file
    # lives at <DATA_INTERIM>/<recording_id>/features/<file>.parquet, so the
    # recording id is always the grandparent directory name. Searching for the
    # first path component called "interim" instead would select the wrong
    # directory whenever an ancestor of the project is also named "interim".
    rec_ids = [p.parent.parent.name for p in feature_paths]

    # Re-read each file on its own so that every row is tagged with its source recording.
    dfs = [
        pl.read_parquet(p).with_columns(pl.lit(rid).alias("recording_id"))
        for p, rid in zip(feature_paths, rec_ids)
    ]

    df = pl.concat(dfs, how="vertical_relaxed")

df.select(["recording_id"]).head()


In [None]:
# CELL BREAK
# Detect the feature columns automatically: numeric columns that are neither ids nor labels.

present_id_cols = [c for c in ID_COLS if c in df.columns]
present_label_cols = [c for c in LABEL_COLS_CANDIDATES if c in df.columns]

# Columns that must never be counted as features, built once up front.
non_feature_cols = set(present_id_cols).union(present_label_cols)

numeric_cols = [name for name, dtype in df.schema.items() if dtype.is_numeric()]
numeric_features = [name for name in numeric_cols if name not in non_feature_cols]

# Compact overview of the loaded table, echoed as the cell output.
summary = dict(
    n_rows=df.height,
    n_cols=df.width,
    n_recordings=df["recording_id"].n_unique(),
    n_id_cols_present=len(present_id_cols),
    n_label_cols_present=len(present_label_cols),
    n_numeric_features=len(numeric_features),
)
summary


In [None]:
# CELL BREAK
# Number of segments per recording, largest first.

counts_by_rec = df.group_by("recording_id").agg(n_segments=pl.len())
counts_by_rec = counts_by_rec.sort(by="n_segments", descending=True)
counts_by_rec


In [None]:
# CELL BREAK
# Number of segments per svara label (only when the column exists).

if "svara" in df.columns:
    counts_by_svara = (
        df.group_by("svara")
          .agg(pl.len().alias("n_segments"))
          .sort("n_segments", descending=True)
    )
    # Jupyter only echoes the last *top-level* expression of a cell; a bare
    # expression nested inside this `if` would be silently discarded, so the
    # table is rendered explicitly (`display` is provided by the IPython kernel).
    display(counts_by_svara)
else:
    print("No hi ha columna 'svara' al parquet (encara).")


In [None]:
# CELL BREAK
# Missingness (NaNs + nulls) per feature.

if len(numeric_features) == 0:
    print("No hi ha features numèriques detectades.")
else:
    # One-row frame turned into a dict: feature name -> count of nulls plus count of NaNs.
    miss = df.select([
        (pl.col(c).is_null().sum() + pl.col(c).is_nan().sum()).alias(c)
        for c in numeric_features
    ]).to_dicts()[0]

    # Long-format table with the missing fraction, worst features first.
    miss_df = (
        pl.DataFrame({
            "feature": list(miss.keys()),
            "missing_count": list(miss.values()),
        })
        .with_columns((pl.col("missing_count") / df.height).alias("missing_frac"))
        .sort("missing_frac", descending=True)
    )

    # Jupyter only echoes the last *top-level* expression of a cell; inside this
    # `else` branch a bare `miss_df.head(20)` shows nothing, so render it
    # explicitly (`display` is provided by the IPython kernel).
    display(miss_df.head(20))


In [None]:
# CELL BREAK
# Constant features (zero variance): list the features with the lowest variance.

if len(numeric_features) > 0:
    # One-row frame turned into a dict: feature name -> variance of that column.
    var_map = df.select([pl.col(c).var().alias(c) for c in numeric_features]).to_dicts()[0]

    # Ascending sort, so the lowest-variance features come first.
    var_df = (
        pl.DataFrame({"feature": list(var_map.keys()), "variance": list(var_map.values())})
        .sort("variance")
    )

    # Jupyter only echoes the last *top-level* expression of a cell; inside this
    # `if` a bare `var_df.head(20)` shows nothing, so render it explicitly
    # (`display` is provided by the IPython kernel).
    display(var_df.head(20))


In [None]:
# CELL BREAK
# Duplicate check on the (recording_id, svara_id) key.

if "recording_id" in df.columns and "svara_id" in df.columns:
    # Keep only the key combinations that occur more than once, most repeated first.
    dup = (
        df.group_by(["recording_id", "svara_id"])
          .agg(pl.len().alias("n"))
          .filter(pl.col("n") > 1)
          .sort("n", descending=True)
    )
    if dup.height == 0:
        print("OK: no hi ha duplicats (recording_id, svara_id).")
    else:
        print("ATENCIÓ: hi ha duplicats!")
        # Jupyter only echoes the last *top-level* expression of a cell; nested
        # here, a bare `dup.head(20)` would print the warning but never show the
        # offending keys, so render them explicitly (`display` is provided by
        # the IPython kernel).
        display(dup.head(20))
else:
    print("No puc comprovar duplicats: falta recording_id o svara_id.")


In [None]:
# CELL BREAK
# Distribution of segment durations: summary statistics plus a histogram.

if "seg_duration_sec" not in df.columns:
    print("No hi ha columna seg_duration_sec")
else:
    dur = df["seg_duration_sec"].to_numpy()
    dur = dur[np.isfinite(dur)]  # keep finite values only (drops NaN and +/-inf)
    n_finite = int(len(dur))

    def _stat(fn, *args):
        """Apply ``fn`` to the finite durations; NaN when there are none."""
        return float(fn(dur, *args)) if n_finite else np.nan

    print({
        "n_finite": n_finite,
        "min": _stat(np.min),
        "p05": _stat(np.quantile, 0.05),
        "median": _stat(np.median),
        "p95": _stat(np.quantile, 0.95),
        "max": _stat(np.max),
    })

    fig, ax = plt.subplots(figsize=(8, 4))
    # Use seaborn when it imported successfully, plain matplotlib otherwise.
    if _HAS_SNS:
        sns.histplot(dur, bins=50, ax=ax)
    else:
        ax.hist(dur, bins=50)
    ax.set_title("Segment duration distribution")
    ax.set_xlabel("seg_duration_sec")
    ax.set_ylabel("count")
    fig.tight_layout()
    plt.show()


In [None]:
# CELL BREAK
# Quick look: highest missingness and highest variance.

if numeric_features:
    # (heading, name of the table built by an earlier cell, how to take its top 15)
    _reports = (
        ("--- Missingness (top 15) ---", "miss_df",
         lambda table: table.head(15)),
        ("\n--- Variance (top 15) ---", "var_df",
         lambda table: table.sort("variance", descending=True).head(15)),
    )
    for _heading, _table_name, _top15 in _reports:
        print(_heading)
        # The table only exists if the cell that builds it has been run.
        if _table_name in globals():
            print(_top15(globals()[_table_name]))
