In [9]:
import numpy as np
from scipy.io import loadmat
import h5py

def _to_numeric_array(x):
    """Convert MATLAB/HDF5-backed values to numeric numpy arrays (1D)."""
    if x is None:
        return np.array([], dtype=float)
    arr = np.asarray(x)
    try:
        if arr.shape == ():
            arr = np.asarray(arr.item())
    except Exception:
        pass
    if getattr(arr, 'dtype', None) is not None and (arr.dtype == object or getattr(arr.dtype, 'kind', None) == 'O'):
        flat = np.atleast_1d(arr).ravel()
        out = []
        for v in flat:
            try:
                if isinstance(v, (bytes, bytearray)):
                    v = v.decode(errors='ignore')
                out.append(float(v))
            except Exception:
                out.append(np.nan)
        return np.asarray(out, dtype=float)
    try:
        return arr.astype(float).ravel()
    except Exception:
        return np.asarray(arr).ravel()


def _read_subj_wins(path, field='Subj_Wins', sample_limit=None):
    """Read per-window data from a MAT v7.3 'Subj_Wins' group without dereferencing the whole file.

    Extracts numeric Age, Gender, Height, and Weight per-window when available. Missing values
    are returned as np.nan (not 0) to avoid masking missingness.
    """
    signals = []
    sbps = []
    dbps = []
    ages = []
    genders = []
    heights = []
    weights = []
    with h5py.File(path, 'r') as f2:
        if field not in f2:
            raise KeyError(f'{field} not found in {path}')
        sw = f2[field]
        ppg = sw.get('PPG_Raw')
        if ppg is None:
            raise KeyError('PPG_Raw not found under ' + field)
        n = ppg.shape[1] if getattr(ppg, 'ndim', 0) > 1 else ppg.shape[0]
        if sample_limit is not None:
            n = min(int(n), int(sample_limit))
        for i in range(int(n)):
            ref = ppg[0, i] if getattr(ppg, 'ndim', 0) == 2 and ppg.shape[0] == 1 else ppg[i]
            try:
                sig = f2[ref][()] if isinstance(ref, h5py.Reference) else ref
            except Exception:
                sig = ref
            sig = np.asarray(sig).ravel()
            signals.append(sig)

            def _get_field_val(name):
                d = sw.get(name)
                if d is None:
                    return np.nan
                ref2 = d[0, i] if getattr(d, 'ndim', 0) == 2 and d.shape[0] == 1 else d[i]
                try:
                    val = f2[ref2][()]
                except Exception:
                    val = ref2
                arr = np.asarray(val)
                # If single-element array or scalar, extract the scalar safely
                if arr.size == 1:
                    try:
                        return float(arr.item())
                    except Exception:
                        try:
                            return float(str(arr.item()))
                        except Exception:
                            return np.nan
                # Fallback: try first element
                try:
                    return float(arr.ravel()[0])
                except Exception:
                    try:
                        s = arr.astype(str).tolist()
                        return float(s)
                    except Exception:
                        return np.nan

            sbps.append(_get_field_val('SegSBP'))
            dbps.append(_get_field_val('SegDBP'))
            ages.append(_get_field_val('Age'))
            genders.append(_get_field_val('Gender'))
            heights.append(_get_field_val('Height'))
            weights.append(_get_field_val('Weight'))

    # Stack signals into (n, L) when possible, else object array
    try:
        sigs = np.vstack([s for s in signals])
    except Exception:
        sigs = np.array(signals, dtype=object)

    sbp_arr = np.array([float(s) if np.asarray(s).size > 0 and not np.isnan(np.asarray(s)).all() else np.nan for s in sbps], dtype=float)
    dbp_arr = np.array([float(d) if np.asarray(d).size > 0 and not np.isnan(np.asarray(d)).all() else np.nan for d in dbps], dtype=float)
    age_arr = np.array([float(a) if np.asarray(a).size > 0 and not np.isnan(np.asarray(a)).all() else np.nan for a in ages], dtype=float)

    def _gdecode(v):
        """Decode gender value to 1.0 (M) or 0.0 (F), or np.nan if unknown.

        Handles bytes/str ('M'/'F') and numeric encodings including ASCII codes (77='M',70='F').
        """
        try:
            if v is None:
                return np.nan
            if isinstance(v, (bytes, bytearray)):
                s = v.decode(errors='ignore').strip()
                if s == '':
                    return np.nan
                return 1.0 if s.upper() == 'M' else 0.0 if s.upper() == 'F' else np.nan
            if isinstance(v, str):
                s = v.strip()
                if s == '':
                    return np.nan
                return 1.0 if s.upper() == 'M' else 0.0 if s.upper() == 'F' else np.nan
            # numeric
            iv = float(v)
            if iv in (1.0, 0.0):
                return iv
            # Common ASCII codes: 77 -> 'M', 70 -> 'F'
            if int(iv) in (77, 70):
                ch = chr(int(iv))
                return 1.0 if ch.upper() == 'M' else 0.0 if ch.upper() == 'F' else np.nan
            # If numeric but not a known code, treat as unknown
            return np.nan
        except Exception:
            return np.nan

    gender_arr = np.array([_gdecode(g) for g in genders], dtype=float)
    height_arr = np.array([float(h) if np.asarray(h).size > 0 and not np.isnan(np.asarray(h)).all() else np.nan for h in heights], dtype=float)
    weight_arr = np.array([float(w) if np.asarray(w).size > 0 and not np.isnan(np.asarray(w)).all() else np.nan for w in weights], dtype=float)

    demographics = np.column_stack([age_arr, gender_arr, height_arr, weight_arr])
    return sigs, sbp_arr, dbp_arr, demographics

In [8]:
# Additional HDF5 / MATLAB v7.3 helper (lightweight fallback)
import h5py
from scipy.io import loadmat

def _to_numeric_array_simple(x):
    """Simple coercion helper for small helpers in preprocessing notebook."""
    import numpy as _np
    if x is None:
        return _np.array([], dtype=float)
    try:
        arr = _np.asarray(x)
        return arr.astype(float).ravel()
    except Exception:
        try:
            return _np.array([float(v) for v in _np.atleast_1d(x).flat])
        except Exception:
            return _np.asarray(x)

def _read_subj_wins_simple(path, field='Subj_Wins', sample_limit=None):
    """Lazy per-window reader for MAT v7.3 Subj_Wins groups."""
    import numpy as _np
    with h5py.File(path, 'r') as f:
        if field not in f:
            raise KeyError(f'{field} not found in {path}')
        sw = f[field]
        ppg = sw.get('PPG_Raw')
        if ppg is None:
            raise KeyError('PPG_Raw not found under ' + field)
        n = ppg.shape[1] if getattr(ppg, 'ndim', 0) > 1 else ppg.shape[0]
        if sample_limit is not None:
            n = min(n, int(sample_limit))
        signals = []
        sbps = []
        dbps = []
        demos = []
        for i in range(int(n)):
            ref = ppg[0, i] if getattr(ppg, 'ndim', 0) == 2 and ppg.shape[0] == 1 else ppg[i]
            try:
                sig = f[ref][()] if isinstance(ref, h5py.Reference) else ref
            except Exception:
                sig = ref
            sig = _np.asarray(sig).ravel()
            signals.append(sig)
            def _g(name):
                d = sw.get(name)
                if d is None:
                    return _np.nan
                ref2 = d[0, i] if getattr(d, 'ndim', 0) == 2 and d.shape[0] == 1 else d[i]
                try:
                    return float(_np.asarray(f[ref2]).squeeze())
                except Exception:
                    try:
                        return float(_np.asarray(ref2))
                    except Exception:
                        return _np.nan
            sbps.append(_g('SegSBP'))
            dbps.append(_g('SegDBP'))
            age = _g('Age')
            gender = _g('Gender')
            height = _g('Height')
            weight = _g('Weight')
            demos.append([age, gender, height, weight])
        try:
            signals_arr = _np.vstack([_np.atleast_1d(s) for s in signals])
        except Exception:
            signals_arr = _np.array(signals, dtype=object)
        return signals_arr, _np.array(sbps, dtype=float), _np.array(dbps, dtype=float), _np.array(demos, dtype=float)

In [7]:
# for systolic blood pressure only
def build_dataset_SBPLabel(Path, FieldName="Subsets"):
    """Build dataset (signals, SBP labels, demographics) for a given MAT file.

    ``scipy.io.loadmat`` is tried first. It raises ``NotImplementedError``
    for MATLAB v7.3 (HDF5) files; only in that case does the function fall
    back to the h5py per-window reader ``_read_subj_wins``. Every other
    error (missing file, missing ``FieldName``, ...) propagates unchanged.

    Parameters
    ----------
    Path : str
        Path of the ``.mat`` file.
    FieldName : str, default "Subsets"
        Name of the struct to read when the file is loadable by scipy.

    Returns
    -------
    tuple
        ``(Signals, SBPLabels, Demographics)`` where ``Demographics`` has
        the columns Age, Gender, Height, Weight. A demographic field that is
        absent from the struct is filled with ``np.nan``.
    """
    try:
        data = loadmat(Path, squeeze_me=True, struct_as_record=False)
    except NotImplementedError:
        # v7.3 file: use the lazy per-window reader for Subj_Wins
        # (faster and memory-friendly).
        sigs, sbp_arr, _dbp_arr, demographics = _read_subj_wins(Path, field='Subj_Wins')
        return sigs, sbp_arr, demographics

    subset = data[FieldName]
    Signals = subset.Signals
    SBPLabels = _to_numeric_array(getattr(subset, 'SBP', None))
    columns = [
        _to_numeric_array(getattr(subset, name, None))
        for name in ('Age', 'Gender', 'Height', 'Weight')
    ]
    # A missing field comes back empty; pad it with NaN so column_stack gets
    # equally long columns and missingness stays visible.
    n_rows = max([SBPLabels.size] + [col.size for col in columns])
    columns = [col if col.size else np.full(n_rows, np.nan) for col in columns]
    Demographics = np.column_stack(columns)
    return Signals, SBPLabels, Demographics

In [None]:
# for diastolic blood pressure only
def build_dataset_DBPLabel(Path, FieldName="Subsets"):
    """Build dataset (signals, DBP labels, demographics) for a given MAT file.

    ``scipy.io.loadmat`` is tried first. It raises ``NotImplementedError``
    for MATLAB v7.3 (HDF5) files; only in that case does the function fall
    back to the h5py per-window reader ``_read_subj_wins``. Every other
    error (missing file, missing ``FieldName``, ...) propagates unchanged.

    Parameters
    ----------
    Path : str
        Path of the ``.mat`` file.
    FieldName : str, default "Subsets"
        Name of the struct to read when the file is loadable by scipy.

    Returns
    -------
    tuple
        ``(Signals, DBPLabels, Demographics)`` where ``Demographics`` has
        the columns Age, Gender, Height, Weight. A demographic field that is
        absent from the struct is filled with ``np.nan``.
    """
    try:
        data = loadmat(Path, squeeze_me=True, struct_as_record=False)
    except NotImplementedError:
        # v7.3 file: read window by window through h5py instead.
        sigs, _sbp_arr, dbp_arr, demographics = _read_subj_wins(Path, field='Subj_Wins')
        return sigs, dbp_arr, demographics

    subset = data[FieldName]
    Signals = subset.Signals
    DBPLabels = _to_numeric_array(getattr(subset, 'DBP', None))
    columns = [
        _to_numeric_array(getattr(subset, name, None))
        for name in ('Age', 'Gender', 'Height', 'Weight')
    ]
    # A missing field comes back empty; pad it with NaN so column_stack gets
    # equally long columns and missingness stays visible.
    n_rows = max([DBPLabels.size] + [col.size for col in columns])
    columns = [col if col.size else np.full(n_rows, np.nan) for col in columns]
    Demographics = np.column_stack(columns)
    return Signals, DBPLabels, Demographics

In [6]:
import os
import pandas as pd

processed_dir = '../data/processed'


def _print_dataset_summary(tag, signals, labels, demographics):
    """Print array shapes and the finite label range for one loaded dataset.

    ``tag`` is the label name shown in the messages ('SBP' or 'DBP').
    NaN labels are ignored when computing the range; when no finite label
    exists the range line says so instead of being silently skipped.
    """
    print(f"‚úÖ {tag} Dataset loaded:")
    print(f"   - Signals shape: {signals.shape}")
    print(f"   - {tag} labels shape: {labels.shape}")
    print(f"   - Demographics shape: {demographics.shape}")
    try:
        finite = labels[np.isfinite(labels)]
    except TypeError:
        # Non-numeric label dtype: no range can be computed.
        finite = np.array([])
    if finite.size:
        print(f"   - {tag} range: {finite.min():.1f} - {finite.max():.1f} mmHg")
    else:
        print(f"   - {tag} range: unavailable (no finite labels)")


if not os.path.exists(processed_dir) or not os.listdir(processed_dir):
    print("‚ùå No processed data found. Please:")
    print("1. Run the data loader script to download PulseDB dataset")
    print("2. Place processed .mat files in data/processed/")

else:

    # os.listdir returns entries in arbitrary order; sort so that the file
    # picked below is the same on every run and every platform.
    mat_files = sorted(f for f in os.listdir(processed_dir) if f.endswith('.mat'))
    print(f"üìÅ Found {len(mat_files)} MATLAB files: {mat_files}")

    if mat_files:
        # Only the first file (in sorted order) is processed.
        file_path = os.path.join(processed_dir, mat_files[0])
        print(f"\nüîÑ Processing: {file_path}")

        try:
            signals_sbp, sbp_labels, demographics_sbp = build_dataset_SBPLabel(file_path)
            _print_dataset_summary('SBP', signals_sbp, sbp_labels, demographics_sbp)

            signals_dbp, dbp_labels, demographics_dbp = build_dataset_DBPLabel(file_path)
            _print_dataset_summary('DBP', signals_dbp, dbp_labels, demographics_dbp)

            # Cache everything next to the source files as .npy.
            np.save(os.path.join(processed_dir, 'signals_sbp.npy'), signals_sbp)
            np.save(os.path.join(processed_dir, 'sbp_labels.npy'), sbp_labels)
            np.save(os.path.join(processed_dir, 'demographics_sbp.npy'), demographics_sbp)

            np.save(os.path.join(processed_dir, 'signals_dbp.npy'), signals_dbp)
            np.save(os.path.join(processed_dir, 'dbp_labels.npy'), dbp_labels)
            np.save(os.path.join(processed_dir, 'demographics_dbp.npy'), demographics_dbp)

            print(f"\nüíæ Processed data saved as .npy files for faster loading")

        except Exception as e:
            # Deliberate catch-all: the cell reports the problem and keeps
            # the notebook running instead of aborting with a traceback.
            print(f"‚ùå Error processing data: {e}")
            print("Please check your MATLAB file structure and field names")
    else:
        print("‚ùå No .mat files found in processed directory")

üìÅ Found 4 MATLAB files: ['p000160.mat', 'p000333.mat', 'p001038.mat', 'p001840.mat']

üîÑ Processing: ../data/processed/p000160.mat
‚úÖ SBP Dataset loaded:
   - Signals shape: (82, 1250)
   - SBP labels shape: (82,)
   - Demographics shape: (82, 4)
   - SBP range: 114.5 - 135.8 mmHg
‚úÖ DBP Dataset loaded:
   - Signals shape: (82, 1250)
   - DBP labels shape: (82,)
   - Demographics shape: (82, 4)
   - DBP range: 54.7 - 79.9 mmHg

üíæ Processed data saved as .npy files for faster loading
