# UFPA DSP workflow notebook

This notebook contains four parts. Each part implements the functions from the corresponding script so you can run everything inline without importing the `src` modules.

Parts:
1. simple_batch_processing — recursive file search, windowed processing and visualization
2. display_histograms — compute statistics and save per-file histograms
3. all_files_histogram — compute an efficient global histogram across many files
4. create_pngs — create PNG histograms preserving folder hierarchy

Run the setup cell next to make sure paths are correct for your environment.

In [None]:
# Setup: locate and print the repository root (optionally add `src` to sys.path below)
import sys
from pathlib import Path
# `Path('.').name` is always '' (a bare "." has no final path component), so
# the working directory must be resolved before its name can be inspected.
# When the notebook is launched from a `notebooks/` folder, the repository
# root is its parent; otherwise the working directory itself is used.
repo_root = Path.cwd().resolve()
if repo_root.name == 'notebooks':
    repo_root = repo_root.parent
print('Repository root ->', repo_root)

# Optionally, add src to path if you want to import later: (not required for inline functions)
# sys.path.insert(0, str(repo_root / 'src'))

## Part 1 — simple_batch_processing (inline)
Functions: `processing_file`, `plot_wave_and_energy`, `process_folder`

In [None]:
import librosa # for audio file reading
import numpy as np
import glob # for file searching
import os
import matplotlib.pyplot as plt
from tqdm import tqdm # allows progress bars on loops 

def processing_file(file, N=100):
    """Read `file` and compute energy per window of N samples.

    The waveform is split into consecutive, non-overlapping windows of `N`
    samples. Trailing samples that do not fill a complete window are ignored.

    Parameters:
        file: path of the audio file to read
        N: window size in samples (must be >= 1)
    Returns:
        y: waveform array (mono)
        t: time vector for waveform
        e_segments: list of energy values per segment
        t_segments: center times for each segment
    Raises:
        ValueError: if `N` is smaller than 1
    """
    # Validate before loading: N == 0 would otherwise fail with a
    # ZeroDivisionError only after the file has been read, and a negative N
    # would silently produce no segments at all.
    if N < 1:
        raise ValueError(f"N must be a positive number of samples, got {N!r}")

    y, sr = librosa.load(file, sr=None, mono=True)
    Ts = 1 / sr
    t = np.arange(len(y)) * Ts

    # Number of complete windows that fit in the waveform.
    S = int(np.floor(len(y) / N))
    # Each segment is time-stamped at the middle of its window.
    t_segments = (np.arange(S) + 0.5) * N / sr

    # Energy of a segment = sum of its squared samples.
    e_segments = []
    for i in range(S):
        seg = y[i * N:(i * N + N)]
        e_segments.append(np.sum(seg * seg))

    print(f"Processed {os.path.basename(file)} → {len(e_segments)} segments")
    return y, t, e_segments, t_segments

def plot_wave_and_energy(y, t, e_segments, t_segments, filename, outdir='figures'):
    """Draw the waveform and the per-segment energy, then save the figure.

    The figure has two stacked panels (waveform on top, energy below) and is
    written to `<outdir>/<filename>.png`. `outdir` is created if missing.

    Parameters:
        y, t, e_segments, t_segments: outputs from `processing_file`
        filename: base filename (no extension) used as figure title and
            as the name of the saved PNG
        outdir: directory where the figure will be saved
    """
    fig = plt.figure(figsize=(10, 6))
    fig.suptitle(f"{filename}", fontsize=10)

    # (subplot position, x values, y values, panel title, y-axis label)
    panels = (
        (211, t, y, "Waveform", "Amplitude"),
        (212, t_segments, e_segments, "Energy per segment", "Energy"),
    )
    for position, x_values, y_values, title, y_label in panels:
        ax = fig.add_subplot(position)
        ax.plot(x_values, y_values)
        ax.set_title(title)
        ax.set_xlabel("Time(s)")
        ax.set_ylabel(y_label)
        ax.autoscale(tight=False)
        ax.grid(True)

    fig.tight_layout()
    os.makedirs(outdir, exist_ok=True)
    target = os.path.join(outdir, f"{filename}.png")
    fig.savefig(target, dpi=150)
    # Release the figure so batch runs do not accumulate open figures.
    plt.close(fig)

def process_folder(input_dir, N=100, limit=None, outdir='figures'):
    """Recursively process audio files under `input_dir`.

    Every `.wav` file found below `input_dir` is passed to `processing_file`
    and the result is plotted with `plot_wave_and_energy`.

    Parameters:
        input_dir: root folder to search for .wav files
        N: window size in samples
        limit: optional maximum number of files to process; `None` means no
            limit and `0` means process nothing
        outdir: directory where figures will be saved
    Returns:
        None
    """
    pattern = os.path.join(input_dir, '**', '*.wav')
    # glob returns matches in arbitrary order; sorting makes the subset
    # selected by `limit` reproducible between runs and machines.
    files = sorted(glob.glob(pattern, recursive=True))
    # Compare against None so that limit=0 is honoured instead of being
    # treated as "no limit".
    if limit is not None:
        files = files[:limit]
    if not files:
        print('No files found')
        return
    for file in tqdm(files, desc="Processing files"):
        filename = os.path.splitext(os.path.basename(file))[0]
        y, t, e_segments, t_segments = processing_file(file, N)
        plot_wave_and_energy(y, t, e_segments, t_segments, filename, outdir=outdir)

# Example: process_folder('data/UrbanSound8k/fold1', N=100, limit=5)  # uncomment to run

## Part 2 — display_histograms (inline)
Functions: `generate_histogram`, `compute_stats`, `process_stats_file`, `array_to_dataframe`, `run_part2`

In [None]:
import librosa
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import glob


def generate_histogram(data, filename, outdir='histograms', bins=100):
    """Plot the amplitude histogram of `data` and save it as a PNG.

    The image is written to `<outdir>/<filename>_hist.png`; `outdir` is
    created when it does not exist yet.

    Parameters:
        data: 1D numpy array containing audio samples (mono)
        filename: base filename (no extension) used in the title and in the
            name of the saved file
        outdir: directory where histogram figure will be saved
        bins: number of bins for histogram
    Returns:
        output_path (str): path to the saved PNG file
    """
    fig, ax = plt.subplots()
    ax.hist(data, bins=bins, color="steelblue", alpha=0.7)
    ax.set_title(f"Amplitude histogram - {filename}")
    ax.set_xlabel("Amplitude")
    ax.set_ylabel("Count")
    ax.grid(True)

    os.makedirs(outdir, exist_ok=True)
    output_path = os.path.join(outdir, f"{filename}_hist.png")
    fig.savefig(output_path, dpi=150)
    # Close the figure so repeated calls do not keep figures open.
    plt.close(fig)
    return output_path


def compute_stats(data):
    """Summarise the amplitude range and average magnitude of `data`.

    Parameters:
        data: 1D numpy array containing audio samples (mono)
    Returns:
        stats (np.array): array with [min_amp, max_amp, mean_amp], where
            `mean_amp` is the mean of the absolute sample values (not the
            signed mean)
    """
    lowest = np.min(data)
    highest = np.max(data)
    # Average magnitude: take absolute values first so positive and negative
    # samples do not cancel out.
    mean_magnitude = np.abs(data).mean()
    return np.array([lowest, highest, mean_magnitude])


def process_stats_file(file, outdir='histograms', bins=100):
    """Load `file`, compute its amplitude statistics and save its histogram.

    Parameters:
       file: input audio file path
       outdir: directory where histogram figure will be saved
       bins: number of bins for histogram
    Returns:
       info (np.array): array with
           [file, sr, duration, min_amp, max_amp, mean_amp, hist_path].
           NOTE(review): the row mixes strings and numbers, so NumPy
           presumably stores every element as a string; `run_part2`
           converts the numeric columns back with `pd.to_numeric`.
    """
    samples, rate = librosa.load(file, sr=None, mono=True)

    # Duration = number of samples times the sampling period; left as None
    # when no usable sample rate is reported.
    if rate:
        sample_period = 1 / rate
        duration = len(samples) * sample_period
    else:
        duration = None

    min_amp, max_amp, mean_amp = compute_stats(samples)

    # File name without directory and without extension.
    base_name, _ = os.path.splitext(os.path.basename(file))
    hist_path = generate_histogram(samples, base_name, outdir=outdir, bins=bins)

    return np.array([file, rate, duration, min_amp, max_amp, mean_amp, hist_path])


def array_to_dataframe(array_list):
    """Build a pandas DataFrame from per-file info rows.

    Parameters:
        array_list: list of numpy arrays, each representing a row with the
            seven fields named below, in this order
    Returns:
        df: pandas DataFrame with columns
            file, sr, duration, min_amp, max_amp, mean_amp, hist_path
    """
    return pd.DataFrame(
        array_list,
        columns=[
            'file',
            'sr',
            'duration',
            'min_amp',
            'max_amp',
            'mean_amp',
            'hist_path',
        ],
    )


def run_part2(input_dir, outdir='histograms', bins=100, limit=None, save_csv=True):
    """Process all .wav files in `input_dir`, compute stats and generate histograms.

    Parameters:
        input_dir: root folder to search for .wav files (searched recursively)
        outdir: directory where histogram figures will be saved
        bins: number of bins for histogram
        limit: optional maximum number of files to process; `None` means no
            limit and `0` means process nothing
        save_csv: whether to save a CSV summary into outdir (stats_summary.csv)
    Returns:
        df: pandas DataFrame with processed file information; an empty
            DataFrame with the same columns when no file was processed
    """
    columns = ['file', 'sr', 'duration', 'min_amp', 'max_amp', 'mean_amp', 'hist_path']
    numeric_columns = ('sr', 'duration', 'min_amp', 'max_amp', 'mean_amp')

    pattern = os.path.join(input_dir, '**', '*.wav')
    # glob returns matches in arbitrary order; sorting makes both the subset
    # selected by `limit` and the row order of the summary reproducible.
    files = sorted(glob.glob(pattern, recursive=True))
    # Compare against None so that limit=0 is honoured instead of being
    # treated as "no limit".
    if limit is not None:
        files = files[:limit]

    # One info row per processed file.
    rows = [process_stats_file(file, outdir=outdir, bins=bins) for file in files]

    if not rows:  # no files processed
        print('No files processed')
        return pd.DataFrame(columns=columns)

    df = array_to_dataframe(rows)

    # Cast numeric columns (everything except 'file' and 'hist_path').
    # pd.to_numeric with errors='coerce' turns unparsable values into NaN.
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')

    if save_csv:
        os.makedirs(outdir, exist_ok=True)
        csv_path = os.path.join(outdir, 'stats_summary.csv')
        df.to_csv(csv_path, index=False)
        print(f"Saved summary -> {csv_path}")

    print(df.head())
    return df


## Part 3 — all_files_histogram (inline)
Functions: `find_global_range`, `extract_all_hist`, `run_part3`

In [None]:
import numpy as np
import soundfile as sf
import glob
import os
import matplotlib.pyplot as plt

def find_global_range(files):
    """Find the smallest and largest sample value across all `files`.

    Each file is read in turn, so only one file is held in memory at a time.
    Arrays with more than one dimension are averaged along axis 1 before
    their extremes are taken (presumably the channel axis, i.e. a mono
    downmix — confirm against the soundfile documentation).

    Parameters:
        files: iterable of audio file paths
    Returns:
        (vmin, vmax): global minimum and maximum sample values. When `files`
            is empty, or every file contains no samples, the initial values
            (inf, -inf) are returned unchanged.
    """
    vmin, vmax = np.inf, -np.inf
    for file in files:
        data, _ = sf.read(file)
        # A file without samples has no min/max; calling .min() on it would
        # raise and abort the whole scan, so skip it instead.
        if data.size == 0:
            continue
        if data.ndim > 1:
            data = data.mean(axis=1)
        vmin = min(vmin, data.min())
        vmax = max(vmax, data.max())
    return vmin, vmax

def extract_all_hist(files, bins):
    """Accumulate one amplitude histogram over all `files`.

    Files are read one by one and their counts are added to a running total,
    so the samples of all files never need to be in memory together.

    Parameters:
        files: iterable of audio file paths
        bins: sequence of bin edges shared by every file
    Returns:
        numpy array of length `len(bins) - 1` with the summed counts
    """
    n_bins = len(bins) - 1
    totals = np.zeros(n_bins)
    for path in files:
        samples = sf.read(path)[0]
        # Arrays with more than one dimension are averaged along axis 1.
        if samples.ndim >= 2:
            samples = samples.mean(axis=1)
        counts = np.histogram(samples, bins)[0]
        np.add(totals, counts, out=totals)
    return totals

def run_part3(input_dir='data/UrbanSound8k/fold1', n_intervals=200, outpath='global_hist.png', limit=None):
    """Compute and save one amplitude histogram covering many audio files.

    The files are scanned twice: once to find the global value range (which
    fixes the bin edges) and once to accumulate the counts per bin.

    Parameters:
        input_dir: root folder searched recursively for .wav files
        n_intervals: number of histogram bins
        outpath: path of the PNG file to write
        limit: optional maximum number of files to use; `None` means no
            limit and `0` means use no files
    Returns:
        None
    """
    pattern = os.path.join(input_dir, '**', '*.wav')
    # glob returns matches in arbitrary order; sorting makes the subset
    # selected by `limit` reproducible.
    files = sorted(glob.glob(pattern, recursive=True))
    # Compare against None so that limit=0 is honoured instead of being
    # treated as "no limit".
    if limit is not None:
        files = files[:limit]
    if not files:
        print('No files found')
        return

    vmin, vmax = find_global_range(files)
    # A non-finite range (inf or NaN) cannot be turned into valid bin edges.
    if not (np.isfinite(vmin) and np.isfinite(vmax)):
        print('No audio samples found')
        return
    bins = np.linspace(vmin, vmax, n_intervals + 1)
    hist_total = extract_all_hist(files, bins)

    # Create the destination folder, as the other parts do for their outputs.
    out_parent = os.path.dirname(outpath)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    plt.figure()
    # Bars are centred on each bin and span its full width.
    plt.bar((bins[:-1] + bins[1:]) / 2, hist_total, width=(bins[1] - bins[0]))
    plt.xlabel("Amplitude")
    plt.ylabel("Count")
    plt.title("Global amplitude histogram")
    plt.savefig(outpath, dpi=150)
    plt.close()

# Example: run_part3('data/UrbanSound8k/fold1', n_intervals=200, limit=50)  # uncomment to run

## Part 4 — create_pngs (inline)
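Functions: `make_hist_file`, `process_folder`

Note: this `process_folder` has a different signature from the Part 1 function of the same name and replaces it once this cell runs. Re-run the Part 1 cell before using the Part 1 version again.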

In [None]:
import soundfile as sf
import os
import matplotlib.pyplot as plt
import glob

def make_hist_file(file, outputdir, bins=100):
    """Save the amplitude histogram of one audio file as a PNG.

    The image is written to `<outputdir>/<name>_hist.png`, where `<name>` is
    the file name of `file` without its extension. `outputdir` is created if
    it does not exist.

    Parameters:
        file: path of the audio file to read
        outputdir: directory where the PNG is written
        bins: number of bins for histogram
    Returns:
        output_path (str): path to the saved PNG file
    """
    data, _ = sf.read(file)
    # Arrays with more than one dimension are averaged along axis 1.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Use a dedicated figure: plotting through the implicit "current figure"
    # would draw on, and then close, any figure the caller still has open.
    fig, ax = plt.subplots()
    ax.hist(data, bins=bins, color="steelblue", alpha=0.7)
    ax.set_xlabel("Amplitude")
    ax.set_ylabel("Count")
    ax.grid(True)

    os.makedirs(outputdir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(file))[0]
    output_path = os.path.join(outputdir, f"{base_name}_hist.png")
    fig.savefig(output_path, dpi=150)
    plt.close(fig)
    return output_path

def process_folder(inputdir, outputdir, bins=100, overwrite=False):
    """Create a histogram PNG for every .wav file, mirroring the folder tree.

    Each file found below `inputdir` gets its histogram written to the same
    relative sub-folder below `outputdir`.

    Parameters:
        inputdir: root folder searched recursively for .wav files
        outputdir: root folder where the PNG files are written
        bins: number of bins for histogram
        overwrite: allow writing into an already existing `outputdir`
    Raises:
        FileExistsError: if `outputdir` exists and `overwrite` is False
        FileNotFoundError: if no .wav file is found under `inputdir`
    """
    if os.path.exists(outputdir) and not overwrite:
        raise FileExistsError(f"Output directory '{outputdir}' already exists. Set overwrite=True to overwrite.")

    # Look for input files BEFORE creating the output directory. Otherwise a
    # run that finds nothing leaves an empty directory behind and the next
    # call fails with FileExistsError.
    pattern = os.path.join(inputdir, '**', '*.wav')
    # Sorted for a reproducible processing order (glob order is arbitrary).
    files = sorted(glob.glob(pattern, recursive=True))
    if not files:
        raise FileNotFoundError(f"There are no .wav files in folder '{inputdir}'.")

    os.makedirs(outputdir, exist_ok=True)

    for file in files:
        # Recreate the file's folder, relative to inputdir, under outputdir.
        rel_path = os.path.relpath(file, inputdir)
        sub_dir = os.path.join(outputdir, os.path.dirname(rel_path))
        os.makedirs(sub_dir, exist_ok=True)
        make_hist_file(file, sub_dir, bins=bins)

# Example: process_folder('data/for_hist_tests', 'out_histograms')  # uncomment to run

---

## Next steps

1. Optionally remove the inline functions from the notebook and import from `src/` if you want to maintain a single source of truth.
2. Run a small subset of files with `limit=...` to validate everything.
3. Consider adding a README cell and small smoke tests.