In [None]:
%%writefile install_mfa.sh
#!/bin/bash

## a script to install Montreal Forced Aligner (MFA)
##
## Usage: bash install_mfa.sh [ROOT_DIR]
##   ROOT_DIR  directory that receives Miniconda and the "aligner" conda env
##             (default: /tmp/mfa)

# Stop at the first failing command, on unset variables and on pipeline failures,
# so a failed download or install never ends with the "DONE" banner below.
set -euo pipefail

root_dir=${1:-/tmp/mfa}
mkdir -p "$root_dir"
cd "$root_dir"

# download miniconda3
# -O overwrites any installer left by a previous run; without it wget saves the
# fresh copy as "<name>.1" and the stale installer would be executed instead.
wget -q --show-progress -O Miniconda3-latest-Linux-x86_64.sh \
  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh -b -p "$root_dir/miniconda3" -f

#install MFA
"$root_dir/miniconda3/bin/conda" create -n aligner -c conda-forge montreal-forced-aligner -y

echo -e "\n======== DONE =========="
echo -e "\nTo activate MFA, run: source $root_dir/miniconda3/bin/activate aligner"
echo -e "\nTo delete MFA, run: rm -rf $root_dir"
echo -e "\nSee: https://montreal-forced-aligner.readthedocs.io/en/latest/aligning.html to know how to use MFA"

Overwriting install_mfa.sh


In [None]:
# Root directory for the Miniconda + MFA install; passed to install_mfa.sh as its first argument
INSTALL_DIR="/tmp/mfa"

# Install MFA
!bash ./install_mfa.sh {INSTALL_DIR}

# Fix joblib version
# NOTE(review): presumably newer joblib releases break this MFA build — confirm and record the reason
!{INSTALL_DIR}/miniconda3/envs/aligner/bin/pip install "joblib<1.3.0"

# (Optional) Install seaborn to resolve dependency warning
!{INSTALL_DIR}/miniconda3/envs/aligner/bin/pip install seaborn

# Fix matplotlib backend error and run help
# MPLBACKEND=Agg selects matplotlib's non-interactive backend for this one command
!MPLBACKEND=Agg {INSTALL_DIR}/miniconda3/bin/conda run -n aligner mfa align --help



PREFIX=/tmp/mfa/miniconda3
Unpacking payload ...

Installing base environment...

Preparing transaction: ...working... done
Executing transaction: ...working... done
installation finished.
    You currently have a PYTHONPATH environment variable set. This may cause
    unexpected behavior when running the Python interpreter in Miniconda3.
    For best results, please verify that your PYTHONPATH only points to
    directories of packages that are compatible with the Python interpreter
    in Miniconda3: /tmp/mfa/miniconda3
Channels:
 - conda-forge
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ | / - \ done

## Package Plan ##

  environment location: /tmp/mfa/miniconda3/envs/aligner

  added / updated specs:
    - montreal-forced-aligner


The foll

In [None]:
# download and unpack ljs dataset
!echo "download and unpack ljs dataset"
!mkdir -p ./ljs; cd ./ljs; wget -q --show-progress https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
!cd ./ljs && echo "Starting extraction..." && tar xjf LJSpeech-1.1.tar.bz2 && echo "Extraction done."


download and unpack ljs dataset
Starting extraction...
Extraction done.


In [None]:
# install sox tool
!sudo apt install -q -y sox
# convert to 16k audio clips
!mkdir ./wav
!echo "normalize audio clips to sample rate of 16k"
!find ./ljs -name "*.wav" -type f -execdir sox --norm=-3 {} -r 16k -c 1 `pwd`/wav/{} \;
!echo "Number of clips" $(ls ./wav/ | wc -l)

Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base
  libsox3 libwavpack1
Suggested packages:
  libsox-fmt-all
The following NEW packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base
  libsox3 libwavpack1 sox
0 upgraded, 7 newly installed, 0 to remove and 34 not upgraded.
Need to get 617 kB of archives.
After this operation, 1,764 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libopencore-amrnb0 amd64 0.1.5-1 [94.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libopencore-amrwb0 amd64 0.1.5-1 [49.1 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libsox3 amd64 14.4.2+git20190427-2+deb11u2ubuntu0.22.04.1 [240 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 libsox-fmt-alsa

In [None]:
# create transcript files from metadata.csv
lines = open('./ljs/LJSpeech-1.1/metadata.csv', 'r').readlines()
from tqdm.auto import tqdm
for line in tqdm(lines):
  fn, _, transcript = line.strip().split('|')
  ident = fn
  open(f'./wav/{ident}.txt', 'w').write(transcript)

# this is an example transcript for LJ001-0001.wav
!cat ./wav/LJ001-0001.txt

  0%|          | 0/13100 [00:00<?, ?it/s]

Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition

In [None]:
# download a pretrained english acoustic model, and english lexicon
!wget -q --show-progress https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/english.zip
!wget -q --show-progress http://www.openslr.org/resources/11/librispeech-lexicon.txt



In [None]:
# see: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/480
import re

# Rewrite every lexicon entry as "<word>\t<phonemes separated by single spaces>".
with open("librispeech-lexicon.txt", encoding="utf-8") as f:
    lexicon = f.readlines()

# Raw string: "\s" inside a plain literal is an invalid escape sequence and triggers a
# warning on current Python versions.
sp = re.compile(r"\s+")
with open("modified_librispeech-lexicon.txt", "w", encoding="utf-8") as f:
    for line in lexicon:
        # first whitespace-separated token is the word, the remainder are its phonemes
        word, *phonemes = sp.split(line.strip())
        phonemes = " ".join(phonemes)
        f.write(f"{word}\t{phonemes}\n")

In [None]:
# Define INSTALL_DIR again if it's not in scope (it is first set in the MFA install cell above)
INSTALL_DIR="/tmp/mfa"

# Run the aligner inside the "aligner" conda env. Positional arguments, in order:
# corpus directory (./wav), pronunciation dictionary, acoustic model, output directory.
# NOTE(review): -t is taken to be MFA's temporary directory and -j the number of parallel
# jobs — confirm against `mfa align --help` for the installed version.
!MPLBACKEND=Agg {INSTALL_DIR}/miniconda3/bin/conda run -n aligner \
  mfa align -t ./temp -j 4 ./wav modified_librispeech-lexicon.txt ./english.zip ./ljs_aligned

# output files are at ./ljs_aligned
!echo "See output files at ./ljs_aligned"

 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/100  [ 0:00:00 < 0:00:00 , 2,155 it/s ]
 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/13,100  [ 0:00:03 < 0:00:00 , ? it/s ]
 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/13,100  [ 0:02:41 < 0:00:00 , 92 it/s ]
 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/13,100  [ 0:00:09 < 0:00:00 , ? it/s ]
 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/13,100  [ 0:00:01 < 0:00:00 , ? it/s ]
 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/13,100  [ 0:04:27 < 0:00:00 , 49 it/s ]
 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/13,100  [ 0:02:18 < 0:00:00 , 95 it/s ]
 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/13,100  [ 0:06:34 < 0:00:00 , 35 it/s ]
 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/13,100  [ 0:01:16 < 0:00:00 , 169 it/s ]
 100% ━━━━━━━━━━━━━━━━━━━━━━━━━━ 13,100/13,100  [ 0:00:20 < 0:00:00 , 704 it/s ]

 INFO     Setting up corpus information...                                      
 INFO     Loading corpus from source files...                                   
 INFO     Found 1 speaker a

In [None]:
import os
import csv
import glob
from tqdm.auto import tqdm # For progress bar

# Define the directory containing the individual transcript (.txt) files
# This should be the same directory as your processed .wav files
source_transcript_dir = "./wav"

# Define the new directory and the output CSV filename
output_dir = "./transcriptions"
output_csv_path = os.path.join(output_dir, "transcriptions.csv")

# 1. Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
print(f"Ensured output directory exists: {output_dir}")

# 2. Find all the .txt files in the source directory
# We sort them to ensure a consistent order (LJSpeech filenames sort naturally)
transcript_files = sorted(glob.glob(os.path.join(source_transcript_dir, "*.txt")))

if not transcript_files:
    print(f"ERROR: No .txt files found in '{source_transcript_dir}'. Please check the path.")
else:
    print(f"Found {len(transcript_files)} transcript files.")

    # 3. Open the CSV file for writing
    # 'newline=""' prevents extra blank rows in the CSV on some systems
    # 'encoding='utf-8'' is good practice for text files
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        # 4. Create a CSV writer object
        csv_writer = csv.writer(csvfile)

        # 5. Write the header row
        csv_writer.writerow(["index", "transcription"])

        # 6. Iterate through the sorted transcript files
        # Use enumerate to get a 1-based index easily
        for idx, txt_filepath in enumerate(tqdm(transcript_files, desc="Writing CSV"), start=1):
            try:
                # Extract the filename (like LJ001-0001.txt) for reference if needed
                # filename = os.path.basename(txt_filepath)

                # Read the transcription content from the .txt file
                with open(txt_filepath, 'r', encoding='utf-8') as f_txt:
                    transcription_text = f_txt.read().strip() # Read and remove leading/trailing whitespace

                # Write the index and the transcription to the CSV
                csv_writer.writerow([idx, transcription_text])

            except Exception as e:
                print(f"Warning: Could not process file {txt_filepath}. Error: {e}")

    print(f"\nSuccessfully created transcriptions file at: {output_csv_path}")
    print(f"Total transcriptions written: {len(transcript_files)}")

    # Optional: Display the first few lines of the created CSV file for verification
    print("\n--- First 5 lines of the CSV file ---")
    !head -n 5 {output_csv_path}
    print("-------------------------------------")

Ensured output directory exists: ./transcriptions
Found 13100 transcript files.


Writing CSV:   0%|          | 0/13100 [00:00<?, ?it/s]


Successfully created transcriptions file at: ./transcriptions/transcriptions.csv
Total transcriptions written: 13100

--- First 5 lines of the CSV file ---
index,transcription
1,"Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition"
2,in being comparatively modern.
3,"For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process"
4,"produced the block books, which were the immediate predecessors of the true printed book,"
-------------------------------------


In [None]:
!pip install textgrid

Collecting textgrid
  Downloading TextGrid-1.6.1.tar.gz (9.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: textgrid
  Building wheel for textgrid (setup.py) ... [?25l[?25hdone
  Created wheel for textgrid: filename=TextGrid-1.6.1-py3-none-any.whl size=10146 sha256=64882212932681c42e2b0ffb4a35c836d6db842f07b3a7b2efad0f8aad0ce71e
  Stored in directory: /root/.cache/pip/wheels/7a/c5/96/5e43aa4c640995fbbb0b9a7b98e6007bfd777add3c7e56d70a
Successfully built textgrid
Installing collected packages: textgrid
Successfully installed textgrid-1.6.1


In [None]:
import os
import glob
import textgrid # Requires 'pip install textgrid'
from tqdm.auto import tqdm
import numpy as np # Needed for checking vowel set

# --- Configuration ---
textgrid_dir = "./ljs_aligned"  # where MFA wrote its TextGrid files
output_base_dir = "./processed_alignments"  # parent of the three output folders below

# One output folder per kind of extracted interval
output_word_dir = os.path.join(output_base_dir, "words")
output_vowel_dir = os.path.join(output_base_dir, "vowels")
output_syllable_dir = os.path.join(output_base_dir, "syllables_heuristic")

for _out_dir in (output_word_dir, output_vowel_dir, output_syllable_dir):
    os.makedirs(_out_dir, exist_ok=True)

# ARPAbet vowel symbols: each base vowel without a stress digit and with stress 0, 1 and 2
_BASE_VOWELS = (
    'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'EH', 'ER',
    'EY', 'IH', 'IY', 'OW', 'OY', 'UH', 'UW',
)
ARPABET_VOWELS = {
    base + stress
    for base in _BASE_VOWELS
    for stress in ('', '0', '1', '2')
}
# --- End Configuration ---

# Find all TextGrid files
tg_files = sorted(glob.glob(os.path.join(textgrid_dir, "*.TextGrid")))

if not tg_files:
    print(f"ERROR: No .TextGrid files found in '{textgrid_dir}'. Please check the path.")
else:
    print(f"Found {len(tg_files)} TextGrid files. Processing...")

    # Process each TextGrid file
    for tg_filepath in tqdm(tg_files, desc="Processing TextGrids"):
        base_filename = os.path.splitext(os.path.basename(tg_filepath))[0]

        try:
            tg = textgrid.TextGrid.fromFile(tg_filepath)

            # --- 1. Extract Words (No change here) ---
            word_output_path = os.path.join(output_word_dir, f"{base_filename}.words.txt")
            with open(word_output_path, 'w', encoding='utf-8') as f_word:
                words_tier = None
                for tier in tg.tiers:
                    if tier.name.lower() == 'words':
                        words_tier = tier
                        break
                if words_tier:
                    for interval in words_tier:
                        if interval.mark and interval.mark.strip(): # Check mark exists and is not just whitespace
                            f_word.write(f"{interval.minTime:.4f}\t{interval.maxTime:.4f}\t{interval.mark.strip()}\n")
                else:
                    print(f"Warning: 'words' tier not found in {tg_filepath}")

            # --- 2. Extract Vowels AND Heuristic Syllables ---
            vowel_output_path = os.path.join(output_vowel_dir, f"{base_filename}.vowels.txt")
            syllable_output_path = os.path.join(output_syllable_dir, f"{base_filename}.syllables.txt") # Use new dir

            # Find the phones tier
            phones_tier = None
            for tier in tg.tiers:
                if tier.name.lower() == 'phones':
                    phones_tier = tier
                    break

            if phones_tier:
                # Store phone intervals (filtering out silence/empty marks)
                phone_intervals = []
                for interval in phones_tier:
                    mark = interval.mark.strip()
                    if mark: # Only keep intervals with actual phone labels
                        phone_intervals.append({
                            "mark": mark,
                            "xmin": interval.minTime,
                            "xmax": interval.maxTime
                        })

                if not phone_intervals: # Skip if no valid phones found
                     print(f"Warning: No valid phone intervals found in {tg_filepath}")
                     continue

                # Open output files
                with open(vowel_output_path, 'w', encoding='utf-8') as f_vowel, \
                     open(syllable_output_path, 'w', encoding='utf-8') as f_syllable:

                    # Iterate through the stored phone intervals to find vowels
                    for i, current_phone in enumerate(phone_intervals):
                        phone_label = current_phone["mark"]

                        if phone_label in ARPABET_VOWELS:
                            # -- Vowel writing (same as before) --
                            vowel_start = current_phone["xmin"]
                            vowel_end = current_phone["xmax"]
                            f_vowel.write(f"{vowel_start:.4f}\t{vowel_end:.4f}\t{phone_label}\n")

                            # -- Heuristic Syllable Boundary Calculation --
                            syllable_start = vowel_start
                            syllable_end = vowel_end

                            # Check phone BEFORE the vowel
                            if i > 0: # If not the first phone
                                prev_phone = phone_intervals[i-1]
                                # If the previous phone is NOT a vowel (i.e., it's a consonant)
                                if prev_phone["mark"] not in ARPABET_VOWELS:
                                    syllable_start = prev_phone["xmin"] # Extend start to include preceding consonant

                            # Check phone AFTER the vowel
                            if i < len(phone_intervals) - 1: # If not the last phone
                                next_phone = phone_intervals[i+1]
                                # If the next phone is NOT a vowel (i.e., it's a consonant)
                                if next_phone["mark"] not in ARPABET_VOWELS:
                                     syllable_end = next_phone["xmax"] # Extend end to include following consonant

                            # Write the calculated syllable boundaries
                            f_syllable.write(f"{syllable_start:.4f}\t{syllable_end:.4f}\t{phone_label}\n")

            else:
                 print(f"Warning: 'phones' tier not found in {tg_filepath}")

        except Exception as e:
            print(f"Error processing file {tg_filepath}: {e}")
            import traceback
            traceback.print_exc() # Print full error for debugging


    print("\nProcessing complete.")
    print(f"Word files written to: {output_word_dir}")
    print(f"Vowel files written to: {output_vowel_dir}")
    print(f"Syllable (heuristic) files written to: {output_syllable_dir}") # Updated path

    # Optional: Display an example of each created file
    if tg_files: # Check if tg_files is not empty
        example_base = os.path.splitext(os.path.basename(tg_files[0]))[0]
        print(f"\n--- Example Output for {example_base} ---")

        example_word_path = f"{output_word_dir}/{example_base}.words.txt"
        if os.path.exists(example_word_path):
             print(f"\n{example_word_path}:")
             !head -n 3 "{example_word_path}"
        else: print(f"\n{example_word_path} not found.")

        example_vowel_path = f"{output_vowel_dir}/{example_base}.vowels.txt"
        if os.path.exists(example_vowel_path):
             print(f"\n{example_vowel_path}:")
             !head -n 3 "{example_vowel_path}"
        else: print(f"\n{example_vowel_path} not found.")

        example_syllable_path = f"{output_syllable_dir}/{example_base}.syllables.txt"
        if os.path.exists(example_syllable_path):
             print(f"\n{example_syllable_path}:")
             !head -n 3 "{example_syllable_path}"
        else: print(f"\n{example_syllable_path} not found.")

        print("---------------------------------------------")
    else:
        print("\nNo TextGrid files found to show examples.")

Found 13100 TextGrid files. Processing...


Processing TextGrids:   0%|          | 0/13100 [00:00<?, ?it/s]


Processing complete.
Word files written to: ./processed_alignments/words
Vowel files written to: ./processed_alignments/vowels
Syllable (heuristic) files written to: ./processed_alignments/syllables_heuristic

--- Example Output for LJ001-0001 ---

./processed_alignments/words/LJ001-0001.words.txt:
0.0000	0.6800	printing
0.7900	1.0000	in
1.0000	1.1600	the

./processed_alignments/vowels/LJ001-0001.vowels.txt:
0.0700	0.1900	IH1
0.3000	0.4500	IH0
0.7900	0.9400	IH1

./processed_alignments/syllables_heuristic/LJ001-0001.syllables.txt:
0.0400	0.2200	IH1
0.2200	0.6800	IH0
0.4500	1.0000	IH1
---------------------------------------------


In [None]:
!pip install numpy scipy librosa tqdm



In [None]:
!pip install resampy



In [None]:
# Cell 1: Data Loader Function
import numpy as np
import os

def load_data_from_txt(base_filename, word_dir, syllable_dir, vowel_dir):
    """
    Read the word, syllable and vowel timing files of one utterance.

    The files `<base_filename>.words.txt`, `<base_filename>.syllables.txt` and
    `<base_filename>.vowels.txt` are looked up in `word_dir`, `syllable_dir` and
    `vowel_dir`. Every usable line has exactly three tab-separated fields
    (start, end, label); other lines are ignored, and a missing file counts as empty.

    Returns a dict of NumPy arrays:
        "word_times"  : (n_words, 2) float, start/end per word
        "words"       : (n_words, 1) bytes ('S32'), UTF-8 encoded word labels
        "syl_times"   : (n_syllables, 2) float, start/end per syllable
        "vowel_start" : (1, n_vowels) float
        "vowel_end"   : (1, n_vowels) float
    or None when the word, syllable or vowel data is empty, or when reading/parsing
    raises (the error is printed).
    """
    def three_field_rows(path):
        # Yield [start, end, label] for each well-formed line; nothing if the file is absent.
        if os.path.exists(path):
            with open(path, 'r', encoding='utf-8') as handle:
                for raw_line in handle:
                    fields = raw_line.strip().split('\t')
                    if len(fields) == 3:
                        yield fields

    words_path = os.path.join(word_dir, f"{base_filename}.words.txt")
    syllables_path = os.path.join(syllable_dir, f"{base_filename}.syllables.txt")
    vowels_path = os.path.join(vowel_dir, f"{base_filename}.vowels.txt")

    word_spans, word_labels = [], []
    syllable_spans = []
    vowel_onsets, vowel_offsets = [], []

    try:
        # Words: keep both the timing and the label
        for begin, finish, label in three_field_rows(words_path):
            word_spans.append([float(begin), float(finish)])
            word_labels.append(label.encode('utf-8'))

        # Syllables: timing only, the label column is not needed
        for begin, finish, _ in three_field_rows(syllables_path):
            syllable_spans.append([float(begin), float(finish)])

        # Vowels: onsets and offsets are kept in separate lists
        for begin, finish, _ in three_field_rows(vowels_path):
            vowel_onsets.append(float(begin))
            vowel_offsets.append(float(finish))

        loaded = {
            "word_times": np.array(word_spans, dtype=float),
            "words": np.array(word_labels, dtype='S32').reshape(-1, 1),
            "syl_times": np.array(syllable_spans, dtype=float),
            "vowel_start": np.array(vowel_onsets, dtype=float).reshape(1, -1),
            "vowel_end": np.array(vowel_offsets, dtype=float).reshape(1, -1),
        }

        # All three kinds of timing data are required
        required = ("word_times", "syl_times", "vowel_start")
        if any(loaded[key].size == 0 for key in required):
            return None

        return loaded

    except Exception as e:
        print(f"Error loading data for {base_filename}: {e}")
        return None

In [None]:
# Cell 2: Feature Computation Functions
import numpy
import numpy as np
import wave
import struct
import math
from scipy.signal import butter, lfilter
from scipy.stats import norm
from scipy.stats.mstats import gmean
import librosa # Make sure librosa is imported
from scipy.signal import medfilt # Make sure medfilt is imported

# --- Helper functions from myfunctions.py ---
# They are included directly in this cell (rather than imported) so the notebook
# is self-contained in Colab.
# The definitions below run from spectral_selection through vocoder_func.
def spectral_selection(x, n):
    """
    Keep, for every column of `x`, its `n` largest values in their original row order.

    Parameters
    ----------
    x : 2-D ndarray, shape (row, col)
    n : int
        Number of values to keep per column. Values larger than `row` keep all rows.

    Returns
    -------
    ndarray, shape (min(n, row), col); shape (n, 0) when `x` has no columns.
    The dtype is `x`'s dtype promoted with the default integer type, which matches what
    the earlier column-by-column implementation produced.

    Notes
    -----
    * An input with zero columns used to raise AttributeError (the accumulator was still
      a list when its `.shape` was read); it now returns the empty (n, 0) array.
    * `n > row` used to turn into a negative slice start and silently drop rows.
    * Ties at the selection boundary are resolved with a stable sort, so among equal
      values the ones in later rows are kept.
    """
    row, col = x.shape[0], x.shape[1]

    if col == 0:
        return np.array([]).reshape(n, 0)

    # Row indices of each column in ascending value order; the last n are the largest.
    order = np.argsort(x, axis=0, kind='stable')
    first_kept = max(row - n, 0)  # clamp so n > row cannot wrap around as a negative index
    # Sorting the kept indices restores the original top-to-bottom row order.
    kept_rows = np.sort(order[first_kept:row, :], axis=0)

    selected = np.take_along_axis(x, kept_rows, axis=0)
    return selected.astype(np.result_type(x.dtype, np.int_), copy=False)

def temp_vec_corr(x2, t_sigma):
    """
    Correlation-like score across the columns of `x2`, one value per row.

    Columns are weighted by a normal pdf centred on the middle column (standard
    deviation `t_sigma`). For each row, the products of all distinct pairs of weighted
    columns are summed, divided by the number of pairs and square-rooted. NaNs (e.g.
    from a negative mean product) are replaced via `np.nan_to_num`.

    Returns a 1-D array of length `x2.shape[0]`; all zeros when `x2` has fewer than two
    columns.
    """
    n_rows, n_cols = x2.shape[0], x2.shape[1]

    # No column pairs exist with 0 or 1 columns
    if n_cols <= 1:
        return np.zeros(n_rows)

    # Normal-pdf weights over column positions 1..n_cols, centred on (n_cols + 1) / 2
    taper = norm.pdf(np.arange(1, n_cols + 1, 1), (n_cols + 1) / 2, t_sigma)
    weighted = np.zeros((n_rows, n_cols))
    weighted[:, :] = np.multiply(x2, taper)

    # Accumulate the per-row product of every pair (left < right) of weighted columns
    pair_total = 0.0
    for left in range(n_cols - 1):
        for right in range(left + 1, n_cols):
            pair_total += np.multiply(weighted[:, left], weighted[:, right])

    pair_count = (n_cols * (n_cols - 1)) / 2
    score = np.sqrt(np.divide(pair_total, pair_count))
    return np.nan_to_num(score)


def temporal_corr(x, win, t_sigma):
    """
    Apply `temp_vec_corr` to a sliding window centred on every column of `x`.

    `x` is zero-padded with (win - 1) // 2 columns on each side, so each of the `col`
    windows spans 2 * ((win - 1) // 2) + 1 columns. `t_sigma` is forwarded to
    `temp_vec_corr` unchanged.

    Returns an array of shape (row, col); an empty (row, 0) array when `x` has no columns.
    """
    half = (win - 1) // 2
    n_rows, n_cols = x.shape[0], x.shape[1]

    if n_cols == 0:
        return np.array([]).reshape(n_rows, 0)

    # Zero margins in x's own dtype so the first and last columns get full-width windows
    margin = np.zeros((n_rows, half), dtype=x.dtype)
    padded = np.hstack((margin, x, margin))

    width = 2 * half + 1
    per_column = [
        temp_vec_corr(padded[:, first:first + width], t_sigma)
        for first in range(n_cols)
    ]

    # Each entry is a length-`row` vector; stack them as columns
    return np.vstack(per_column).T


def spectral_corr(x):
    """
    Correlation-like score across the rows of `x`, one value per column.

    For each column, the products of all distinct row pairs are summed, divided by the
    number of pairs and square-rooted; NaNs are replaced via `np.nan_to_num`.

    Returns an array of shape (1, col). With a single row the input is returned as is;
    with no rows an empty (0, col) array is returned.
    """
    n_bands, n_frames = x.shape[0], x.shape[1]

    # Fewer than two rows: there are no pairs to combine
    if n_bands == 0:
        return np.array([]).reshape(0, n_frames)
    if n_bands == 1:
        return x

    pair_total = np.zeros(n_frames, dtype=float)
    for upper in range(n_bands - 1):
        for lower in range(upper + 1, n_bands):
            pair_total += np.multiply(x[upper, :], x[lower, :])

    pair_count = (n_bands * (n_bands - 1)) / 2
    score = np.nan_to_num(np.sqrt(np.divide(pair_total, pair_count)))

    # Callers expect a 1 x col row vector
    return score.reshape(1, n_frames)

def statFunctions_Syl(t):
    """
    Eight summary statistics of a contour, returned as an (8, 1) column vector.

    Order: median, mean, geometric mean, range, standard deviation, temporal standard
    deviation, temporal skewness, temporal kurtosis. The "temporal" moments treat the
    normalised contour as a distribution over sample positions 1..N.

    Before any statistic is taken the contour is flattened, shifted so its minimum is 0
    if it contains negative values, and offset by 1e-9 if any value is still <= 0 (so the
    geometric mean is defined). An empty input gives an all-zero (8, 1) vector.
    """
    if t.size == 0:
        return np.zeros((8, 1))

    values = t.flatten()

    # Shift negative contours up to zero, then nudge away from zero for the geometric mean
    lowest = np.min(values)
    if lowest < 0:
        values = values - lowest
    if np.any(values <= 0):
        values = values + 1e-9

    try:
        geometric_mean = gmean(np.absolute(values))
    except ValueError:
        geometric_mean = 0

    basic_stats = np.array([
        np.median(values),
        np.mean(values),
        geometric_mean,
        np.max(values) - np.min(values),
        np.std(values),
    ])

    # Temporal moments; they stay 0 when the contour has (almost) no mass
    temporalStd = 0
    temporalSkewness = 0
    temporalKurthosis = 0
    mass = np.sum(values)
    if not (mass <= 1e-9):
        weights = values / mass
        positions = np.arange(1, len(weights) + 1)
        centre = np.sum(positions * weights)
        variance = np.sum(((positions - centre) ** 2) * weights)
        temporalStd = np.sqrt(variance) if variance >= 0 else 0

        # Higher moments are only defined for a non-degenerate spread
        if temporalStd > 1e-9:
            temporalSkewness = np.sum(((positions - centre) ** 3) * weights) / (temporalStd ** 3)
            temporalKurthosis = np.sum(((positions - centre) ** 4) * weights) / (temporalStd ** 4)

    temporal_stats = np.array([temporalStd, temporalSkewness, temporalKurthosis])
    return np.concatenate((basic_stats, temporal_stats)).reshape(-1, 1)


def statFunctions_Vwl(t):
    """
    Seven summary statistics of a contour, returned as a (7, 1) column vector.

    Order: median, mean, range, standard deviation, temporal standard deviation,
    temporal skewness, temporal kurtosis. The "temporal" moments treat the normalised
    contour as a distribution over sample positions 1..N.

    The contour is flattened and, if it contains negative values, shifted so its minimum
    is 0. An empty input gives an all-zero (7, 1) vector.
    """
    if t.size == 0:
        return np.zeros((7, 1))

    values = t.flatten()
    lowest = np.min(values)
    if lowest < 0:
        values = values - lowest

    basic_stats = np.array([
        np.median(values),
        np.mean(values),
        np.max(values) - np.min(values),
        np.std(values),
    ])

    # Temporal moments; they stay 0 when the contour has (almost) no mass
    temporalStd = 0
    temporalSkewness = 0
    temporalKurthosis = 0
    mass = np.sum(values)
    if not (mass <= 1e-9):
        weights = values / mass
        positions = np.arange(1, len(weights) + 1)
        centre = np.sum(positions * weights)
        variance = np.sum(((positions - centre) ** 2) * weights)
        temporalStd = np.sqrt(variance) if variance >= 0 else 0

        # Higher moments are only defined for a non-degenerate spread
        if temporalStd > 1e-9:
            temporalSkewness = np.sum(((positions - centre) ** 3) * weights) / (temporalStd ** 3)
            temporalKurthosis = np.sum(((positions - centre) ** 4) * weights) / (temporalStd ** 4)

    temporal_stats = np.array([temporalStd, temporalSkewness, temporalKurthosis])
    return np.concatenate((basic_stats, temporal_stats)).reshape(-1, 1)

def smooth(t_cor, swin, sigma):
    """
    Smooth a contour with a Gaussian-shaped kernel of ``swin`` taps.

    The kernel is the normal pdf evaluated at 1..swin, centred at
    ``(swin + 1) / 2`` with standard deviation ``sigma``. The full convolution
    is cropped so the output has (at most) as many samples as the input.

    Args:
        t_cor: np.ndarray contour (expected 1xN); flattened internally.
        swin: Number of kernel taps.
        sigma: Standard deviation of the kernel.

    Returns:
        np.ndarray of shape (1, N), or an empty 1-D array when the input is
        empty or the cropped window is empty.
    """
    if t_cor.size == 0:
        return np.array([])

    signal = t_cor.flatten()
    taps = np.arange(1, swin + 1)
    kernel = norm.pdf(taps, (swin + 1) / 2, sigma).flatten()

    full = np.convolve(signal, kernel, mode='full')

    # Crop the 'full' result back to the input length, centred on the kernel
    offset = int(np.floor((swin - 1) / 2))
    first = max(0, offset)
    last = min(len(full), offset + len(signal))

    if first >= last:
        return np.array([])

    return full[first:last].reshape(1, -1)

# Removed get_labels as it wasn't used in feats_copy.py

def vocoder_func(wavPath):
    """
    Compute per-frame sub-band energies of a WAV file with a 19-band vocoder.

    The signal is peak-normalised and cut into 20 ms frames with a 10 ms hop.
    Every frame is band-pass filtered per band, rectified and low-pass
    filtered at 50 Hz; the last sample of the low-pass output is taken as the
    band energy of that frame, floored, and compressed with
    ``exp(2 * ln(E) / ln(10))``.

    Args:
        wavPath: Path of the audio file to analyse.

    Returns:
        np.ndarray of shape [19, nFrames]. An empty ``(19, 0)`` array is
        returned when the file cannot be read, is empty, or is shorter than
        one analysis window.
    """

    # FILTER DEFINITIONS
    def butter_bandpass(lowcut, highcut, fs, order):
        """Design a Butterworth band-pass; edges are clamped into (0, 1) of Nyquist."""
        nyq = 0.5*fs
        low = float(lowcut) / nyq
        high = float(highcut) / nyq
        # Clamp high to avoid issues if highcut >= nyq
        high = min(high, 0.9999) # Ensure high < 1.0
        # Ensure low < high
        if low >= high:
            low = high * 0.99 # Adjust low if necessary
        if low <= 0:
            low = 1e-6 # Ensure low > 0

        try:
            b, a = butter(order, [low, high], btype='band')
        except ValueError as e:
            print(f"Butterworth filter error for low={low}, high={high}, fs={fs}, order={order}: {e}")
            # Fall back to a pass-through filter so processing can continue
            return np.array([1]), np.array([1, 0])
        return b, a

    def butter_lowpass(lowcut, fs, order):
        """Design a Butterworth low-pass; the cutoff is clamped into (0, 1) of Nyquist."""
        nyq = 0.5*fs
        low = float(lowcut) / nyq
        low = max(1e-6, min(low, 0.9999)) # Clamp low frequency
        try:
            b, a = butter(order, low, btype='lowpass')
        except ValueError as e:
            print(f"Butterworth lowpass filter error for low={low}, fs={fs}, order={order}: {e}")
            return np.array([1]), np.array([1, 0])
        return b, a

    def apply_filter(coeffs, data, stage):
        """Run lfilter with pre-designed (b, a); return the input unchanged on failure."""
        b, a = coeffs
        try:
            return lfilter(b, a, data)
        except ValueError as e:
            print(f"lfilter error during {stage}: {e}. Returning original data.")
            return data

    # FUNCTION TO READ A .wav FILE (uses librosa for robustness)
    def readWavLibrosa(wavPath):
        """Load the file as mono at its native rate and scale it to unit peak."""
        try:
            data, fs = librosa.load(wavPath, sr=None, mono=True)
            length = len(data)
            # Normalize
            max_val = np.max(np.abs(data))
            if max_val > 0:
                data = data / max_val
            else:
                data = np.zeros_like(data) # Handle silent files
            return data, fs, length
        except Exception as e:
            print(f"Error reading WAV {wavPath} with librosa: {e}")
            return np.array([]), 0, 0 # Return empty/default values on error

    # BUFFER FUNCTION AS DEFINED IN MATLAB (Using librosa's framing)
    def frame_audio(x, frame_length, hop_length):
        """Slice x into overlapping frames; result is [frame_length, n_frames]."""
        # librosa.util.frame does NOT pad: it raises when x is shorter than
        # frame_length, so the caller must guard against short signals.
        x = np.ascontiguousarray(x)
        return librosa.util.frame(x, frame_length=frame_length, hop_length=hop_length)

    # Band centre frequencies and bandwidths in Hz
    fltcF= np.array([240,360,480,600,720,840,1000,1150,1300,1450,1600,1800,2000,2200,2400,2700,3000,3300,3750])
    fltBW= np.array([120,120,120,120,120,120,150,150,150,150,150,200,200,200,200,300,300,300,500])

    # Row 0: lower band edges, row 1: upper band edges
    fltFc= np.array([np.subtract(fltcF,np.divide(fltBW,2)),np.add(fltcF,np.divide(fltBW,2))])
    fltLpFc= 50
    numBands = fltFc.shape[1] # Should be 19

    sig, Fs, length = readWavLibrosa(wavPath)

    # Handle read errors
    if length == 0 or Fs == 0:
        return np.empty((numBands, 0)) # Shape [nBands, nFrames]

    nWndw = int(round(Fs*0.02))
    nOverlap = int(round(Fs*0.01))
    nHop = nWndw - nOverlap

    # Framing would raise on a degenerate window/hop or on a signal shorter
    # than one window; report "no frames" the same way as an unreadable file.
    if nWndw < 1 or nHop < 1 or length < nWndw:
        return np.empty((numBands, 0))

    # Normalize signal
    max_abs_sig = np.max(np.abs(sig))
    if max_abs_sig > 0:
        sig = 0.99 * sig / max_abs_sig
    else:
        sig = np.zeros_like(sig) # Handle silent signal

    # Windowing first and filtering next
    sigFrames = frame_audio(sig, nWndw, nHop) # Shape [nWndw, nFrames]
    numFrames = sigFrames.shape[1]
    subBandEnergies= np.zeros([numBands, numFrames])

    # The filter designs depend only on the band and Fs, so build them once
    # instead of once per frame and band.
    bandCoeffs = [butter_bandpass(fltFc[0, i], fltFc[1, i], Fs, 2) for i in range(numBands)]
    lowpassCoeffs = butter_lowpass(float(fltLpFc), Fs, 2)

    for j in range(numFrames): # Iterate through frames
        currFrame = sigFrames[:,j]
        for i in range(numBands): # Iterate through filter bands
            fltFrame = apply_filter(bandCoeffs[i], currFrame, "bandpass")
            rectFrame = np.abs(fltFrame)
            lpFltFrame = apply_filter(lowpassCoeffs, rectFrame, "lowpass")
            # Energy is the last sample of the low-pass filtered frame
            if len(lpFltFrame) > 0:
                currEnergy = lpFltFrame[-1]
            else:
                currEnergy = 0 # Handle empty filter output

            # Apply floor and log transformation as in original code.
            # NOTE(review): sig is scaled to a 0.99 peak above, so this "< 1"
            # floor presumably triggers for nearly every frame — confirm
            # whether the original MATLAB code worked on integer-scaled samples.
            if currEnergy < 1:
                currEnergy = 0.5
            # Original MATLAB: subBandEnergies(i,j) = exp(2*log(currEnergy)/log(10));
            subBandEnergies[i,j] = math.exp(2*math.log(currEnergy)/math.log(10)) # Base e log

    # Shape [nBands, nFrames] as expected by compute_prosody_features_single
    return subBandEnergies

# --- Modified Feature Computation Function ---

def _normalized_tcssbc(eng_full, bands_1based, num_select, twin, t_sigma, swin, s_sigma):
    """Build a peak-normalised TCSSBC contour (1xN) from the given vocoder bands; None if it is empty."""
    band_idx = np.array(bands_1based) - 1  # 1-based band numbers -> 0-based rows
    band_idx = band_idx[band_idx < eng_full.shape[0]]
    if len(band_idx) == 0:
        selected = np.zeros((0, eng_full.shape[1]))  # Handle empty selection
    else:
        selected = eng_full[band_idx, :]
        if selected.shape[0] > num_select:
            selected = spectral_selection(selected, num_select)

    t_cor = temporal_corr(selected, int(twin), t_sigma)
    s_cor = spectral_corr(t_cor)
    contour = smooth(s_cor, swin, s_sigma)
    if contour.size == 0:
        # Must be checked before np.max, which raises on an empty array
        return None

    peak = np.max(contour)
    if peak > 1e-9:
        return contour / peak
    return np.zeros_like(contour)


def _median_smooth_span(contour, start, end):
    """Apply a 3-point median filter to contour[0, start:end] in place (spans shorter than 3 are left as-is)."""
    segment = contour[0, start:end]
    if len(segment) >= 3:
        filtered = medfilt(segment, 3)
        # medfilt zero-pads at the borders, so copy the neighbouring values
        filtered[0] = filtered[1]
        filtered[-1] = filtered[-2]
        contour[0, start:end] = filtered


def _resample_segment(segment_row, target_len):
    """Resample a 1xN contour segment, treating N as the source rate and target_len as the target rate."""
    seg_len = segment_row.shape[1]
    if seg_len == target_len:
        return segment_row
    resampled = librosa.resample(segment_row.flatten(), orig_sr=seg_len, target_sr=target_len, res_type='kaiser_best')
    return resampled.reshape(1, -1)


def compute_prosody_features_single(wavFile, words_array, word_times_array, syl_times_array, vowel_start_array, vowel_end_array):
    """
    Computes prosody features for a single audio file using pre-loaded boundary info.
    Adapted from the original feats_copy.py script.

    Args:
        wavFile: Path of the audio file (passed to ``vocoder_func``).
        words_array: Word labels; accepted for interface compatibility, not used here.
        word_times_array: Array of word [start, end] times in seconds, or None/empty,
            in which case the whole file is treated as a single word.
        syl_times_array: Array whose columns 0 and 1 are syllable start/end times (s).
        vowel_start_array: Vowel start times (s); entry j is paired with syllable j.
        vowel_end_array: Vowel end times (s); entry j is paired with syllable j.

    Returns:
        np.ndarray of shape [19, nEmitted] with one column per syllable that
        produced features (syllables that start before every word, or that are
        empty after clipping to the contour, emit no column), or None when the
        inputs or intermediate contours are unusable. Rows, top to bottom:
        syllable statistics from ``statFunctions_Syl`` (8), syllable share of
        its word's contour sum (1), vowel statistics from ``statFunctions_Vwl``
        (7), vowel share of its word's contour sum (1), vowel length (1),
        syllable length (1). The two length rows are in frames, normalised to
        fractions within words that contributed more than one column.
    """
    twin = 5
    t_sigma = 1.4
    swin = 7
    s_sigma = 1.5

    vwlSB_num = 4
    vowelSB = [1,2,4,5,6,7,8,13,14,15,16,17] # Indices are 1-based
    sylSB_num = 5
    sylSB = [1,2,3,4,5,6,13,14,15,16,17,18] # Indices are 1-based

    if syl_times_array is None or vowel_start_array is None or vowel_end_array is None or \
       syl_times_array.size == 0 or vowel_start_array.size == 0:
        return None # Indicate failure

    eng_full = vocoder_func(wavFile)
    # Handle vocoder failure
    if eng_full is None or eng_full.size == 0:
        return None

    # --- Processing timing info ---
    spurtStartTime = syl_times_array[:, 0]
    spurtEndTime = syl_times_array[:, 1]

    # All times are measured relative to the first syllable start
    ref_start_time = spurtStartTime[0]

    def to_frames(times):
        # 100 frames per second relative to the reference start time
        return np.round((times - ref_start_time) * 100).astype(int)

    spurtStartFrame = to_frames(spurtStartTime)
    spurtEndFrame = to_frames(spurtEndTime)
    vowelStartFrame = to_frames(vowel_start_array.flatten())
    vowelEndFrame = to_frames(vowel_end_array.flatten())

    # Word times (handle potential None or empty)
    if word_times_array is not None and word_times_array.size > 0:
        startWordFrame = to_frames(word_times_array[:, 0])
        endWordFrame = to_frames(word_times_array[:, 1])
        num_words = len(startWordFrame)
    else:
        # If no word info, create dummy boundaries covering the whole file.
        # This allows feature calculation but word-based normalization won't work correctly.
        startWordFrame = np.array([0])
        endWordFrame = np.array([eng_full.shape[1]])
        num_words = 1

    # --- TCSSBC computation ---
    sylTCSSBC = _normalized_tcssbc(eng_full, sylSB, sylSB_num, twin, t_sigma, swin, s_sigma)
    if sylTCSSBC is None:
        return None
    vwlTCSSBC = _normalized_tcssbc(eng_full, vowelSB, vwlSB_num, twin, t_sigma, swin, s_sigma)
    if vwlTCSSBC is None:
        return None

    num_frames = min(sylTCSSBC.shape[1], vwlTCSSBC.shape[1])
    if num_frames == 0:
        return None

    # --- Preprocessing: median filtering within word boundaries ---
    word_Sylsum = np.zeros(num_words)
    word_Vwlsum = np.zeros(num_words)
    for j in range(num_words):
        temp_start = max(0, startWordFrame[j])
        temp_end = min(num_frames, endWordFrame[j])
        if temp_start >= temp_end:
            continue

        _median_smooth_span(sylTCSSBC, temp_start, temp_end)
        _median_smooth_span(vwlTCSSBC, temp_start, temp_end)

        word_Sylsum[j] = np.sum(sylTCSSBC[0, temp_start:temp_end])
        word_Vwlsum[j] = np.sum(vwlTCSSBC[0, temp_start:temp_end])

    sylTCSSBC = np.nan_to_num(sylTCSSBC)
    vwlTCSSBC = np.nan_to_num(vwlTCSSBC)

    # --- Map syllables to words ---
    num_syllables = len(spurtStartFrame)
    wordIndication = np.zeros(num_syllables, dtype=int)
    if num_words > 1 or (num_words == 1 and startWordFrame[0] != 0): # Check if real word boundaries exist
        for j in range(num_syllables):
            possible_words = np.where(startWordFrame <= spurtStartFrame[j])[0]
            # -1 marks a syllable that starts before every word
            wordIndication[j] = possible_words[-1] if len(possible_words) > 0 else -1
    # else: all syllables stay assigned to the single 'file boundary' word

    # --- Generating the features per syllable ---
    feature_cols = []
    col_word_ids = []  # word index of each emitted column, kept aligned with feature_cols
    for j in range(num_syllables):
        word_ind = wordIndication[j]
        if word_ind == -1:
            continue

        syl_start_f = max(0, spurtStartFrame[j])
        syl_end_f = min(num_frames, spurtEndFrame[j])
        if syl_start_f >= syl_end_f:
            continue

        currFtr1SylSeg = sylTCSSBC[0, syl_start_f:syl_end_f].reshape(1, -1)
        syl_len_actual = currFtr1SylSeg.shape[1]

        currSylFtrs = statFunctions_Syl(_resample_segment(currFtr1SylSeg, 30))

        word_syl_sum_for_norm = word_Sylsum[word_ind]
        syl_sum_norm = np.sum(currFtr1SylSeg) / word_syl_sum_for_norm if word_syl_sum_for_norm > 1e-9 else 0
        currSylFtrs = np.vstack((currSylFtrs, np.array([[syl_sum_norm]]))) # 9x1

        # Defaults used when syllable j has no (or an empty) vowel segment
        currVowelFtrs = np.zeros((7+1, 1))
        vowel_seg_len_actual = 0
        if j < len(vowelStartFrame) and j < len(vowelEndFrame):
            vwl_start_f = max(0, vowelStartFrame[j])
            vwl_end_f = min(num_frames, vowelEndFrame[j])
            if vwl_start_f < vwl_end_f:
                currFtr1VowelSeg = vwlTCSSBC[0, vwl_start_f:vwl_end_f].reshape(1, -1)
                vowel_seg_len_actual = currFtr1VowelSeg.shape[1]

                currVowelFtrs = statFunctions_Vwl(_resample_segment(currFtr1VowelSeg, 20))

                word_vwl_sum_for_norm = word_Vwlsum[word_ind]
                vwl_sum_norm = np.sum(currFtr1VowelSeg) / word_vwl_sum_for_norm if word_vwl_sum_for_norm > 1e-9 else 0
                currVowelFtrs = np.vstack((currVowelFtrs, np.array([[vwl_sum_norm]]))) # 8x1

        feature_cols.append(np.vstack((currSylFtrs, currVowelFtrs, [[vowel_seg_len_actual]], [[syl_len_actual]])))
        col_word_ids.append(word_ind)

    if not feature_cols:
        return np.array([]).reshape(19, 0)

    ftrs = np.hstack(feature_cols) # Shape [19, nEmitted]

    # --- Post-processing: Normalize lengths within words ---
    # Group by the word id of each emitted column rather than by original
    # syllable index: skipped syllables have no column, so syllable indices
    # would address the wrong columns (or run past the end of ftrs).
    col_word_ids = np.array(col_word_ids)
    for iterWrd_idx in np.unique(col_word_ids):
        cols_in_word = np.where(col_word_ids == iterWrd_idx)[0]
        # NOTE(review): words contributing a single column keep raw frame
        # counts in the length rows, as in the previous version — confirm intended.
        if len(cols_in_word) > 1:
            vowel_lengths_in_word = ftrs[-2, cols_in_word]
            syllable_lengths_in_word = ftrs[-1, cols_in_word]

            sum_vowel_len = np.sum(vowel_lengths_in_word)
            if sum_vowel_len > 0:
                ftrs[-2, cols_in_word] = vowel_lengths_in_word / sum_vowel_len
            else:
                ftrs[-2, cols_in_word] = 0

            sum_syllable_len = np.sum(syllable_lengths_in_word)
            if sum_syllable_len > 0:
                ftrs[-1, cols_in_word] = syllable_lengths_in_word / sum_syllable_len
            else:
                ftrs[-1, cols_in_word] = 0

    return ftrs

# --- End of Modified Feature Computation ---

In [None]:
# Cell 3: Main Batch Processing Script with Multiprocessing
import glob
import time
import os
import numpy as np
from tqdm.auto import tqdm # Use tqdm for progress bar
import concurrent.futures # For multiprocessing
import functools # For partial function application

# Make sure the functions from Cell 1 and Cell 2 are accessible
# (They should be if you run the cells in order in Colab)

# --- Configuration ---
# Upper bound on how many WAV files (taken in sorted order) this run processes;
# the value is also embedded in the output filename below.
NUM_FILES_TO_PROCESS = 6000 # Limit on the number of files to process
NUM_WORKERS = 2            # Number of parallel processes (adjust as needed, e.g., 2 or 4)

# Directories (relative to the notebook's working directory)
wav_dir = "./wav"
word_dir = "./processed_alignments/words"
syllable_dir = "./processed_alignments/syllables_approx" # Using the approximated syllables
vowel_dir = "./processed_alignments/vowels"

# Output file to save all computed features (one array per base filename)
output_feature_file = f"./lj_speech_prosody_features_{NUM_FILES_TO_PROCESS}.npz" # Name reflects the file limit

# --- Worker Function for Multiprocessing ---
def process_single_file(wav_filepath, word_dir, syllable_dir, vowel_dir):
    """
    Worker entry point: load the timing data for one WAV file and compute its features.

    Any exception raised while loading or computing is reported on stdout and
    turned into a failure result, so one bad file does not abort the batch.

    Args:
        wav_filepath: Path of the WAV file; its stem is used as the base filename.
        word_dir: Directory passed to ``load_data_from_txt`` for word timings.
        syllable_dir: Directory passed to ``load_data_from_txt`` for syllable timings.
        vowel_dir: Directory passed to ``load_data_from_txt`` for vowel timings.

    Returns:
        ``(base_filename, features)`` on success, or ``(base_filename, None)``
        when timing data is missing, no features were produced, or an error occurred.
    """
    base_filename = os.path.splitext(os.path.basename(wav_filepath))[0]
    failure = (base_filename, None)

    try:
        # Step 1: timing information (function from Cell 1)
        timing_data = load_data_from_txt(base_filename, word_dir, syllable_dir, vowel_dir)
        if timing_data is None:
            return failure

        # Step 2: feature extraction (function from Cell 2); the key order
        # matches the positional parameters after the wav path
        timing_keys = ("words", "word_times", "syl_times", "vowel_start", "vowel_end")
        timing_args = [timing_data[key] for key in timing_keys]
        features = compute_prosody_features_single(wav_filepath, *timing_args)

        # Step 3: treat a missing or empty feature matrix as a failure
        if features is None or not features.size > 0:
            return failure
        return base_filename, features

    except Exception as e:
        # Print critical errors even during parallel execution
        print(f"!! Critical Error processing {base_filename}: {e}")
        return failure

# --- Main Execution Logic ---
all_computed_features = {}
start_time = time.time()

# Sorted listing keeps the "first N files" selection stable between runs
all_wav_files = sorted(glob.glob(os.path.join(wav_dir, "*.wav")))
wav_files_to_process = all_wav_files[:NUM_FILES_TO_PROCESS]

print(f"Found {len(all_wav_files)} total WAV files. Processing first {len(wav_files_to_process)}.")
print(f"Using {NUM_WORKERS} parallel workers.")

if wav_files_to_process:
    processed_count = 0
    error_count = 0

    # Bind the directory arguments so the pool only has to supply the wav path
    worker_func = functools.partial(
        process_single_file,
        word_dir=word_dir,
        syllable_dir=syllable_dir,
        vowel_dir=vowel_dir,
    )

    # Fan the files out over worker processes; tqdm wraps the result
    # iterator so progress advances as results are consumed
    with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_WORKERS) as executor:
        results = list(
            tqdm(
                executor.map(worker_func, wav_files_to_process),
                total=len(wav_files_to_process),
                desc="Computing prosody features",
            )
        )

    # Split worker results into successes (stored by base filename) and failures
    print("\nCollecting results...")
    for base_filename, features in results:
        if features is None:
            error_count += 1
            continue
        all_computed_features[base_filename] = features
        processed_count += 1

    # --- Summary and save ---
    end_time = time.time()
    total_seconds = end_time - start_time
    seconds_per_file = total_seconds / processed_count if processed_count > 0 else 0
    print("\n--- Processing Summary ---")
    print(f"Successfully processed: {processed_count} files")
    print(f"Failed/Skipped:       {error_count} files")
    print(f"Total time:           {total_seconds:.2f} seconds ({seconds_per_file:.2f} s/file avg)")

    if not all_computed_features:
        print("\nNo features were successfully computed to save.")
    else:
        print(f"\nSaving computed features for {len(all_computed_features)} files to {output_feature_file}...")
        np.savez_compressed(output_feature_file, **all_computed_features)
        print("Features saved successfully.")
else:
    print("Error: No WAV files found or selected in specified directory. Exiting.")

Found 13100 total WAV files. Processing first 6000.
Using 2 parallel workers.


Computing prosody features:   0%|          | 0/6000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Bundle the WAV folder and the alignment outputs into a single archive
# (zip flags: -q quiet, -r recurse into directories).
print("Zipping required folders... This may take a while depending on size.")
!zip -q -r ./data_to_download.zip ./wav ./processed_alignments
# NOTE: a failing `!` shell command does not raise in Python, so this message
# is printed even if zip did not succeed.
print("Zipping complete: data_to_download.zip created.")

# Optional: Check the size of the created zip file
!du -sh ./data_to_download.zip

Zipping required folders... This may take a while depending on size.
Zipping complete: data_to_download.zip created.
2.3G	./data_to_download.zip


In [None]:
# google.colab is only importable inside a Google Colab runtime.
from google.colab import files

print("Preparing download...")
# Trigger the browser download for the zip file created by the previous cell
files.download('./data_to_download.zip')

Preparing download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import glob
import shutil
from tqdm.auto import tqdm

# --- Configuration ---
NUM_FILES_TO_COPY = 6000

# Where the full data set lives
source_wav_dir = "./wav"
source_processed_dir = "./processed_alignments"
source_word_dir = os.path.join(source_processed_dir, "words")
source_vowel_dir = os.path.join(source_processed_dir, "vowels")
source_syllable_dir = os.path.join(source_processed_dir, "syllables_approx")

# Where the subset is written (same layout, new top-level names)
target_wav_dir = "./wav2"
target_processed_dir = "./processed_alignments2"
target_word_dir = os.path.join(target_processed_dir, "words")
target_vowel_dir = os.path.join(target_processed_dir, "vowels")
target_syllable_dir = os.path.join(target_processed_dir, "syllables_approx")

# --- Steps ---

# 1. Make sure every target directory exists (parents are created implicitly)
print("Creating target directories...")
for new_dir in (target_wav_dir, target_word_dir, target_vowel_dir, target_syllable_dir):
    os.makedirs(new_dir, exist_ok=True)
print("Target directories created.")

# 2. Pick the first N wav files in sorted order so the subset is reproducible
all_wav_files_sorted = sorted(glob.glob(os.path.join(source_wav_dir, "*.wav")))
files_to_copy = all_wav_files_sorted[:NUM_FILES_TO_COPY]

print(f"Identified {len(files_to_copy)} files to copy.")

# 3. Copy each wav together with whichever of its alignment files exist
processed_count = 0
error_count = 0
print(f"Starting copy process for {len(files_to_copy)} samples...")

# (label used in warnings, source dir, target dir, filename infix)
alignment_kinds = (
    ("word", source_word_dir, target_word_dir, "words"),
    ("vowel", source_vowel_dir, target_vowel_dir, "vowels"),
    ("syllable", source_syllable_dir, target_syllable_dir, "syllables"),
)

for wav_filepath in tqdm(files_to_copy, desc="Copying files"):
    base_filename = os.path.splitext(os.path.basename(wav_filepath))[0]

    try:
        # A missing WAV counts as an error and skips its alignment files too
        if not os.path.exists(wav_filepath):
            print(f"Warning: Source WAV not found: {wav_filepath}")
            error_count += 1
            continue
        # copy2 preserves metadata
        shutil.copy2(wav_filepath, os.path.join(target_wav_dir, f"{base_filename}.wav"))

        # A missing alignment file only produces a warning; the rest are still copied
        for label, src_dir, dst_dir, infix in alignment_kinds:
            alignment_name = f"{base_filename}.{infix}.txt"
            src_path = os.path.join(src_dir, alignment_name)
            if os.path.exists(src_path):
                shutil.copy2(src_path, os.path.join(dst_dir, alignment_name))
            else:
                print(f"Warning: Source {label} file not found: {src_path}")

        processed_count += 1

    except Exception as e:
        print(f"!! Error copying files for {base_filename}: {e}")
        error_count += 1


print("\n--- Copy Summary ---")
print(f"Successfully processed (copied WAV for): {processed_count} base filenames")
print(f"Errors or missing source WAV files:      {error_count}")
print(f"\nNew folders created:")
print(f"- {target_wav_dir}")
print(f"- {target_processed_dir}")

Creating target directories...
Target directories created.
Identified 6000 files to copy.
Starting copy process for 6000 samples...


Copying files:   0%|          | 0/6000 [00:00<?, ?it/s]


--- Copy Summary ---
Successfully processed (copied WAV for): 6000 base filenames
Errors or missing source WAV files:      0

New folders created:
- ./wav2
- ./processed_alignments2


In [None]:
print("Zipping required folders... This may take a while depending on size.")
!zip -q -r ./data_to_download2.zip ./wav2 ./processed_alignments2
print("Zipping complete: data_to_download.zip created.")

# Optional: Check the size of the created zip file
!du -sh ./data_to_download2.zip

Zipping required folders... This may take a while depending on size.
Zipping complete: data_to_download.zip created.
1.1G	./data_to_download2.zip


In [None]:
# google.colab is only importable inside a Google Colab runtime.
from google.colab import files

print("Preparing download...")
# Trigger the browser download for the subset archive created by the previous cell
files.download('./data_to_download2.zip')

Preparing download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import csv
import glob
from tqdm.auto import tqdm # For progress bar

# Define the directory containing the individual transcript (.txt) files
# This should be the same directory as your processed .wav files
source_transcript_dir = "./wav" # Or "./wav2" if you want to use the smaller set

# Define the new directory and the output CSV filename
output_dir = "./transcriptions"
output_csv_path = os.path.join(output_dir, "transcriptions_with_filenames.csv") # Modified filename

# 1. Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
print(f"Ensured output directory exists: {output_dir}")

# 2. Find all the .txt files in the source directory
# We sort them to ensure a consistent order (LJSpeech filenames sort naturally)
transcript_files = sorted(glob.glob(os.path.join(source_transcript_dir, "*.txt")))

if not transcript_files:
    print(f"ERROR: No .txt files found in '{source_transcript_dir}'. Please check the path.")
else:
    print(f"Found {len(transcript_files)} transcript files in '{source_transcript_dir}'.")

    # 3. Open the CSV file for writing
    # 'newline=""' prevents extra blank rows in the CSV on some systems
    # 'encoding='utf-8'' is good practice for text files
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        # 4. Create a CSV writer object
        csv_writer = csv.writer(csvfile)

        # 5. Write the header row (MODIFIED: added "filename")
        csv_writer.writerow(["index", "filename", "transcription"])

        # 6. Iterate through the sorted transcript files
        # Use enumerate to get a 1-based index easily
        for idx, txt_filepath in enumerate(tqdm(transcript_files, desc="Writing CSV"), start=1):
            try:
                # --- ADDED: Extract the base filename (like LJ001-0001) ---
                base_filename = os.path.splitext(os.path.basename(txt_filepath))[0]
                # --- End ADDED ---

                # Read the transcription content from the .txt file
                with open(txt_filepath, 'r', encoding='utf-8') as f_txt:
                    transcription_text = f_txt.read().strip() # Read and remove leading/trailing whitespace

                # Write the index, filename, and the transcription to the CSV (MODIFIED: added base_filename)
                csv_writer.writerow([idx, base_filename, transcription_text])

            except Exception as e:
                print(f"Warning: Could not process file {txt_filepath}. Error: {e}")

    print(f"\nSuccessfully created transcriptions file at: {output_csv_path}")
    print(f"Total transcriptions written: {len(transcript_files)}")

    # Optional: Display the first few lines of the created CSV file for verification
    print("\n--- First 5 lines of the CSV file ---")
    !head -n 5 {output_csv_path}
    print("-------------------------------------")

Ensured output directory exists: ./transcriptions
Found 13100 transcript files in './wav'.


Writing CSV:   0%|          | 0/13100 [00:00<?, ?it/s]


Successfully created transcriptions file at: ./transcriptions/transcriptions_with_filenames.csv
Total transcriptions written: 13100

--- First 5 lines of the CSV file ---
index,filename,transcription
1,LJ001-0001,"Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition"
2,LJ001-0002,in being comparatively modern.
3,LJ001-0003,"For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process"
4,LJ001-0004,"produced the block books, which were the immediate predecessors of the true printed book,"
-------------------------------------
