In [1]:
import pandas as pd
from pathlib import Path
import shutil

In [2]:
# Load the sentence material. "Unnamed: 2" is the name pandas gives a column
# that has no header in the CSV; it carries the probability label.
sentence_df = pd.read_csv("LASR_sentence_recording_material - forOishani.csv").rename(
    columns={"Unnamed: 2": "probability"}
)

# Split "Sentence ID" on "_" into its two parts (e.g. "1_2" -> "1", "2").
id_parts = sentence_df["Sentence ID"].str.split("_", expand=True)
sentence_df[["sent_num", "prob_type"]] = id_parts

# Keep only the columns used downstream, in a fixed order.
column_order = [
    "Sentence ID",
    "sent_num",
    "prob_type",
    "Expected Transcription",
    "probability",
]
sentence_df = sentence_df[column_order]
sentence_df.head()

Unnamed: 0,Sentence ID,sent_num,prob_type,Expected Transcription,probability
0,1_1,1,1,The monkey ate the banana.,high prob
1,1_2,1,2,The monkey ate the shark.,low prob
2,1_3,1,3,The runkey ate the shark.,nonword
3,2_1,2,1,Mom likes to use the elevator instead of the s...,high prob
4,2_2,2,2,Pom likes to use the elevator instead of the d...,nonword


In [3]:
# Collect the Sentence IDs whose probability label is "low prob".
# The label is compared after trimming whitespace and lower-casing it.
normalized_label = sentence_df["probability"].str.strip().str.lower()
is_low_prob = normalized_label == "low prob"

# IDs are coerced to stripped strings so they can be matched against filenames.
low_prob_sentence_ids = sentence_df.loc[is_low_prob, "Sentence ID"].astype(str).str.strip()
low_ids = set(low_prob_sentence_ids)

print(f"Found {len(low_ids)} low-prob Sentence IDs")


Found 100 low-prob Sentence IDs


In [4]:
# Root folder that is scanned for participant directories.
root = Path("/Users/oishanibandopadhyay/Documents/LASR_Whisper_SAUCE/MTAA_recording")

# A participant directory is any direct sub-directory of `root` whose name
# starts with "t"; the result is kept in sorted order.
participant_dirs = sorted(
    entry
    for entry in root.iterdir()
    if entry.is_dir() and entry.name.startswith("t")
)


In [5]:
# Copy each participant's low-prob recordings into "<participant>/<participant>_low_prob/".

# Filename fragments that identify a low-prob sentence, built once up front.
# The "sauce_" prefix and trailing "_" avoid accidental partial matches
# (e.g. ID "1_2" must not match "sauce_11_2_" or "sauce_1_21_").
low_prob_tags = tuple(f"sauce_{sid}_" for sid in low_ids)

for pdir in participant_dirs:
    out_dir = pdir / f"{pdir.name}_low_prob"
    out_dir.mkdir(exist_ok=True)  # safe to re-run: an existing folder is reused

    copied = 0
    for wav_path in pdir.glob("*.wav"):  # only wav files directly inside pdir, not sub-folders
        fname = wav_path.name

        if any(tag in fname for tag in low_prob_tags):
            # copy2 also preserves file metadata such as modification time
            shutil.copy2(wav_path, out_dir / fname)
            copied += 1

    # Report the tally so a participant with no matches (e.g. unexpected
    # file naming) is noticed instead of silently producing an empty folder.
    print(f"{pdir.name}: copied {copied} low-prob wav file(s) to {out_dir.name}/")