In [13]:
import os, sys, glob
import pandas as pd

In [26]:
# check for numbers of completed sessions
tag = 'base-whisperx'
model_out = 'whisperx_out'
outputs = glob.glob(f'/home1/rdehaan/projects/automated_annotation/results/{tag}/data/eeg/scalp/ltp/ltpFR2/*/session_*/{model_out}/*.csv')
print('Number of processed .wav files:', len(outputs))

Number of processed .wav files: 2354


In [2]:
# load session index with environment that has cmlreaders (can't install cmlreaders for py3.10)
if sys.version.split(' ')[0] >= '3.10' and sys.version.split(' ')[0] < '3.11':
    exps = ['ltpFR2']
    index_df = pd.read_csv(f'session_index_{"-".join(exps)}.csv')
else:
    import cmlreaders as cml

    index_df = cml.get_data_index('ltp')
    print('Available experiments:\n', index_df.experiment.unique())
    index_df = index_df.query('experiment in @exps')
    index_df.to_csv(f'session_index_{"-".join(exps)}.csv')

    load example session events
    sess_df = index_df.iloc[25]
    r = cml.CMLReader(subject=sess_df.subject, session=sess_df.session, experiment=sess_df.experiment)
    evs = r.load('events')
    print('Event types:\n', evs.type.unique())

In [3]:
def get_folders_containing_files(directory=".", pattern="*"):
    """
    Get folders containing files that match the specified glob pattern.
    
    Parameters:
    - directory (str): The starting directory for the search.
    - pattern (str): The glob pattern to search for. e.g. "*.wav", "audio_*.mp3", "document_?.txt", etc.

    Returns:
    - list: A list of folders containing files that match the glob pattern.
    """

    # Get all the files matching the pattern in the directory and its sub-directories.
    matching_files = glob.glob(os.path.join(directory, '**', pattern), recursive=True)
    
    # Get the unique directories containing the files that match the pattern.
    folders = set(os.path.dirname(file) for file in matching_files)

    return list(folders)


# build train-val-test split
from sklearn.model_selection import GroupShuffleSplit

random_seed = 42
train_prop = 0.5
val_prop = 0.2
test_prop = 0.3
assert train_prop + val_prop + test_prop == 1

index_df['split_group'] = index_df['subject'] + '_' + index_df['experiment']
train_val_idx, test_idx = next(GroupShuffleSplit(train_size=train_prop + val_prop, 
                                                 test_size=test_prop, 
                                                 random_state=random_seed).split(index_df,
                                                                                 groups=index_df['split_group']))
train_idx, val_idx = next(GroupShuffleSplit(train_size=train_prop / (train_prop + val_prop),
                                            test_size=val_prop / (train_prop + val_prop),
                                            random_state=random_seed + 1).split(index_df.iloc[train_val_idx],
                                                                                groups=index_df.iloc[train_val_idx]['split_group']))
index_dfs = {'train': index_df.iloc[train_val_idx].iloc[train_idx],
             'val': index_df.iloc[train_val_idx].iloc[val_idx],
             'test': index_df.iloc[test_idx]}

# assert no overlap across splits
assert not set(index_dfs['train'].split_group).intersection(index_dfs['test'].split_group)
assert not set(index_dfs['train'].split_group).intersection(index_dfs['val'].split_group)
assert not set(index_dfs['val'].split_group).intersection(index_dfs['test'].split_group)

# obtain session data directories
input_dirs = {'train': list(), 'val': list(), 'test': list()}
for exp in exps:
    # get all session directories containing .wav files
    wav_dirs = get_folders_containing_files(pattern="0.wav", directory=f'/data/eeg/scalp/ltp/{exp}')
    # drop sessions marked bad
    wav_dirs = {d for d in wav_dirs if not 'bad' in d.lower()}
    for split in input_dirs:
        # keep only sessions that were processed through event_creation for quality control
        exp_dirs = {f'/data/eeg/scalp/ltp/{sess.experiment}/{sess.subject}/session_{sess.session}' 
                    for _, sess in index_dfs[split].iterrows()}
        exp_dirs = exp_dirs.intersection(wav_dirs)
        input_dirs[split].extend(list(exp_dirs))

for split in input_dirs:
    print(f'{split} sessions available: {len(input_dirs[split])}')
print('Example session directories:')
input_dirs['train'][:5]

train sessions available: 1125
val sessions available: 536
test sessions available: 808
Example session directories:


['/data/eeg/scalp/ltp/ltpFR2/LTP299/session_5',
 '/data/eeg/scalp/ltp/ltpFR2/LTP334/session_17',
 '/data/eeg/scalp/ltp/ltpFR2/LTP317/session_13',
 '/data/eeg/scalp/ltp/ltpFR2/LTP358/session_6',
 '/data/eeg/scalp/ltp/ltpFR2/LTP242/session_0']

In [4]:
os.getcwd()

'/home1/rdehaan/projects/automated_annotation'

In [5]:
# obtain session processing output directories
base_dir = os.getcwd()

tags = ['base-whisperx']
all_output_dirs = dict()
all_input_dirs = dict()
output_dirs = dict()

for tag in tags:
    output_dirs[tag] = dict()
    all_input_dirs[tag] = list()
    all_output_dirs[tag] = list()
    for split in input_dirs:
        output_dirs[tag][split] = [os.getcwd() + f'/results/{tag}' + d if os.path.isabs(d) else os.path.join('results', tag, d)
                                   for d in input_dirs[split]]
        all_output_dirs[tag].extend(output_dirs[tag][split])
        all_input_dirs[tag].extend(input_dirs[split])

In [6]:
len(all_input_dirs[tag])

2469

In [7]:
import os
# os.chdir('/home1/rdehaan/projects/automated_annotation')
os.getcwd()

'/home1/rdehaan/projects/automated_annotation'

In [8]:
all_input_dirs[tag][0]

'/data/eeg/scalp/ltp/ltpFR2/LTP299/session_5'

In [12]:
import pickle
with open('input_dirs.pkl', 'wb') as f:
    pickle.dump(all_input_dirs, f)
with open('output_dirs.pkl', 'wb') as f:
    pickle.dump(all_output_dirs, f)
    

In [10]:
from automated_annot import run_whisperx
run_whisperx(all_input_dirs[tag][0], all_output_dirs[tag][0])

starting run_whisperx with
/data/eeg/scalp/ltp/ltpFR2/LTP299/session_5 /home1/rdehaan/projects/automated_annotation/results/base-whisperx/data/eeg/scalp/ltp/ltpFR2/LTP299/session_5


No language specified, language will be first be detected for each audio file (increases inference time).


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.0.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.0.1. Bad things might happen unless you revert torch to 1.x.
WAV files found:


Processing 16.wav...
loading audio /data/eeg/scalp/ltp/ltpFR2/LTP299/session_5/16.wav


FileNotFoundError: [Errno 2] No such file or directory: 'ffmpeg'

In [29]:
from automated_annot import run_whisperx
from cmldask import CMLDask
from dask.distributed import wait

tag = 'base-whisperx'
dask_args = {'job_name': 'auto_annotate', 'memory_per_job': "9GB", 'max_n_jobs': 35,
            'death_timeout': 600, 'extra': ['--no-dashboard'], 'log_directory': 'logs'}

client = CMLDask.new_dask_client_slurm(**dask_args)
dask_inputs = [all_input_dirs[tag][:1], all_output_dirs[tag][:1]]
futures = client.map(run_whisperx, *dask_inputs)
wait(futures)

Unique port for rdehaan is 51474
{'dashboard_address': ':51474'}
To view the dashboard, run: 
`ssh -fN rdehaan@rhino2.psych.upenn.edu -L 8000:192.168.86.140:44068` in your local computer's terminal (NOT rhino) 
and then navigate to localhost:8000 in your browser
starting run_whisperx with
/data/eeg/scalp/ltp/ltpFR2/LTP301/session_4 /home1/rdehaan/projects/automated_annotation/results/base-whisperx/data/eeg/scalp/ltp/ltpFR2/LTP301/session_4


WAV files found:


Processing 16.wav...
loading audio /data/eeg/scalp/ltp/ltpFR2/LTP301/session_4/16.wav


In [26]:
del client

In [16]:
# estimated number of days to run whisperx over data set assuming ~1 minute per list recording
n_sess = len(all_input_dirs[tag])
n_lists = 24
n_cores = 35
n_sess * n_lists / (n_cores * 24 * 60)

1.1757142857142857