In [10]:
import os
import json
import datasets
import random
import soundfile as sf
from collections import defaultdict

from IPython.display import Audio, display

# StressTest

In [2]:
# Load the StressTest dataset; the cells below only read its 'test' split.
ds = datasets.load_dataset("slprl/StressTest")

In [3]:
# Decode the audio of the first test example into `audio`.
# NOTE(review): `audio` is not read again before the Expresso section
# reassigns it — presumably an exploratory leftover; confirm before removing.
audio = ds['test'][0]['audio'].get_all_samples()

In [9]:
# Display the decoded samples of the first test example (shape, duration,
# sample rate) as a sanity check.
ds['test'][0]['audio'].get_all_samples()

AudioSamples:
  data (shape): torch.Size([1, 58112])
  pts_seconds: 0.0
  duration_seconds: 3.632
  sample_rate: 16000

In [11]:
# Export the text-side fields of every StressTest test example as JSON Lines
# (one JSON object per line, in dataset order). Audio is not written here.
stresstest_fields = (
    'transcription',
    'description',
    'intonation',
    'metadata',
    'stress_pattern',
)

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs('stresstest', exist_ok=True)

with open('stresstest/stresstest.jsonl', 'w') as f:
    for data in ds['test']:
        # The tuple order above fixes the key order inside each JSON object.
        f.write(json.dumps({field: data[field] for field in stresstest_fields}) + '\n')

In [12]:
# Number of examples in the StressTest test split.
len(ds['test'])

218

In [33]:
# For every StressTest test example, write two wav files that share the index
# `idx`: the example's own audio as ground truth, and the audio of a randomly
# drawn *different* test example as the prompt.

# sf.write does not create missing directories.
os.makedirs('stresstest/prompt', exist_ok=True)
os.makedirs('stresstest/ground_truth', exist_ok=True)

# Dedicated seeded generator: re-running the cell reproduces the same prompt
# assignment and the global `random` state is left untouched.
prompt_rng = random.Random(0)

test_split = ds['test']
num_examples = len(test_split)
for idx, data in enumerate(test_split):
    random_prompt_idx = prompt_rng.randrange(num_examples)
    # Re-draw if the prompt would be the target utterance itself; otherwise the
    # prompt would leak the ground truth. Skipped when there is only one
    # example, since no other choice exists.
    while num_examples > 1 and random_prompt_idx == idx:
        random_prompt_idx = prompt_rng.randrange(num_examples)

    prompt_audio = test_split[random_prompt_idx]['audio'].get_all_samples()
    # `.data` is indexed with [0] to obtain a 1-D signal; the sample displayed
    # above has shape [1, 58112] — assumes every clip is mono, TODO confirm.
    sf.write('stresstest/prompt/audio_{}.wav'.format(idx), prompt_audio.data.numpy()[0], prompt_audio.sample_rate)

    ground_truth_audio = data['audio'].get_all_samples()
    sf.write('stresstest/ground_truth/audio_{}.wav'.format(idx), ground_truth_audio.data.numpy()[0], ground_truth_audio.sample_rate)

# ParaSpeechCaps Holdout

In [30]:
# Load only the "holdout" split of the ParaSpeechCaps dataset.
holdout = datasets.load_dataset("ajd12342/paraspeechcaps", split="holdout")

README.md: 0.00B [00:00, ?B/s]

data/train_scaled-00000-of-00002.parquet:   0%|          | 0.00/161M [00:00<?, ?B/s]

data/train_scaled-00001-of-00002.parquet:   0%|          | 0.00/161M [00:00<?, ?B/s]

data/train_base-00000-of-00001.parquet:   0%|          | 0.00/37.9M [00:00<?, ?B/s]

data/dev-00000-of-00001.parquet:   0%|          | 0.00/3.94M [00:00<?, ?B/s]

data/holdout-00000-of-00001.parquet:   0%|          | 0.00/4.71M [00:00<?, ?B/s]

Generating train_scaled split:   0%|          | 0/924651 [00:00<?, ? examples/s]

Generating train_base split:   0%|          | 0/116516 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/11967 [00:00<?, ? examples/s]

Generating holdout split:   0%|          | 0/14756 [00:00<?, ? examples/s]

In [36]:
# Inspect one holdout record to see the available fields and tag lists.
holdout[0]

{'source': 'voxceleb',
 'relative_audio_path': 'voxceleb2/dev/aac/id05998/WNpiaa_BtEc/00264_voicefixer.wav',
 'text_description': [' A Scottish male speaks with clear enunciation and a low-pitched, deep voice. His speech is delivered at a measured speed, and the environment in which the recording was made allows for balanced clarity.'],
 'transcription': " Two separate things. Well, three, actually. You've talked about disability and you've talked about welfare.",
 'intrinsic_tags': ['deep', 'enunciated', 'scottish'],
 'situational_tags': None,
 'basic_tags': ['environment balanced in clarity',
  'low-pitched',
  'male',
  'measured speed'],
 'all_tags': ['deep',
  'enunciated',
  'environment balanced in clarity',
  'low-pitched',
  'male',
  'measured speed',
  'scottish'],
 'speakerid': 'id05998',
 'name': 'Michael Gove',
 'duration': 5.6,
 'gender': 'male',
 'accent': 'scottish',
 'pitch': 'low-pitched',
 'speaking_rate': 'measured speed',
 'noise': 'environment balanced in clarity

In [43]:
# Tag names that count as emotion labels when filtering ParaSpeechCaps
# records. Stored as a set for O(1) membership tests; order is irrelevant.
emotions = {
    "admiring",
    "angry",
    "anxious",
    "awed",
    "bored",
    "calm",
    "confused",
    "desirous",
    "disgusted",
    "enthusiastic",
    "guilt",
    "happy",
    "pained",
    "sarcastic",
    "saddened",
    "scared",
    "sleepy",
    "sympathetic",
}

In [44]:
# Collect one {'transcription', 'emotion'} record per emotion tag found on a
# holdout example. An example carrying several emotion tags yields one record
# per tag; examples without any emotion tag are dropped.
usable_data = [
    {
        'transcription': example['transcription'],
        'emotion': label,
    }
    for example in holdout
    for label in example['all_tags']
    if label in emotions
]

In [46]:
# Dump the filtered ParaSpeechCaps records as JSON Lines: one JSON object per
# line, each terminated by a newline.
with open('paraspeechcaps_holdout.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(record) + '\n' for record in usable_data)

# Expresso

In [2]:
# Root directory of the local Expresso copy. The location is machine-specific,
# so it can be overridden with the EXPRESSO_DIR environment variable; when the
# variable is unset the original hard-coded path is used.
dirname = os.environ.get(
    'EXPRESSO_DIR',
    '/data/group_data/li_lab/siqiouya/datasets/expresso',
)

In [3]:
# Read the test split listing: one entry per line, with surrounding whitespace
# removed. Later cells skip the first entry via test_files[1:].
test_split_path = os.path.join(dirname, 'splits/test.txt')
with open(test_split_path, 'r') as f:
    test_files = list(map(str.strip, f))

In [8]:
# Read the train split listing into a list of whitespace-trimmed entries, one
# per line of the file. Later cells skip the first entry via train_files[1:].
train_files = []
with open(os.path.join(dirname, 'splits/train.txt'), 'r') as f:
    for raw_line in f:
        train_files.append(raw_line.strip())

In [4]:
# Map each utterance id to its transcription, parsed from a file with one
# "<id>\t<transcription>" pair per line.
id2transcription = {}
with open(os.path.join(dirname, 'read_transcriptions.txt'), 'r') as f:
    for line in f:  # iterate lazily instead of materialising readlines()
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at EOF) carry no entry.
            continue
        # Split on the first tab only, so a tab inside the transcription does
        # not break the unpack. A line without any tab still raises ValueError,
        # which surfaces malformed input instead of hiding it.
        # `utt_id` rather than `id` avoids shadowing the builtin for the rest
        # of the notebook session.
        utt_id, transcription = line.split('\t', 1)
        id2transcription[utt_id] = transcription

In [5]:
# Create the top-level output directory for the Expresso export; exist_ok=True
# makes re-running this cell a no-op.
os.makedirs('expresso', exist_ok=True)

In [33]:
# Build the Expresso evaluation set from the test listing. For every kept
# utterance, its wav is copied to expresso/ground_truth/audio_<n>.wav and a
# metadata record is appended to `expresso_data`, where <n> is the record's
# position in that list.
candidate_emotions = ['default', 'happy', 'sad']
expresso_data = []

# sf.write does not create missing directories, and only 'expresso' itself is
# created elsewhere in the notebook.
os.makedirs('expresso/ground_truth', exist_ok=True)

for file in test_files[1:]:  # the first listing entry is skipped
    # Keep only plain ids: entries containing tab-separated fields are ignored,
    # and blank entries are skipped so they cannot raise IndexError below.
    if not file or '\t' in file:
        continue

    # Ids are underscore-separated: <speaker>_<emotion>_..., e.g.
    # 'ex01_default_00358'.
    items = file.split('_')
    speaker = items[0]
    emotion = items[1]
    if emotion not in candidate_emotions:
        continue

    # Emphasis utterances are recognised by a four-part id whose third part is
    # the literal 'emphasis'.
    emphasis = len(items) == 4 and items[2] == 'emphasis'

    # NOTE(review): every utterance is looked up under the 'base' sub-folder —
    # confirm this also holds for emphasis utterances.
    audio_path = os.path.join(dirname, 'audio_48khz/read', speaker, emotion, 'base', file + '.wav')
    audio, sample_rate = sf.read(audio_path)
    sf.write('expresso/ground_truth/audio_{}.wav'.format(len(expresso_data)), audio, sample_rate)

    expresso_data.append({
        'id': file,
        'speaker': speaker,
        'emotion': emotion,
        'emphasis': emphasis,
        'transcription': id2transcription[file],
    })

In [19]:
# Spot-check the first collected Expresso record.
expresso_data[0]

{'id': 'ex01_default_00358',
 'speaker': 'ex01',
 'emotion': 'default',
 'emphasis': False,
 'transcription': "Karen's in Switzerland?"}

In [20]:
# Index the train listing by (speaker, emotion, emphasis) so that a prompt
# matching a test record on all three attributes can be drawn later. Only plain
# ids (no tab-separated fields) with an emotion in `candidate_emotions` are kept.
speaker_emotion_emphasis_to_file = defaultdict(list)
for train_id in train_files[1:]:
    if '\t' in train_id:
        continue
    parts = train_id.split('_')
    train_speaker = parts[0]
    train_emotion = parts[1]
    if train_emotion in candidate_emotions:
        has_emphasis = len(parts) == 4 and parts[2] == 'emphasis'
        group_key = (train_speaker, train_emotion, has_emphasis)
        speaker_emotion_emphasis_to_file[group_key].append(train_id)

In [35]:
# For every Expresso test record, pick a random train utterance with the same
# (speaker, emotion, emphasis) whose wav exists on disk, store its id in the
# record as 'prompt_id', and copy its audio to expresso/prompt/audio_<idx>.wav.

# sf.write does not create missing directories.
os.makedirs('expresso/prompt', exist_ok=True)

# Dedicated seeded generator: re-running the cell reproduces the same prompt
# assignment and the global `random` state is left untouched.
prompt_rng = random.Random(0)

for idx, data in enumerate(expresso_data):
    speaker, emotion, emphasis = data['speaker'], data['emotion'], data['emphasis']
    prompt_dir = os.path.join(dirname, 'audio_48khz/read', speaker, emotion, 'base')

    # .get() avoids inserting an empty list into the defaultdict for groups
    # that have no train utterance.
    candidates = speaker_emotion_emphasis_to_file.get((speaker, emotion, emphasis), [])

    # Walk the candidates in random order and take the first whose wav exists.
    # This is still a uniform draw over the existing files, but it terminates
    # even when none exists (the previous retry loop would spin forever).
    prompt_id = next(
        (
            candidate
            for candidate in prompt_rng.sample(candidates, len(candidates))
            if os.path.exists(os.path.join(prompt_dir, candidate + '.wav'))
        ),
        None,
    )
    if prompt_id is None:
        raise LookupError(
            'no prompt audio found for speaker={!r}, emotion={!r}, emphasis={!r} under {}'.format(
                speaker, emotion, emphasis, prompt_dir
            )
        )

    data['prompt_id'] = prompt_id
    audio_path = os.path.join(prompt_dir, prompt_id + '.wav')
    audio, sample_rate = sf.read(audio_path)
    sf.write('expresso/prompt/audio_{}.wav'.format(idx), audio, sample_rate)

In [29]:
# Serialise the Expresso records as JSON Lines, one object per line.
# NOTE: a record contains 'prompt_id' only if the prompt-selection cell has
# already been run on `expresso_data` before this cell.
with open('expresso/expresso.jsonl', 'w') as out_file:
    for record in expresso_data:
        # print() appends the same single '\n' terminator after each object.
        print(json.dumps(record), file=out_file)