In [1]:
# Automatically reload imported modules before each cell executes, so edits
# to library code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [13]:
import json
import torch
import torchutil
import promonet

In [4]:
# Read the emotion annotations stored alongside the cached cremad features
cache = promonet.CACHE_DIR / 'cremad'
with open(cache / 'emotions.json') as handle:
    emotions = json.load(handle)

In [30]:
def load_features(stem):
    """Load the prosody features of one utterance for statistical analysis

    Arguments
        stem
            Utterance identifier used to build the cached feature filenames

    Returns
        pitch
            Base-2 log-Hz pitch of frames above the voicing threshold
        durations
            Phoneme durations in milliseconds
        loudness
            Single-band A-weighted loudness
    """
    # Pitch: convert to base-2 log, then keep only the voiced frames
    log_pitch = torch.log2(torch.load(cache / f'{stem}-100-viterbi-pitch.pt'))
    periodicity = torch.load(cache / f'{stem}-100-viterbi-periodicity.pt')
    voiced = periodicity > promonet.VOICING_THRESHOLD
    pitch = log_pitch[voiced]

    # Durations: collapse runs of the same most-likely phoneme into counts
    ppg = torch.load(cache / f'{stem}-100-ppg.pt')
    phonemes, counts = torch.unique_consecutive(
        ppg.argmax(dim=0),
        return_counts=True)

    # NOTE(review): index 39 is presumably the silence category - confirm
    keep = phonemes != 39

    # Convert frame counts to milliseconds
    durations = 1000 * counts[keep] * promonet.HOPSIZE / promonet.SAMPLE_RATE

    # Loudness: average the cached loudness down to a single band
    loudness = promonet.preprocess.loudness.band_average(
        torch.load(cache / f'{stem}-100-loudness.pt'),
        1)

    return pitch, durations, loudness

In [34]:
# Accumulate per-emotion running averages of each feature's mean and std
stats = {}
for stem, emotion in emotions.items():

    # Create the accumulators the first time an emotion is encountered
    if emotion not in stats:
        stats[emotion] = {
            feature: {
                'mean': torchutil.metrics.Average(),
                'std': torchutil.metrics.Average()}
            for feature in ('pitch', 'duration', 'loudness')}

    pitch, duration, loudness = load_features(stem)

    # Skip utterances whose pitch standard deviation is undefined
    if torch.isnan(pitch.std()):
        continue

    # The mean is weighted by element count; the std is averaged per utterance
    accumulators = stats[emotion]
    for feature, values in (
        ('pitch', pitch),
        ('duration', duration),
        ('loudness', loudness)
    ):
        accumulators[feature]['mean'].update(values, values.numel())
        accumulators[feature]['std'].update(values.std(), 1)

In [35]:
# Report every accumulated statistic as "<emotion>-<feature>-<stat>: <value>"
for emotion, results in stats.items():
    for feature, meanstd in results.items():
        for stat, result in meanstd.items():
            label = f'{emotion}-{feature}-{stat}'
            value = result()
            print(f'{label}: {value}')

anger-pitch-mean: 7.732715606689453
anger-pitch-std: 0.20265233516693115
anger-duration-mean: 101.9418716430664
anger-duration-std: 67.07918548583984
anger-loudness-mean: -50.486942291259766
anger-loudness-std: 7.912267208099365
disgust-pitch-mean: 7.2766900062561035
disgust-pitch-std: 0.18125402927398682
disgust-duration-mean: 97.39627838134766
disgust-duration-std: 66.36701965332031
disgust-loudness-mean: -53.09292984008789
disgust-loudness-std: 5.4705986976623535
fear-pitch-mean: 7.593791484832764
fear-pitch-std: 0.16084131598472595
fear-duration-mean: 81.78904724121094
fear-duration-std: 54.02749252319336
fear-loudness-mean: -52.417057037353516
fear-loudness-std: 5.748106479644775
happy-pitch-mean: 7.5791239738464355
happy-pitch-std: 0.21013355255126953
happy-duration-mean: 82.83629608154297
happy-duration-std: 55.86198806762695
happy-loudness-mean: -53.659996032714844
happy-loudness-std: 6.294555187225342
neutral-pitch-mean: 7.199605464935303
neutral-pitch-std: 0.1627632975578308


In [8]:
# Group utterance names as speakers[speaker][emotion] -> list of utterances
speakers = {}
for stem, emotion in emotions.items():
    speaker, utterance = stem.split('/')

    # Create the speaker and emotion entries on first use, then append
    speakers.setdefault(speaker, {}).setdefault(emotion, []).append(utterance)

In [20]:
# Compute prosody statistics for each speaker and emotion
stats = {}

# One (mean, std) pair of running averages per feature, reused for every group
pitch_stats = (torchutil.metrics.Average(), torchutil.metrics.Average())
duration_stats = (torchutil.metrics.Average(), torchutil.metrics.Average())
loudness_stats = (torchutil.metrics.Average(), torchutil.metrics.Average())

for speaker, groups in speakers.items():
    for emotion, utterances in groups.items():
        group = f'{speaker}-{emotion}'

        # The accumulators are shared, so clear them before each new group
        for mean_metric, std_metric in (
            pitch_stats,
            duration_stats,
            loudness_stats
        ):
            mean_metric.reset()
            std_metric.reset()

        # NOTE(review): the per-emotion cell skips utterances whose pitch std
        # is NaN; this loop does not - confirm whether that is intended
        for utterance in utterances:
            pitch, duration, loudness = load_features(f'{speaker}/{utterance}')

            # The mean is weighted by element count; the std is averaged per
            # utterance
            pitch_stats[0].update(pitch, pitch.numel())
            pitch_stats[1].update(pitch.std(), 1)

            # Use this utterance's durations (previously referenced an
            # undefined/stale `durations` variable)
            duration_stats[0].update(duration, duration.numel())
            duration_stats[1].update(duration.std(), 1)
            loudness_stats[0].update(loudness, loudness.numel())
            loudness_stats[1].update(loudness.std(), 1)

        # Snapshot the results as plain values before the next reset
        stats[group] = {
            'pitch': {'mean': pitch_stats[0](), 'std': pitch_stats[1]()},
            'duration': {'mean': duration_stats[0](), 'std': duration_stats[1]()},
            'loudness': {'mean': loudness_stats[0](), 'std': loudness_stats[1]()}}

In [21]:
# Display the statistics keyed by "<speaker>-<emotion>"
stats

{'0000-anger': {'pitch': {'mean': 7.052194118499756,
   'std': 0.17261376976966858},
  'duration': {'mean': 90.41053771972656, 'std': 60.23238754272461},
  'loudness': {'mean': -53.267921447753906, 'std': 8.784770011901855}},
 '0000-disgust': {'pitch': {'mean': 6.833364963531494,
   'std': 0.20573818683624268},
  'duration': {'mean': 74.22840881347656, 'std': 50.25320816040039},
  'loudness': {'mean': -54.69639587402344, 'std': 7.695345878601074}},
 '0000-fear': {'pitch': {'mean': 7.386470794677734,
   'std': 0.17463408410549164},
  'duration': {'mean': 70.11161804199219, 'std': 46.2076416015625},
  'loudness': {'mean': -52.24797058105469, 'std': 9.423333168029785}},
 '0000-happy': {'pitch': {'mean': 7.369772434234619,
   'std': 0.2127230018377304},
  'duration': {'mean': 66.10755920410156, 'std': 43.044654846191406},
  'loudness': {'mean': -53.0556755065918, 'std': 9.049270629882812}},
 '0000-neutral': {'pitch': {'mean': 6.863433361053467,
   'std': 0.17527322471141815},
  'duration':