In [118]:
import os
from glob import glob
# from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import AutoTokenizer, AutoFeatureExtractor, AutoModelForCTC, Wav2Vec2Processor
import torchaudio
import soundfile as sf
import librosa
import torch.functional as F
import torch.nn as nn
from IPython.display import Audio, display, Markdown
import torch

import pandas as pd
import random

In [36]:
# import model, feature extractor, tokenizer
model = AutoModelForCTC.from_pretrained("facebook/wav2vec2-base-960h")
tokenizer = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


In [37]:
metadata = {
    "angry": sorted(glob(r"data/audio-emotions/Angry/*.wav")),
    "sad": sorted(glob(r"data/audio-emotions/Sad/*.wav")),
    "disgusted": sorted(glob(r"data/audio-emotions/Disgusted/*.wav")),
    "fearful": sorted(glob(r"data/audio-emotions/Fearful/*.wav")),
    "happy": sorted(glob(r"data/audio-emotions/Happy/*.wav")),
    "neutral": sorted(glob(r"data/audio-emotions/Neutral/*.wav")),
    "surprised": sorted(glob(r"data/audio-emotions/Surprised/*.wav")),
}

TRAIN_SPLIT = 0.8

train_meta = {
    "angry": metadata["angry"][:int(TRAIN_SPLIT * len(metadata["angry"]))],
    "sad": metadata["sad"][:int(TRAIN_SPLIT * len(metadata["sad"]))],
    "disgusted": metadata["disgusted"][:int(TRAIN_SPLIT * len(metadata["disgusted"]))],
    "fearful": metadata["fearful"][:int(TRAIN_SPLIT * len(metadata["fearful"]))],
    "happy": metadata["happy"][:int(TRAIN_SPLIT * len(metadata["happy"]))],
    "neutral": metadata["neutral"][:int(TRAIN_SPLIT * len(metadata["neutral"]))],
    "surprised": metadata["surprised"][:int(TRAIN_SPLIT * len(metadata["surprised"]))],
}

test_meta = {
    "angry": metadata["angry"][int(TRAIN_SPLIT * len(metadata["angry"])):],
    "sad": metadata["sad"][int(TRAIN_SPLIT * len(metadata["sad"])):],
    "disgusted": metadata["disgusted"][int(TRAIN_SPLIT * len(metadata["disgusted"])):],
    "fearful": metadata["fearful"][int(TRAIN_SPLIT * len(metadata["fearful"])):],
    "happy": metadata["happy"][int(TRAIN_SPLIT * len(metadata["happy"])):],
    "neutral": metadata["neutral"][int(TRAIN_SPLIT * len(metadata["neutral"])):],
    "surprised": metadata["surprised"][int(TRAIN_SPLIT * len(metadata["surprised"])):],
}
        

In [86]:
train_df = []
test_df = []

for emotion in train_meta:
    for file in train_meta[emotion]:
        train_df.append((file, emotion))
        
for emotion in test_meta:
    for file in test_meta[emotion]:
        test_df.append((file, emotion))
        
random.shuffle(train_df)
random.shuffle(test_df)


In [157]:
# create pandas dataframes
train_df = pd.DataFrame(train_df, columns=["file", "emotion"])
test_df = pd.DataFrame(test_df, columns=["file", "emotion"])

# apply feature extractor to all files
# train_df = train_df.head()
# test_df = test_df.head()
# flattener = nn.Flatten()
def extract_features(file):
    # waveform, sample_rate = sf.read(file)
    waveform, sample_rate = librosa.load(file, sr=16000)
    # input_values = tokenizer(waveform, sampling_rate=sample_rate, return_tensors="pt").input_values
    # sampling_rate=sample_rate
    features = feature_extractor(waveform, sampling_rate=sample_rate, return_tensors="np").input_values
    
    # features = features[0]
    print(f"{features.shape = }")
    return features.tolist()


In [158]:

train_df = train_df.head()
test_df = test_df.head()
train_df["features"] = train_df["file"].apply(extract_features)
test_df["features"] = test_df["file"].apply(extract_features)
train_df.head()

features.shape = (1, 41108)
features.shape = (1, 44311)
features.shape = (1, 31559)
features.shape = (1, 42709)
features.shape = (1, 54455)
features.shape = (1, 37468)
features.shape = (1, 57024)
features.shape = (1, 25322)
features.shape = (1, 70440)
features.shape = (1, 40389)


Unnamed: 0,file,emotion,features
0,data/audio-emotions/Angry/1041_MTI_ANG_XX.wav,angry,"[[0.010177013464272022, 0.005764135625213385, ..."
1,data/audio-emotions/Angry/1089_ITS_ANG_XX.wav,angry,"[[0.05511103942990303, 0.05671730265021324, 0...."
2,data/audio-emotions/Neutral/OAF_beg_neutral.wav,neutral,"[[0.023812657222151756, 0.10748444497585297, 0..."
3,data/audio-emotions/Fearful/1041_TSI_FEA_XX.wav,fearful,"[[0.4287394881248474, 0.45389434695243835, 0.4..."
4,data/audio-emotions/Fearful/03-01-06-02-02-01-...,fearful,"[[5.374704869609559e-06, 5.36065454070922e-06,..."


In [159]:
# transcription
def transcribe(features):
    input_values = torch.tensor(features)
    logits = model(input_values).logits[0]
    predicted_ids = torch.argmax(logits, dim=-1)
    outputs = tokenizer.decode(predicted_ids, output_word_offsets=True)
    time_offset = model.config.inputs_to_logits_ratio / feature_extractor.sampling_rate
    
    word_offsets = [
        { "word": d["word"],
            "start_time": round(d["start_offset"] * time_offset, 2),
            "end_time": round(d["end_offset"] * time_offset, 2),
        }
        for d in outputs.word_offsets
    ]

    return word_offsets

In [160]:
predictions_df = train_df.copy()
predictions_df["transcription"] = predictions_df["features"].apply(transcribe)
predictions_df.head()

Unnamed: 0,file,emotion,features,transcription
0,data/audio-emotions/Angry/1041_MTI_ANG_XX.wav,angry,"[[0.010177013464272022, 0.005764135625213385, ...","[{'word': 'MAYBE', 'start_time': 0.4, 'end_tim..."
1,data/audio-emotions/Angry/1089_ITS_ANG_XX.wav,angry,"[[0.05511103942990303, 0.05671730265021324, 0....","[{'word': 'I', 'start_time': 0.62, 'end_time':..."
2,data/audio-emotions/Neutral/OAF_beg_neutral.wav,neutral,"[[0.023812657222151756, 0.10748444497585297, 0...","[{'word': 'SAY', 'start_time': 0.26, 'end_time..."
3,data/audio-emotions/Fearful/1041_TSI_FEA_XX.wav,fearful,"[[0.4287394881248474, 0.45389434695243835, 0.4...","[{'word': 'SERVICE', 'start_time': 1.12, 'end_..."
4,data/audio-emotions/Fearful/03-01-06-02-02-01-...,fearful,"[[5.374704869609559e-06, 5.36065454070922e-06,...","[{'word': 'DOGS', 'start_time': 0.98, 'end_tim..."


In [161]:
print(f"TRANSCRIPTION: {' '.join([ item['word'] for item in predictions_df['transcription'].iloc[2]])}")
Audio(train_df["file"].iloc[2], autoplay=True)

TRANSCRIPTION: SAY THE WORD BEG


In [162]:
train_df["emotion"].iloc[2]

'neutral'

In [163]:
train_df.iloc[0]["features"]

[[0.010177013464272022,
  0.005764135625213385,
  0.0028729394543915987,
  0.0007425843505188823,
  -0.003974630031734705,
  -0.003822461934760213,
  -0.006409321445971727,
  -0.010517863556742668,
  -0.012343882583081722,
  -0.0121917137876153,
  -0.016756759956479073,
  -0.015082909725606441,
  -0.01949578896164894,
  -0.016756759956479073,
  -0.02132180705666542,
  -0.018887115642428398,
  -0.015995919704437256,
  -0.021930480375885963,
  -0.01751760207116604,
  -0.01706109754741192,
  -0.018430611118674278,
  -0.011887378059327602,
  -0.01082220021635294,
  -0.011430872604250908,
  -0.008387507870793343,
  -0.007626667153090239,
  -0.0054963119328022,
  -0.002452947897836566,
  -0.003974630031734705,
  -0.003974630031734705,
  -0.008083172142505646,
  -0.007017994299530983,
  -0.006865825969725847,
  -0.006865825969725847,
  -0.007322330493479967,
  -0.004278966225683689,
  -0.003670293604955077,
  -0.0016921070637181401,
  -0.004735471215099096,
  -0.002452947897836566,
  0.001046

In [164]:
# save to csv
train_df.to_csv("data/audio_emotions-train.csv", index=False)
test_df.to_csv("data/audio_emotions-test.csv", index=False)