## Revised code for base pipeline

In [None]:
# Import essential libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import librosa

# HuggingFace Transformers modules
from transformers import (
    Wav2Vec2Model, Wav2Vec2FeatureExtractor,
    AutoTokenizer, AutoModelForSequenceClassification,
    AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
)

# Set seeds for reproducibility
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the CPU generator and all CUDA devices). When CUDA is available, cuDNN
    is additionally set to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # The cuDNN flags are only touched when a CUDA device is present.
    if not torch.cuda.is_available():
        return

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Fix the seed once for the whole notebook.
set_seed(42)

Wav2Vec

In [1]:
# Define Wav2Vec2-based model for emotion + strength prediction
class wav2Vec_transferModel(nn.Module):
    """Frozen Wav2Vec2 encoder with two MLP heads: emotion and strength.

    The encoder's parameters have ``requires_grad`` set to False and it is
    always run under ``torch.no_grad()``; only the two classifier heads can
    receive gradients. The encoder output is mean-pooled over dimension 1 of
    ``last_hidden_state`` and the pooled vector is fed to both heads.

    Args:
        pretrained_model_name: Name or path passed to
            ``Wav2Vec2Model.from_pretrained``.
    """

    def __init__(self, pretrained_model_name="r-f/wav2vec-english-speech-emotion-recognition"):
        super().__init__()

        # Load pre-trained Wav2Vec2 model (frozen for feature extraction)
        self.model = Wav2Vec2Model.from_pretrained(pretrained_model_name)
        for param in self.model.parameters():
            param.requires_grad = False  # Freeze all layers

        # Dropout to reduce overfitting (one instance, used by both heads)
        self.dropout = nn.Dropout(0.2)

        hidden_size = self.model.config.hidden_size

        # NOTE: the position of each layer inside the nn.Sequential containers
        # determines its state_dict key, so the layout below must not change
        # if previously saved checkpoints are to keep loading.

        # Emotion classification head
        self.classifier_emo = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            self.dropout,
            nn.Linear(32, 6)  # 6 emotion classes
        )

        # Strength classification head
        self.classifier_strength = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            self.dropout,
            nn.Linear(32, 3)  # 3 strength levels
        )

        self.init_weights()

    def init_weights(self):
        """Xavier-initialize the Linear layers of the two classifier heads.

        Only the heads are visited. Iterating ``self.modules()`` would also
        reach every ``nn.Linear`` inside the pretrained encoder and replace
        its pretrained weights with random values.
        """
        for head in (self.classifier_emo, self.classifier_strength):
            for layer in head.modules():
                if isinstance(layer, nn.Linear):
                    nn.init.xavier_uniform_(layer.weight)
                    if layer.bias is not None:
                        nn.init.zeros_(layer.bias)

    def forward(self, inputs):
        """Compute emotion and strength logits for a batch of audio features.

        Args:
            inputs: Mapping with an ``'input_values'`` tensor and, optionally,
                an ``'attention_mask'`` tensor. A size-1 dimension at index 1
                is squeezed away from both (presumably to accept inputs of
                shape (batch, 1, time) -- confirm against the data loader).

        Returns:
            Tuple ``(logits_emo, logits_strength)`` whose last dimensions
            have size 6 and 3 respectively.
        """
        input_values = inputs['input_values'].squeeze(1)

        # Not every feature extractor returns an attention mask; the encoder
        # accepts None in that case.
        attention_mask = inputs.get('attention_mask')
        if attention_mask is not None:
            attention_mask = attention_mask.squeeze(1)

        with torch.no_grad():  # Wav2Vec2 is frozen
            outputs = self.model(input_values=input_values, attention_mask=attention_mask)
            # Plain mean over dimension 1; the attention mask is not used
            # for pooling.
            last_hidden_state_pooled = outputs.last_hidden_state.mean(1)

        # Pass through both heads
        logits_emo = self.classifier_emo(last_hidden_state_pooled)
        logits_strength = self.classifier_strength(last_hidden_state_pooled)

        return logits_emo, logits_strength




audio + text

In [None]:
# Main class for full emotion detection pipeline (audio + text)
class emotionDetectionSystem():
    """Predict emotion (and strength) from a speech recording.

    Two optional branches are combined:

    * audio: a ``wav2Vec_transferModel`` restored from ``bestWav2Vec.pth``
      produces emotion and strength probabilities from the waveform;
    * text: the recording is transcribed with Whisper and the transcript is
      scored by a text emotion classifier.

    When both branches are enabled the emotion probabilities are averaged;
    strength is only ever produced by the audio branch.

    Args:
        useAudio: Enable the audio branch.
        useText: Enable the transcription + text branch.
    """

    # Sampling rate the waveform is resampled to on load.
    _SAMPLE_RATE = 16000
    # Class labels, indexed by position in the probability vectors.
    _EMOTION_LABELS = ("Neutral", "Anger", "Disgust", "Fear", "Sadness", "Happiness")
    _STRENGTH_LABELS = ("Weak", "Medium", "Strong")

    def __init__(self, useAudio=True, useText=True):
        self.useAudio = useAudio
        self.useText = useText
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load Wav2Vec2 model for audio emotion recognition
        if self.useAudio:
            self.wav2vec_modelName = "r-f/wav2vec-english-speech-emotion-recognition"
            self.wav2vec_model = wav2Vec_transferModel()
            # NOTE: torch.load unpickles the file; only load checkpoints that
            # come from a trusted source.
            self.wav2vec_model.load_state_dict(torch.load('bestWav2Vec.pth', map_location=self.device))
            self.wav2vec_model.eval()
            self.wav2vec_model.to(self.device)
            # Loaded once here instead of on every predict() call.
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.wav2vec_modelName)

        # Load BERT-based text emotion classifier and Whisper transcription model
        if self.useText:
            self.textClassifier_model_name = "michellejieli/emotion_text_classifier"
            self.tokenizer = AutoTokenizer.from_pretrained(self.textClassifier_model_name)
            self.textModel = AutoModelForSequenceClassification.from_pretrained(self.textClassifier_model_name)
            self.textModel.to(self.device)
            self.textModel.eval()

            # Load Whisper for audio-to-text transcription
            self.transcriptModel_id = "openai/whisper-large-v3-turbo"
            self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

            self.transcriptModel = AutoModelForSpeechSeq2Seq.from_pretrained(
                self.transcriptModel_id, torch_dtype=self.torch_dtype,
                low_cpu_mem_usage=True, use_safetensors=True
            ).to(self.device)

            self.transcriptProcessor = AutoProcessor.from_pretrained(self.transcriptModel_id)

            # Pipeline for easy speech-to-text conversion
            self.transcriptPipe = pipeline(
                "automatic-speech-recognition",
                model=self.transcriptModel,
                tokenizer=self.transcriptProcessor.tokenizer,
                feature_extractor=self.transcriptProcessor.feature_extractor,
                torch_dtype=self.torch_dtype,
                device=0 if self.device == "cuda" else -1,
            )

    def predict(self, audio_path=None):
        """Predict emotion and strength for one audio file and print a report.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            Tuple ``(emo_pred, strength_pred)`` of label strings;
            ``strength_pred`` is None when the audio branch is disabled.

        Raises:
            ValueError: If both branches are disabled or ``audio_path`` is
                None. Both are checked before any file or model is touched.
        """
        if not (self.useAudio or self.useText):
            raise ValueError("At least one of useAudio or useText must be True.")
        if audio_path is None:
            raise ValueError("audio_path must be provided.")

        audio_probs_emo = None
        strength_probs = None
        text_probs_emo = None

        if self.useAudio:
            audio_probs_emo, strength_probs = self._predict_audio(audio_path)
        if self.useText:
            text_probs_emo = self._predict_text(audio_path)

        # Combine or choose available predictions
        if audio_probs_emo is not None and text_probs_emo is not None:
            emo_probs = (audio_probs_emo + text_probs_emo) / 2
        elif audio_probs_emo is not None:
            emo_probs = audio_probs_emo
        else:
            emo_probs = text_probs_emo

        # Final predicted class labels
        emo_pred = self._EMOTION_LABELS[torch.argmax(emo_probs).item()]
        strength_pred = None
        if strength_probs is not None:
            strength_pred = self._STRENGTH_LABELS[torch.argmax(strength_probs).item()]

        self._print_report(emo_pred, strength_pred, emo_probs, strength_probs)
        return emo_pred, strength_pred

    def _predict_audio(self, audio_path):
        """Return (emotion_probs, strength_probs) from the audio branch."""
        # Load and resample the recording
        waveform, _ = librosa.load(audio_path, sr=self._SAMPLE_RATE)

        # The feature extractor is given the 1-D NumPy waveform directly;
        # wrapping it in a (1, T) torch tensor is not one of its documented
        # input types.
        features = self.feature_extractor(
            waveform, sampling_rate=self._SAMPLE_RATE, return_tensors="pt", padding=True
        ).to(self.device)

        with torch.no_grad():
            logits_emo, logits_strength = self.wav2vec_model(features)
            # [0]: a single recording is processed, so drop the batch axis.
            probs_emo = F.softmax(logits_emo, dim=1)[0]
            probs_strength = F.softmax(logits_strength, dim=1)[0]
        return probs_emo, probs_strength

    def _predict_text(self, audio_path):
        """Transcribe the recording and return emotion probs from the text branch."""
        transcript = self.transcriptPipe(audio_path)["text"]
        encoded = self.tokenizer(
            transcript, return_tensors="pt", padding=True, truncation=True
        ).to(self.device)

        with torch.no_grad():
            raw_probs = F.softmax(self.textModel(**encoded).logits, dim=1)[0]
        return self._align_text_probs(raw_probs)

    @staticmethod
    def _align_text_probs(raw_probs):
        """Reorder the text model's 7 class probabilities into the 6 audio classes.

        The index-to-label mapping below is the one assumed by this pipeline;
        verify it against ``textModel.config.id2label`` if the text model is
        ever changed. ``torch.stack`` keeps the result on the input's device.
        """
        return torch.stack([
            raw_probs[4],                 # Neutral
            raw_probs[0],                 # Anger
            raw_probs[1],                 # Disgust
            raw_probs[2] + raw_probs[6],  # Fear + Surprise
            raw_probs[5],                 # Sadness
            raw_probs[3],                 # Happiness
        ])

    def _print_report(self, emo_pred, strength_pred, emo_probs, strength_probs):
        """Print the predicted labels followed by the per-class probabilities."""
        print(f"\nPredicted Emotion: {emo_pred}")
        if strength_pred:
            print(f"Predicted Strength: {strength_pred}")

        print("\nEmotion Probabilities:")
        for label, prob in zip(self._EMOTION_LABELS, emo_probs):
            print(f"{label}: {prob.item():.4f}")

        if strength_probs is not None:
            print("\nStrength Probabilities:")
            for label, prob in zip(self._STRENGTH_LABELS, strength_probs):
                print(f"{label}: {prob.item():.4f}")


## Previous Code
The original code on which the revised version above is based.

In [None]:
# Set seeds for reproducibility
import torch
import random
import numpy as np

def set_seed(seed):
    """Make runs repeatable by seeding all random number generators in use.

    Args:
        seed: Integer seed applied to Python ``random``, NumPy and PyTorch.
    """
    # Python and NumPy generators
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    # PyTorch generators: CPU first, then every CUDA device
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags are only set when CUDA is available
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(42)

In [None]:
# Define the audio to emotion model using Wav2Vec2
class wav2Vec_transferModel(nn.Module):
    """Audio-to-emotion model: frozen Wav2Vec2 encoder plus two classifier heads.

    One head predicts the emotion label (6 outputs), the other the emotion
    strength (3 outputs). The encoder parameters are frozen and the encoder
    is run without gradient tracking, so only the heads can be trained.

    Args:
        pretrained_model_name: Name or path passed to
            ``Wav2Vec2Model.from_pretrained``.
    """

    def __init__(self, pretrained_model_name="r-f/wav2vec-english-speech-emotion-recognition"):
        super().__init__()

        # Load pre-trained model
        self.model = Wav2Vec2Model.from_pretrained(pretrained_model_name)

        # Freeze the pre-trained Wav2Vec2 model
        # Only the last classifying layers will be trained
        for param in self.model.parameters():
            param.requires_grad = False

        # Define dropout layer to avoid overfitting in the classifier
        self.dropout = nn.Dropout(0.2)

        # Attach new classifying layers for emotion label and emotion strength.
        # The order of entries in each nn.Sequential fixes the state_dict
        # keys, so it is kept as-is for checkpoint compatibility.
        self.classifier_emo = nn.Sequential(
            nn.Linear(self.model.config.hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            self.dropout,
            nn.Linear(32, 6)
        )

        self.classifier_strength = nn.Sequential(
            nn.Linear(self.model.config.hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            self.dropout,
            nn.Linear(32, 3)
        )

        # Initialize the head weights by Xavier initialization
        self.init_weights()

    def init_weights(self):
        """Apply Xavier-uniform weights and zero biases to the heads' Linear layers.

        The loop is restricted to the two classifier heads on purpose:
        ``self.modules()`` also yields the Linear layers inside the
        pretrained encoder, and re-initializing those would discard the
        pretrained weights.
        """
        for head in (self.classifier_emo, self.classifier_strength):
            for layer in head.modules():
                if isinstance(layer, nn.Linear):
                    nn.init.xavier_uniform_(layer.weight)
                    if layer.bias is not None:
                        nn.init.zeros_(layer.bias)

    def forward(self, inputs):
        """Return ``(logits_emo, logits_strength)`` for the given features.

        Args:
            inputs: Mapping produced beforehand by a Wav2Vec2FeatureExtractor,
                holding ``'input_values'`` and optionally
                ``'attention_mask'``. A size-1 dimension at index 1 is
                squeezed away from both.

        Returns:
            Tuple of the emotion logits (last dimension 6) and the strength
            logits (last dimension 3).
        """
        input_values = inputs['input_values'].squeeze(1)
        # The attention mask is optional; the encoder is called with None
        # when the feature extractor did not provide one.
        attention_mask = inputs.get('attention_mask')
        if attention_mask is not None:
            attention_mask = attention_mask.squeeze(1)

        # Disable gradient calculation for the Wav2Vec2 model part
        with torch.no_grad():
            outputs = self.model(input_values=input_values, attention_mask=attention_mask)
            # The last hidden state is a tensor of shape
            # (batch_size, sequence_length, hidden_size). Use mean pooling
            # over the sequence length to get a fixed-size representation.
            last_hidden_state_pooled = outputs.last_hidden_state.mean(1)

        # Pass the pooled hidden states through the classifiers
        logits_emo = self.classifier_emo(last_hidden_state_pooled)
        logits_strength = self.classifier_strength(last_hidden_state_pooled)

        # Return the logits for both tasks
        return logits_emo, logits_strength

In [None]:
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F
import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


class emotionDetectionSystem():
    """Emotion detection pipeline combining an audio model and a text model.

    The audio branch scores the waveform with ``wav2Vec_transferModel``
    (weights read from ``bestWav2Vec.pth``). The text branch transcribes the
    waveform with Whisper and scores the transcript with a text emotion
    classifier. With both branches on, emotion probabilities are averaged;
    strength comes from the audio branch only.

    Args:
        useAudio: Enable the audio branch.
        useText: Enable the transcription + text branch.
    """

    def __init__(self, useAudio = True, useText = True):
        self.useAudio = useAudio
        self.useText = useText
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

        # If using audio, Load the Wav2Vec2 model for audio emotion recognition
        if self.useAudio:
            self.wav2vec_modelName = "r-f/wav2vec-english-speech-emotion-recognition"
            self.wav2vec_model = wav2Vec_transferModel()
            # map_location lets a checkpoint saved on GPU load on a CPU-only host.
            # NOTE: torch.load unpickles the file; only load trusted checkpoints.
            self.wav2vec_model.load_state_dict(torch.load('bestWav2Vec.pth', map_location=self.device))
            self.wav2vec_model.eval()
            self.wav2vec_model.to(self.device)
            # requires_grad_() updates every parameter; assigning a plain
            # `requires_grad` attribute on the module has no effect.
            self.wav2vec_model.requires_grad_(False)
            # Loaded once here rather than on every predict() call.
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.wav2vec_modelName)

        # If using text, Load the BERT model for text emotion recognition
        if self.useText:
            self.textClassifier_model_name = "michellejieli/emotion_text_classifier"
            self.tokenizer = AutoTokenizer.from_pretrained(self.textClassifier_model_name)
            self.textModel = AutoModelForSequenceClassification.from_pretrained(self.textClassifier_model_name)
            self.textModel.to(self.device)
            self.textModel.eval()
            self.textModel.requires_grad_(False)

            # Also load the dictation model
            self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            self.transcriptModel_id = "openai/whisper-large-v3-turbo"

            self.transcriptModel = AutoModelForSpeechSeq2Seq.from_pretrained(self.transcriptModel_id,
                                                                             torch_dtype=self.torch_dtype,
                                                                             low_cpu_mem_usage=True,
                                                                             use_safetensors=True)
            self.transcriptModel.to(self.device)

            self.transcriptProcessor = AutoProcessor.from_pretrained(self.transcriptModel_id)

            self.transcriptPipe = pipeline(
                "automatic-speech-recognition",
                model=self.transcriptModel,
                tokenizer=self.transcriptProcessor.tokenizer,
                feature_extractor=self.transcriptProcessor.feature_extractor,
                torch_dtype=self.torch_dtype,
                device=self.device,
            )


    def predict(self, audio_path = None):
        """Predict emotion and strength for one audio file and print the result.

        Args:
            audio_path: Path of the audio file to analyse.

        Returns:
            Tuple ``(emo_pred, strength_pred)`` of label strings;
            ``strength_pred`` is None when the audio branch is disabled.

        Raises:
            ValueError: If both branches are disabled or ``audio_path`` is
                None (checked before any audio is loaded).
        """
        if not (self.useAudio or self.useText):
            raise ValueError("At least one of useAudio or useText must be True.")
        if audio_path is None:
            raise ValueError("audio_path must be provided.")

        # 1-D NumPy waveform resampled to 16 kHz; kept as NumPy because both
        # the feature extractor and the transcription pipeline take arrays.
        audio, sr = librosa.load(audio_path, sr=16000)

        if self.useAudio:
            audioInput = self.feature_extractor(audio, sampling_rate=sr, return_tensors="pt", padding=True)
            audioInput = audioInput.to(self.device)
            with torch.no_grad():
                audio_logits_emo, audio_logits_strength = self.wav2vec_model(audioInput)
                # [0] drops the batch axis (a single recording is processed)
                audio_probs_emo = F.softmax(audio_logits_emo, dim=1)[0]
                audio_probs_strength = F.softmax(audio_logits_strength, dim=1)[0]

        if self.useText:
            transcript = self.transcriptPipe(audio)
            transcript = transcript["text"]
            inputs = self.tokenizer(transcript, return_tensors="pt", padding=True, truncation=True)
            inputs = inputs.to(self.device)
            with torch.no_grad():
                text_logits_emo = self.textModel(**inputs).logits
                text_probs_emo_raw = F.softmax(text_logits_emo, dim=1)[0]
                # Re-order the probabilities to match the output of the audio model.
                # The index mapping is an assumption about the text model's label
                # order -- check it against textModel.config.id2label.
                # torch.stack keeps the result on the same device as the inputs.
                text_emo_probs_reordered = torch.stack([text_probs_emo_raw[4],
                                                        text_probs_emo_raw[0],
                                                        text_probs_emo_raw[1],
                                                        # Adding the probability of "Surprise" to "Fear"
                                                        text_probs_emo_raw[2] + text_probs_emo_raw[6],
                                                        text_probs_emo_raw[5],
                                                        text_probs_emo_raw[3]])

        if self.useAudio and self.useText:
            # Average the probabilities from both models
            emo_probs = (audio_probs_emo + text_emo_probs_reordered) / 2
            strength_probs = audio_probs_strength
        elif self.useAudio:
            emo_probs = audio_probs_emo
            strength_probs = audio_probs_strength
        else:
            emo_probs = text_emo_probs_reordered
            strength_probs = None

        emoList = ["Neutral", "Anger", "Disgust", "Fear", "Sadness", "Happiness"]
        strengthList = ["Weak", "Medium", "Strong"]

        # Get the predicted emotion and strength
        emo_pred = emoList[torch.argmax(emo_probs).item()]
        strength_pred = strengthList[torch.argmax(strength_probs).item()] if strength_probs is not None else None

        print(f"Predicted Emotion: {emo_pred}")
        if strength_pred is not None:
            print(f"Predicted Strength: {strength_pred}")

        # Print out the prediction confidence
        for i, prob in enumerate(emo_probs):
            print(f"{emoList[i]}: {prob.item():.4f}")
        if strength_probs is not None:
            for i, prob in enumerate(strength_probs):
                print(f"{strengthList[i]}: {prob.item():.4f}")

        # Return the predicted emotion and strength for error analysis
        return emo_pred, strength_pred
