In [None]:
# import libraries
import torch
import random
import numpy as np
from transformers import Wav2Vec2Model, Wav2Vec2FeatureExtractor
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn as nn
import torch.nn.functional as F
import librosa
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
!pip install datasets
from datasets import load_dataset
!pip install gradio
import matplotlib.pyplot as plt
import gradio as gr

In [5]:
# Reproducibility: seed every random number generator the notebook relies on
def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

  When CUDA is available, cuDNN is additionally switched to deterministic
  mode and its benchmark auto-tuner is disabled.

  Args:
    seed: Integer seed passed unchanged to every generator.
  """
  # Python standard library
  random.seed(seed)
  # NumPy legacy global generator
  np.random.seed(seed)
  # PyTorch, CPU and every CUDA device
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # The cuDNN switches are only touched on CUDA machines
  if not torch.cuda.is_available():
    return
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

set_seed(42)

# Audio-to-emotion model: frozen Wav2Vec2 backbone with two trainable MLP heads
class wav2Vec_transferModel(nn.Module):
    """Frozen Wav2Vec2 encoder followed by two classification heads.

    The encoder output is mean-pooled over the sequence dimension and fed to
    two independent MLPs:

    * ``classifier_emo``      -> 6 logits (emotion label)
    * ``classifier_strength`` -> 3 logits (emotion strength)

    Only the heads are trainable; every backbone parameter has
    ``requires_grad`` set to ``False`` and the backbone forward pass runs
    under ``torch.no_grad()``.

    NOTE: the order of layers inside each ``nn.Sequential`` determines the
    state-dict key names, so it must not change if existing checkpoints are
    to keep loading.

    Args:
        pretrained_model_name: Name or path handed to
            ``Wav2Vec2Model.from_pretrained``.
    """

    def __init__(self, pretrained_model_name="r-f/wav2vec-english-speech-emotion-recognition"):
        super().__init__()

        # Load the pre-trained backbone and freeze it; only the heads are trained.
        self.model = Wav2Vec2Model.from_pretrained(pretrained_model_name)
        for param in self.model.parameters():
            param.requires_grad = False

        # Dropout used just before the output layer of both heads.
        # nn.Dropout holds no parameters, so sharing one instance is safe.
        self.dropout = nn.Dropout(0.2)

        hidden_size = self.model.config.hidden_size

        # Head 1: emotion label (6 classes).
        self.classifier_emo = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            self.dropout,
            nn.Linear(32, 6)
        )

        # Head 2: emotion strength (3 classes).
        self.classifier_strength = nn.Sequential(
            nn.Linear(hidden_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            self.dropout,
            nn.Linear(32, 3)
        )

        # Xavier-initialise the freshly created heads.
        self.init_weights()

    def init_weights(self):
        """Xavier-initialise the Linear layers of the two classifier heads.

        Only ``classifier_emo`` and ``classifier_strength`` are visited.
        Iterating over ``self.modules()`` would also reach every ``nn.Linear``
        inside the pre-trained backbone and overwrite its weights.
        """
        for head in (self.classifier_emo, self.classifier_strength):
            for layer in head.modules():
                if isinstance(layer, nn.Linear):
                    nn.init.xavier_uniform_(layer.weight)
                    if layer.bias is not None:
                        nn.init.zeros_(layer.bias)

    def forward(self, inputs):
        """Compute emotion and strength logits for a batch of audio features.

        Args:
            inputs: Mapping with key ``'input_values'`` and, optionally,
                ``'attention_mask'``. Dimension 1 of each tensor is squeezed
                if it has size 1.

        Returns:
            Tuple ``(logits_emo, logits_strength)``; the last dimension has
            size 6 and 3 respectively.
        """
        input_values = inputs['input_values'].squeeze(1)

        # The mask is optional: not every feature-extractor config emits one.
        attention_mask = inputs['attention_mask'] if 'attention_mask' in inputs else None
        if attention_mask is not None:
            attention_mask = attention_mask.squeeze(1)

        # The backbone is frozen, so no autograd graph is needed for it.
        with torch.no_grad():
            outputs = self.model(input_values=input_values, attention_mask=attention_mask)
            # last_hidden_state is averaged over dimension 1 (the sequence
            # axis) to obtain one fixed-size vector per example.
            last_hidden_state_pooled = outputs.last_hidden_state.mean(1)

        logits_emo = self.classifier_emo(last_hidden_state_pooled)
        logits_strength = self.classifier_strength(last_hidden_state_pooled)

        return logits_emo, logits_strength

# Emotion detection system combining an audio model and a transcript-based text model
class emotionDetectionSystem():
    """Predict an emotion (and its strength) from a speech recording.

    Two optional branches are available:

    * audio: ``wav2Vec_transferModel`` produces emotion and strength
      probabilities directly from the waveform;
    * text: the recording is transcribed by a Whisper ASR pipeline and the
      transcript is classified by a sequence-classification model.

    When both branches are enabled the emotion probabilities are averaged;
    the strength always comes from the audio branch.

    Args:
        useAudio: Enable the audio branch.
        useText: Enable the transcript/text branch.
    """

    # Sampling rate (Hz) handed to the feature extractor; incoming audio is
    # resampled to this rate before either branch sees it.
    TARGET_SR = 16000

    def __init__(self, useAudio = True, useText = True):
        self.useAudio = useAudio
        self.useText = useText
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

        # Audio branch: fine-tuned Wav2Vec2 classifier plus its feature extractor.
        if self.useAudio:
            self.wav2vec_modelName = "r-f/wav2vec-english-speech-emotion-recognition"
            self.wav2vec_model = wav2Vec_transferModel()
            # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
            self.wav2vec_model.load_state_dict(
                torch.load('bestWav2Vec_balanced.pth', map_location=self.device)
            )
            self.wav2vec_model.eval()
            self.wav2vec_model.to(self.device)
            # requires_grad_() actually updates the parameters; assigning a
            # `requires_grad` attribute on the module has no effect.
            self.wav2vec_model.requires_grad_(False)
            # Loaded once here instead of on every predict() call.
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.wav2vec_modelName)

        # Text branch: text emotion classifier plus the ASR model that feeds it.
        if self.useText:
            self.textClassifier_model_name = "michellejieli/emotion_text_classifier"
            self.tokenizer = AutoTokenizer.from_pretrained(self.textClassifier_model_name)
            self.textModel = AutoModelForSequenceClassification.from_pretrained(self.textClassifier_model_name)
            self.textModel.to(self.device)
            self.textModel.eval()
            self.textModel.requires_grad_(False)

            # Speech-to-text model; half precision is used only on CUDA.
            self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            self.transcriptModel_id = "openai/whisper-medium"

            self.transcriptModel = AutoModelForSpeechSeq2Seq.from_pretrained(self.transcriptModel_id,
                                                                             torch_dtype=self.torch_dtype,
                                                                             low_cpu_mem_usage=True,
                                                                             use_safetensors=True)
            self.transcriptModel.to(self.device)

            self.transcriptProcessor = AutoProcessor.from_pretrained(self.transcriptModel_id)

            self.transcriptPipe = pipeline(
                "automatic-speech-recognition",
                model=self.transcriptModel,
                tokenizer=self.transcriptProcessor.tokenizer,
                feature_extractor=self.transcriptProcessor.feature_extractor,
                torch_dtype=self.torch_dtype,
                device=self.device,
            )

    @staticmethod
    def _prepare_audio(input, target_sr = 16000):
        """Turn a ``(sample_rate, samples)`` tuple into mono float32 audio at ``target_sr``.

        Integer PCM is scaled by the dtype's full-scale magnitude, 2-D input
        is averaged over axis 1 (assumes a (samples, channels) layout, as the
        Gradio Audio component documents for multi-channel audio - TODO
        confirm for other callers), and the signal is resampled when its
        rate differs from ``target_sr``.

        Raises:
            ValueError: If the recording contains no samples.
        """
        sr, audio = input
        audio = np.asarray(audio)
        if audio.size == 0:
            raise ValueError("The audio input is empty.")

        if np.issubdtype(audio.dtype, np.integer):
            info = np.iinfo(audio.dtype)
            audio = audio.astype(np.float32) / float(max(abs(info.min), info.max))
        else:
            audio = audio.astype(np.float32)

        if audio.ndim > 1:
            audio = audio.mean(axis=1)

        if sr != target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

        return np.ascontiguousarray(audio, dtype=np.float32)

    def predict(self, input = None):
        """Classify the emotion expressed in one recording.

        Args:
            input: Tuple ``(sample_rate, samples)`` where ``samples`` is a
                NumPy array (the format unpacked by this method; the Gradio
                Audio component is the caller in this notebook).

        Returns:
            ``(pred_dict, strength_pred)``: ``pred_dict`` maps each emotion
            name to its probability as a float; ``strength_pred`` is one of
            "Weak"/"Medium"/"Strong", or ``None`` when the audio branch is
            disabled.

        Raises:
            ValueError: If both branches are disabled, if ``input`` is
                ``None``, or if the recording is empty.
        """
        if not (self.useAudio or self.useText):
            raise ValueError("At least one of useAudio or useText must be True.")
        if input is None:
            raise ValueError("No audio input provided; expected a (sample_rate, samples) tuple.")

        audio = self._prepare_audio(input, self.TARGET_SR)

        if self.useAudio:
            audioInput = self.feature_extractor(audio, sampling_rate=self.TARGET_SR, return_tensors="pt", padding=True)
            audioInput = audioInput.to(self.device)
            with torch.no_grad():
                audio_logits_emo, audio_logits_strength = self.wav2vec_model(audioInput)
                audio_probs_emo = F.softmax(audio_logits_emo, dim=1)
                audio_probs_strength = F.softmax(audio_logits_strength, dim=1)

        if self.useText:
            transcript = self.transcriptPipe(audio)["text"]
            inputs = self.tokenizer(transcript, return_tensors="pt", padding=True, truncation=True)
            inputs = inputs.to(self.device)
            with torch.no_grad():
                text_logits_emo = self.textModel(**inputs).logits
                text_probs_emo_raw = F.softmax(text_logits_emo, dim=1)[0]
                # Re-order the text probabilities to the audio model's label order.
                # NOTE(review): assumes the text model's ids are 0=anger,
                # 1=disgust, 2=fear, 3=joy, 4=neutral, 5=sadness, 6=surprise -
                # confirm against self.textModel.config.id2label.
                # Kept 2-D (1, 6) so it has the same layout as the audio
                # probabilities and text-only mode can be indexed with [0].
                text_emo_probs_reordered = torch.stack([
                    text_probs_emo_raw[4],
                    text_probs_emo_raw[0],
                    text_probs_emo_raw[1],
                    # "Surprise" has no audio counterpart; it is folded into "Fear".
                    text_probs_emo_raw[2] + text_probs_emo_raw[6],
                    text_probs_emo_raw[5],
                    text_probs_emo_raw[3],
                ]).unsqueeze(0)

        if self.useAudio and self.useText:
            # Average the probabilities from both models
            emo_probs = (audio_probs_emo + text_emo_probs_reordered) / 2
            strength_probs = audio_probs_strength
        elif self.useAudio:
            emo_probs = audio_probs_emo
            strength_probs = audio_probs_strength
        else:
            emo_probs = text_emo_probs_reordered
            strength_probs = None

        emoList = ["Neutral", "Anger", "Disgust", "Fear", "Sadness", "Happiness"]
        strengthList = ["Weak", "Medium", "Strong"]

        # Get the predicted emotion and strength
        emo_pred = emoList[torch.argmax(emo_probs).item()]
        strength_pred = strengthList[torch.argmax(strength_probs).item()] if strength_probs is not None else None

        print(f"Predicted Emotion: {emo_pred}")
        if strength_pred is not None:
            print(f"Predicted Strength: {strength_pred}")

        pred_dict = {label: float(prob) for label, prob in zip(emoList, emo_probs[0])}

        return pred_dict, strength_pred

# A single detector is created at module level; the callback below reuses it.
emoclassifier = emotionDetectionSystem()


def emotionClassify(p):
  """Gradio callback: pass the audio payload `p` to the shared detector.

  Returns whatever `emoclassifier.predict` returns, i.e. the pair
  (emotion-probability dict, strength label or None).
  """
  prediction = emoclassifier.predict(p)
  return prediction


# Input and output widgets of the demo
input_audio = gr.Audio(label="Upload Audio for Emotion Classification")
output_label = gr.Label(label = 'Predicted Emotions Probablities')

# Wire the callback to the widgets: one audio input, two outputs
# (probability label widget and a textbox for the strength).
demo = gr.Interface(
    fn=emotionClassify,
    inputs=input_audio,
    outputs=[
        output_label,
        gr.Textbox(label="Prediction Strength"),
    ],
)

In [55]:
# Start the Gradio app defined above. Per the log captured below, on a hosted
# notebook Gradio enables `share=True` automatically, and on Colab the cell
# keeps running until it is interrupted.
demo.launch()

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://724b5fabbe083c27e7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Predicted Emotion: Neutral
Predicted Strength: Strong
Predicted Emotion: Neutral
Predicted Strength: Strong
Predicted Emotion: Neutral
Predicted Strength: Strong
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://724b5fabbe083c27e7.gradio.live


