In [2]:
import librosa
import torch
import time
import pandas as pd
import soundfile as sf
import torch.nn as nn
import torch.nn.functional as F

from datetime import datetime, timezone
from transformers import AutoConfig, Wav2Vec2FeatureExtractor, HubertPreTrainedModel, HubertModel


model_name  = "xmj2002/hubert-base-ch-speech-emotion-recognition"
duration    = 15
sample_rate = 16000
model_id    = 150

config = AutoConfig.from_pretrained(
    pretrained_model_name_or_path=model_name,
)



class HubertForSpeechClassification(HubertPreTrainedModel):
    """
    Hubert model for speech classification.

    Args:
        config (HubertConfig): The model configuration class instance.

    Attributes:
        hubert (HubertModel): The Hubert model.
        classifier (HubertClassificationHead): The classification head.
    """

    def __init__(self, config):
        super().__init__(config)
        self.hubert = HubertModel(config)
        self.classifier = HubertClassificationHead(config)
        self.init_weights()

    def forward(self, x):
        """
        Forward pass of the HubertForSpeechClassification model.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output tensor.
        """
        outputs = self.hubert(x)
        hidden_states = outputs[0]
        x = torch.mean(hidden_states, dim=1)
        x = self.classifier(x)
        return x


class HubertClassificationHead(nn.Module):
    def __init__(self, config):
        """
        Initializes the HubertClassificationHead module.

        Args:
            config (object): Configuration object containing the model's hyperparameters.
        """
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_class)

    def forward(self, x):
        """
        Performs forward pass of the HubertClassificationHead module.

        Args:
            x (tensor): Input tensor.

        Returns:
            tensor: Output tensor.
        """
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = HubertForSpeechClassification.from_pretrained(
            model_name,
            config=config,
        )

Some weights of the model checkpoint at xmj2002/hubert-base-ch-speech-emotion-recognition were not used when initializing HubertForSpeechClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSpeechClassification were not initialized from the model checkpoint at xmj2002/hubert-base-ch-speech-emotion-recognition and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encod

In [4]:
def id2class(id):
    """
    Convert class ID to corresponding emotion class.

    Args:
        id (int): Class ID.

    Returns:
        str: Emotion class.
    """
    if id == 0:
        return "angry"
    elif id == 1:
        return "fear"
    elif id == 2:
        return "happy"
    elif id == 3:
        return "neutral"
    elif id == 4:
        return "sadness"
    else:
        return "excited"


def predict(audio_path):
    """
    Predict the emotion class of an audio file.

    Args:
        audio_path (str): Path to the audio file.
        processor: Feature extractor for audio processing.
        model: Speech classification model.

    Returns:
        str: Emotion class prediction.
    """
    speech, sr  = librosa.load(path=audio_path, sr=sample_rate)
    speech      = processor(speech, padding="max_length", truncation=True, max_length=duration * sr, return_tensors="pt", sampling_rate=sr).input_values

    with torch.no_grad():
        logit = model(speech)

    score   = F.softmax(logit, dim=1).detach().cpu().numpy()[0]
    id      = torch.argmax(logit).cpu().numpy()

    return id2class(id), score[id]


def predict_emotion(audio_buffer):
    """
    Predict the emotion class of an audio file.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        str: Emotion class prediction.
    """
    stress_result, confidence_value = predict(audio_buffer)
    return stress_result, confidence_value


def get_audio_duration(data):
    """
    Get the duration of an audio file.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        float: Duration of the audio file in minutes.
    """
    duration = round((len(data) / sample_rate)/60, 2)
    return duration

In [5]:
audio_path = '../data/2024-04-22-tait/14senjataapi-1_95Odpk99 1.mp3'
emotion_result, confidence_value  = predict_emotion(audio_path)
emotion_result, confidence_value

('angry', 0.80610543)

In [6]:
audio_duration  = get_audio_duration(audio_path)
audio_duration

0.0

In [10]:
class StressAnalysisGenerator:
    """
    Class for generating stress analysis results from audio files.
    """

    def __init__(self):
        self.processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        self.model = HubertForSpeechClassification.from_pretrained(
            model_name,
            config=config,
        )
    

    def id2class(self, id):
        """
        Convert class ID to corresponding emotion class.

        Args:
            id (int): Class ID.

        Returns:
            str: Emotion class.
        """
        if id == 0:
            return "angry"
        elif id == 1:
            return "fear"
        elif id == 2:
            return "happy"
        elif id == 3:
            return "neutral"
        elif id == 4:
            return "sadness"
        else:
            return "excited"


    def predict(self, audio_path):
        """
        Predict the emotion class of an audio file.

        Args:
            audio_path (str): Path to the audio file.
            processor: Feature extractor for audio processing.
            model: Speech classification model.

        Returns:
            str: Emotion class prediction.
        """
        speech, sr  = librosa.load(path=audio_path, sr=sample_rate)
        speech      = self.processor(speech, padding="max_length", truncation=True, max_length=duration * sr, return_tensors="pt", sampling_rate=sr).input_values

        with torch.no_grad():
            logit = self.model(speech)

        score   = F.softmax(logit, dim=1).detach().cpu().numpy()[0]
        id      = torch.argmax(logit).cpu().numpy()

        return self.id2class(id), score[id]
    
    
    def predict_emotion(self, audio_buffer):
        """
        Predict the emotion class of an audio file.

        Args:
            audio_path (str): Path to the audio file.

        Returns:
            str: Emotion class prediction.
        """
        stress_result, confidence_value = self.predict(audio_buffer)
        return stress_result, confidence_value


    def get_audio_duration(self, audio_path):
        """
        Get the duration of an audio file.

        Args:
            audio_path (str): Path to the audio file.

        Returns:
            float: Duration of the audio file in minutes.
        """
        data, sample_rate = sf.read(audio_path)
        duration = round((len(data) / sample_rate)/60, 2)
        return duration
    
    def transcribe(self, audio_buffer):
        """
        Transcribe an audio file and store the results in a database.

        Args:
            audio_path (str): Path to the audio file.
            correlation_id (str): Correlation ID for tracking purposes.

        Raises:
            ValueError: If the audio file format is not supported.

        """
        startimestamp   = int(time.time())
        start_time      = str(datetime.now(tz=timezone.utc))[:10] + 'T' + str(datetime.now(tz=timezone.utc))[11:19]
        emotion_result, confidence_value  = self.predict_emotion(audio_buffer)
        audio_duration  = self.get_audio_duration(audio_buffer)
        finish_time     = str(datetime.now(tz=timezone.utc))[:10] + 'T' + str(datetime.now(tz=timezone.utc))[11:19]
        duration        = round((int(time.time()) - startimestamp) / 60, 2)
        df              = pd.DataFrame([[emotion_result, round(confidence_value, 2), audio_duration, start_time, finish_time, duration, datetime.now(tz=timezone.utc)]], 
                        columns=['emotion_result', 'confidence_value', 'audio_duration', 'start_time', 'finish_time', 'sa_duration', 'inserted_at'])
        return df

In [11]:
a = StressAnalysisGenerator()
df = a.transcribe(audio_path)
df

Some weights of the model checkpoint at xmj2002/hubert-base-ch-speech-emotion-recognition were not used when initializing HubertForSpeechClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSpeechClassification were not initialized from the model checkpoint at xmj2002/hubert-base-ch-speech-emotion-recognition and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert.encod

Unnamed: 0,emotion_result,confidence_value,audio_duration,start_time,finish_time,sa_duration,inserted_at
0,angry,0.81,0.76,2024-06-12T01:24:47,2024-06-12T01:24:48,0.02,2024-06-12 01:24:48.338073+00:00
