In [None]:
from funasr import AutoModel

model = AutoModel(model="FunAudioLLM/SenseVoiceSmall", hub="hf", device="cpu")

funasr version: 1.2.7.
Check update of funasr, and it would cost few times. You may disable it by set `disable_update=True` in AutoModel
You are using the latest version of funasr-1.2.7


Fetching 29 files: 100%|█████████████████████| 29/29 [00:00<00:00, 60304.82it/s]

Detect model requirements, begin to install it: /Users/madhusiddharthsuthagar/.cache/huggingface/hub/models--FunAudioLLM--SenseVoiceSmall/snapshots/3eb3b4eeffc2f2dde6051b853983753db33e35c3/requirements.txt





install model requirements successfully


In [72]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torch
import librosa
import math
import numpy as np

# -----------------------------
# Load model and feature extractor
# -----------------------------
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "r-f/wav2vec-english-speech-emotion-recognition"
)
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "r-f/wav2vec-english-speech-emotion-recognition"
)

model.eval()

# -----------------------------
# Load audio
# -----------------------------
audio_file = "/Users/madhusiddharthsuthagar/Downloads/input_9.mp3"
waveform, rate = librosa.load(audio_file, sr=16000)
total_duration = len(waveform) / rate

# -----------------------------
# Split into non-overlapping 3-second chunks
# -----------------------------
chunk_duration = 2 # seconds
chunk_size = int(chunk_duration * rate)
num_chunks = math.ceil(len(waveform) / chunk_size)

chunks = []
chunk_times = []
for i in range(num_chunks):
    start_sample = i * chunk_size
    end_sample = start_sample + chunk_size
    chunk = waveform[start_sample:end_sample]
    
    # Skip silent/very quiet chunks
    rms = np.sqrt(np.mean(chunk**2))
    if len(chunk) > 0 and rms > 0.01:
        chunks.append(chunk)
        chunk_times.append((start_sample / rate, min(end_sample / rate, total_duration)))

# -----------------------------
# Run inference for each chunk
# -----------------------------
previous_emotion = None
confidence_threshold = 0.5  # adjust as needed

for start, end, chunk in zip([t[0] for t in chunk_times], [t[1] for t in chunk_times], chunks):
    inputs = feature_extractor(chunk, sampling_rate=rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.softmax(logits.squeeze(0), dim=-1)
    top_prob, predicted_id = torch.max(probs, dim=-1)
    emotion = model.config.id2label[predicted_id.item()]

    # Only update if confidence is above threshold
    if top_prob.item() >= confidence_threshold:
        previous_emotion = emotion
    elif previous_emotion is not None:
        emotion = previous_emotion  # keep last confident emotion

    print(f"{int(start)}-{int(end)} sec: {emotion} (confidence: {top_prob.item():.2f})")

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at r-f/wav2vec-english-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0-2 sec: disgust (confidence: 0.16)
2-4 sec: disgust (confidence: 0.15)
4-4 sec: sad (confidence: 0.15)


In [85]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torch
import librosa
import math
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# -----------------------------
# Multiple Model Ensemble Approach
# -----------------------------

class MultiModelEmotionRecognizer:
    def __init__(self):
        self.models = {}
        self.feature_extractors = {}
        self.model_weights = {}
        
        # Load multiple models for ensemble
        self.load_models()
    
    def load_models(self):
        """Load multiple pre-trained models"""
        model_configs = [
            {
                'name': 'r-f/wav2vec-english-speech-emotion-recognition',
                'weight': 0.4
            },
            {
                'name': 'ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition',
                'weight': 0.35
            },
            {
                'name': 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP',
                'weight': 0.25
            }
        ]
        
        for config in model_configs:
            try:
                model_name = config['name']
                print(f"Loading {model_name}...")
                
                self.feature_extractors[model_name] = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
                self.models[model_name] = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
                self.models[model_name].eval()
                self.model_weights[model_name] = config['weight']
                
                print(f"✓ Loaded {model_name}")
            except Exception as e:
                print(f"✗ Failed to load {model_name}: {e}")
    
    def extract_advanced_features(self, audio_chunk, sr):
        """Extract additional features for robustness"""
        features = {}
        
        # Spectral features
        spectral_centroids = librosa.feature.spectral_centroid(y=audio_chunk, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)
        features['spectral_centroid_std'] = np.std(spectral_centroids)
        
        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(audio_chunk)[0]
        features['zcr_mean'] = np.mean(zcr)
        features['zcr_std'] = np.std(zcr)
        
        # Energy features
        rms_energy = librosa.feature.rms(y=audio_chunk)[0]
        features['rms_mean'] = np.mean(rms_energy)
        features['rms_std'] = np.std(rms_energy)
        
        # Pitch features
        pitches, magnitudes = librosa.core.piptrack(y=audio_chunk, sr=sr, threshold=0.1)
        pitch_values = []
        for t in range(pitches.shape[1]):
            index = magnitudes[:, t].argmax()
            pitch = pitches[index, t]
            if pitch > 0:
                pitch_values.append(pitch)
        
        if pitch_values:
            features['pitch_mean'] = np.mean(pitch_values)
            features['pitch_std'] = np.std(pitch_values)
        else:
            features['pitch_mean'] = 0
            features['pitch_std'] = 0
        
        return features
    
    def is_speech_segment(self, audio_chunk, sr):
        """Improved speech detection"""
        # Energy-based detection
        rms = np.sqrt(np.mean(audio_chunk**2))
        if rms < 0.005:  # Very quiet
            return False
        
        # Zero crossing rate (speech typically has moderate ZCR)
        zcr = librosa.feature.zero_crossing_rate(audio_chunk)[0]
        zcr_mean = np.mean(zcr)
        if zcr_mean > 0.35:  # Too noisy/high-frequency
            return False
        
        # Spectral features
        spectral_centroids = librosa.feature.spectral_centroid(y=audio_chunk, sr=sr)[0]
        spec_centroid_mean = np.mean(spectral_centroids)
        if spec_centroid_mean < 500 or spec_centroid_mean > 8000:  # Outside typical speech range
            return False
        
        return True
    
    def predict_emotion(self, audio_chunk, sr):
        """Predict emotion using ensemble of models"""
        if not self.is_speech_segment(audio_chunk, sr):
            return None, 0.0
        
        predictions = {}
        confidences = {}
        
        # Get predictions from each model
        for model_name in self.models.keys():
            try:
                inputs = self.feature_extractors[model_name](
                    audio_chunk, 
                    sampling_rate=sr, 
                    return_tensors="pt", 
                    padding=True
                )
                
                with torch.no_grad():
                    logits = self.models[model_name](**inputs).logits
                
                probs = torch.softmax(logits.squeeze(0), dim=-1)
                top_prob, predicted_id = torch.max(probs, dim=-1)
                
                emotion = self.models[model_name].config.id2label[predicted_id.item()]
                predictions[model_name] = emotion
                confidences[model_name] = top_prob.item()
                
            except Exception as e:
                print(f"Error with {model_name}: {e}")
                continue
        
        if not predictions:
            return None, 0.0
        
        # Weighted ensemble voting
        emotion_scores = {}
        total_weight = 0
        
        for model_name, emotion in predictions.items():
            weight = self.model_weights[model_name] * confidences[model_name]
            if emotion not in emotion_scores:
                emotion_scores[emotion] = 0
            emotion_scores[emotion] += weight
            total_weight += weight
        
        if total_weight == 0:
            return None, 0.0
        
        # Normalize scores
        for emotion in emotion_scores:
            emotion_scores[emotion] /= total_weight
        
        # Get final prediction
        final_emotion = max(emotion_scores.items(), key=lambda x: x[1])
        return final_emotion[0], final_emotion[1]

def process_audio_advanced(audio_file):
    """Advanced audio processing with improved robustness"""
    
    # Initialize recognizer
    recognizer = MultiModelEmotionRecognizer()
    
    # Load audio with preprocessing
    waveform, rate = librosa.load(audio_file, sr=16000)
    
    # Audio preprocessing for robustness
    # Normalize audio
    waveform = librosa.util.normalize(waveform)
    
    # Remove silence from beginning and end
    waveform, _ = librosa.effects.trim(waveform, top_db=20)
    
    total_duration = len(waveform) / rate
    print(f"Processing audio: {total_duration:.2f} seconds")
    
    # Dynamic chunking based on voice activity
    chunk_duration = 1  # seconds
    overlap_duration = 0.5  # seconds overlap
    chunk_size = int(chunk_duration * rate)
    overlap_size = int(overlap_duration * rate)
    
    chunks = []
    chunk_times = []
    
    # Create overlapping chunks
    step_size = chunk_size - overlap_size
    num_chunks = math.ceil((len(waveform) - chunk_size) / step_size) + 1
    
    for i in range(num_chunks):
        start_sample = i * step_size
        end_sample = min(start_sample + chunk_size, len(waveform))
        
        if end_sample - start_sample < chunk_size // 2:  # Skip very short chunks
            break
            
        chunk = waveform[start_sample:end_sample]
        
        # Pad short chunks
        if len(chunk) < chunk_size:
            chunk = np.pad(chunk, (0, chunk_size - len(chunk)), mode='constant')
        
        chunks.append(chunk)
        chunk_times.append((start_sample / rate, min(end_sample / rate, total_duration)))
    
    # Emotion tracking with smoothing
    emotions_history = []
    confidences_history = []
    emotion_sequence = []
    
    # Process each chunk
    for i, (start_time, end_time, chunk) in enumerate(zip([t[0] for t in chunk_times], 
                                                         [t[1] for t in chunk_times], 
                                                         chunks)):
        
        emotion, confidence = recognizer.predict_emotion(chunk, rate)
        
        if emotion is None:
            print(f"{start_time:6.1f}-{end_time:6.1f}s: [SILENCE]")
            continue
        
        emotions_history.append(emotion)
        confidences_history.append(confidence)
        
        # Temporal smoothing - consider previous predictions
        if len(emotions_history) >= 3:
            # Use mode of last 3 predictions if confidence is low
            if confidence < 0.6:
                recent_emotions = emotions_history[-3:]
                emotion = stats.mode(recent_emotions)[0]
                confidence = np.mean(confidences_history[-3:])
        
        emotion_sequence.append({
            'start': start_time,
            'end': end_time,
            'emotion': emotion,
            'confidence': confidence
        })
        
        print(f"{start_time:6.1f}-{end_time:6.1f}s: {emotion:12} (conf: {confidence:.3f})")
    
    return emotion_sequence

# -----------------------------
# Usage
# -----------------------------
if __name__ == "__main__":
    audio_file = "/Users/madhusiddharthsuthagar/Downloads/input_9.mp3"
    
    print("Starting advanced emotion recognition...")
    results = process_audio_advanced(audio_file)
    
    print("\n" + "="*50)
    print("EMOTION TIMELINE SUMMARY")
    print("="*50)
    
    for result in results:
        print(f"{result['start']:6.1f}-{result['end']:6.1f}s: {result['emotion']:12} ({result['confidence']:.3f})")
    
    # Overall statistics
    if results:
        emotions = [r['emotion'] for r in results]
        confidences = [r['confidence'] for r in results]
        
        print(f"\nOverall Statistics:")
        print(f"Average confidence: {np.mean(confidences):.3f}")
        print(f"Dominant emotion: {stats.mode(emotions)[0]}")
        print(f"Emotion changes: {len(set(emotions))} different emotions detected")

Starting advanced emotion recognition...
Loading r-f/wav2vec-english-speech-emotion-recognition...


Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at r-f/wav2vec-english-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Loaded r-f/wav2vec-english-speech-emotion-recognition
Loading ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition...


Some weights of the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition were not used when initializing Wav2Vec2ForSequenceClassification: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.output.bias', 'classifier.output.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', '

✓ Loaded ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition
Loading speechbrain/emotion-recognition-wav2vec2-IEMOCAP...
✗ Failed to load speechbrain/emotion-recognition-wav2vec2-IEMOCAP: Can't load feature extractor for 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'speechbrain/emotion-recognition-wav2vec2-IEMOCAP' is the correct path to a directory containing a preprocessor_config.json file
Processing audio: 4.38 seconds
   0.0-   1.0s: happy        (conf: 0.589)
   0.5-   1.5s: happy        (conf: 0.574)
   1.0-   2.0s: neutral      (conf: 1.000)
   1.5-   2.5s: neutral      (conf: 1.000)


DTypePromotionError: The DType <class 'numpy._FloatAbstractDType'> could not be promoted by <class 'numpy.dtypes.StrDType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.StrDType'>, <class 'numpy._FloatAbstractDType'>)

In [114]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torch
import librosa
import math
import numpy as np
from collections import Counter

# -----------------------------
# Load model and feature extractor
# -----------------------------
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
    "r-f/wav2vec-english-speech-emotion-recognition"
)
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "r-f/wav2vec-english-speech-emotion-recognition"
)
model.eval()

# -----------------------------
# Load audio
# -----------------------------
audio_file = "/Users/madhusiddharthsuthagar/Downloads/input_8.mp3"
waveform, rate = librosa.load(audio_file, sr=16000)

# Normalize and trim silence
waveform = librosa.util.normalize(waveform)
waveform, _ = librosa.effects.trim(waveform, top_db=20)

total_duration = len(waveform) / rate

# -----------------------------
# Split into chunks with overlap
# -----------------------------
chunk_duration = 1  # seconds - longer for better context
overlap_duration = 0.5  # seconds overlap
chunk_size = int(chunk_duration * rate)
overlap_size = int(overlap_duration * rate)

chunks = []
chunk_times = []

# Create overlapping chunks
step_size = chunk_size - overlap_size
num_chunks = math.ceil((len(waveform) - chunk_size) / step_size) + 1

for i in range(num_chunks):
    start_sample = i * step_size
    end_sample = min(start_sample + chunk_size, len(waveform))
    
    if end_sample - start_sample < chunk_size // 2:
        break
    
    chunk = waveform[start_sample:end_sample]
    
    # Pad short chunks
    if len(chunk) < chunk_size:
        chunk = np.pad(chunk, (0, chunk_size - len(chunk)), mode='constant')
    
    # Better speech detection
    rms = np.sqrt(np.mean(chunk**2))
    zcr = np.mean(librosa.feature.zero_crossing_rate(chunk)[0])
    
    # Skip if too quiet or too noisy
    if rms > 0.008 and zcr < 0.3:
        chunks.append(chunk)
        chunk_times.append((start_sample / rate, min(end_sample / rate, total_duration)))

# -----------------------------
# Run inference for each chunk with smoothing
# -----------------------------
emotion_history = []
confidence_threshold = 0.5

for i, (start, end, chunk) in enumerate(zip([t[0] for t in chunk_times], [t[1] for t in chunk_times], chunks)):
    inputs = feature_extractor(chunk, sampling_rate=rate, return_tensors="pt", padding=True)
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    probs = torch.softmax(logits.squeeze(0), dim=-1)
    top_prob, predicted_id = torch.max(probs, dim=-1)
    emotion = model.config.id2label[predicted_id.item()]
    confidence = top_prob.item()
    
    # Temporal smoothing - use most common emotion from recent history if confidence is low
    if len(emotion_history) >= 3 and confidence < confidence_threshold:
        recent_emotions = emotion_history[-3:]
        emotion_counts = Counter(recent_emotions)
        smoothed_emotion = emotion_counts.most_common(1)[0][0]
        print(f"{int(start):3d}-{int(end):3d} sec: {smoothed_emotion:12} (smoothed, orig: {emotion}, conf: {confidence:.2f})")
        emotion = smoothed_emotion
    else:
        print(f"{int(start):3d}-{int(end):3d} sec: {emotion:12} (conf: {confidence:.2f})")
    
    emotion_history.append(emotion)

# Summary
if emotion_history:
    emotion_counts = Counter(emotion_history)
    print(f"\nSummary:")
    print(f"Most common emotion: {emotion_counts.most_common(1)[0][0]}")
    print(f"Emotion distribution: {dict(emotion_counts)}")

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at r-f/wav2vec-english-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0-  1 sec: disgust      (conf: 0.15)
  0-  1 sec: disgust      (conf: 0.15)
  1-  2 sec: angry        (conf: 0.15)
  1-  2 sec: disgust      (smoothed, orig: angry, conf: 0.17)
  2-  3 sec: disgust      (smoothed, orig: angry, conf: 0.16)
  2-  3 sec: disgust      (smoothed, orig: angry, conf: 0.16)
  3-  4 sec: disgust      (smoothed, orig: angry, conf: 0.17)
  3-  4 sec: disgust      (smoothed, orig: angry, conf: 0.16)
  4-  5 sec: disgust      (smoothed, orig: disgust, conf: 0.15)
  4-  5 sec: disgust      (smoothed, orig: disgust, conf: 0.15)
  5-  6 sec: disgust      (smoothed, orig: disgust, conf: 0.15)
  5-  6 sec: disgust      (smoothed, orig: angry, conf: 0.15)
  6-  7 sec: disgust      (smoothed, orig: happy, conf: 0.15)
  6-  7 sec: disgust      (smoothed, orig: disgust, conf: 0.15)

Summary:
Most common emotion: disgust
Emotion distribution: {'disgust': 13, 'angry': 1}


In [135]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torch
import librosa
import math
import numpy as np
from collections import Counter, deque
import warnings
warnings.filterwarnings('ignore')

# -----------------------------
# Advanced Emotion Recognition with Better Temporal Logic
# -----------------------------

class AdaptiveEmotionRecognizer:
    def __init__(self, model_name="r-f/wav2vec-english-speech-emotion-recognition"):
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        self.model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
        self.model.eval()
        
        # Adaptive parameters
        self.high_confidence_threshold = 0.7
        self.medium_confidence_threshold = 0.5
        self.low_confidence_threshold = 0.3
        
        # Temporal state
        self.recent_predictions = deque(maxlen=5)  # Keep last 5 predictions
        self.confidence_history = deque(maxlen=5)
        
        # Emotion transition probabilities (learned from data)
        # These help determine if an emotion change is realistic
        self.transition_weights = {
            'happy': {'happy': 1.0, 'neutral': 0.8, 'excited': 0.9, 'sad': 0.2, 'angry': 0.1},
            'sad': {'sad': 1.0, 'neutral': 0.7, 'happy': 0.3, 'angry': 0.4, 'fear': 0.6},
            'angry': {'angry': 1.0, 'neutral': 0.6, 'sad': 0.5, 'happy': 0.2, 'disgust': 0.7},
            'neutral': {'neutral': 1.0, 'happy': 0.8, 'sad': 0.6, 'angry': 0.5, 'fear': 0.4},
            'fear': {'fear': 1.0, 'sad': 0.7, 'neutral': 0.6, 'angry': 0.4, 'surprise': 0.8},
            'surprise': {'surprise': 1.0, 'happy': 0.8, 'neutral': 0.7, 'fear': 0.5, 'excited': 0.9},
            'disgust': {'disgust': 1.0, 'angry': 0.8, 'neutral': 0.6, 'sad': 0.5, 'fear': 0.4}
        }
    
    def is_valid_speech(self, audio_chunk, sr):
        """Enhanced speech validation"""
        # Energy check
        rms = np.sqrt(np.mean(audio_chunk**2))
        if rms < 0.005:
            return False, "too_quiet"
        
        # Zero crossing rate
        zcr = np.mean(librosa.feature.zero_crossing_rate(audio_chunk)[0])
        if zcr > 0.4:
            return False, "too_noisy"
        
        # Spectral features for speech detection
        try:
            mfccs = librosa.feature.mfcc(y=audio_chunk, sr=sr, n_mfcc=13)
            spectral_centroid = librosa.feature.spectral_centroid(y=audio_chunk, sr=sr)[0]
            
            # Speech typically has certain MFCC patterns
            mfcc_mean = np.mean(mfccs[1:4])  # Focus on formant frequencies
            spec_centroid_mean = np.mean(spectral_centroid)
            
            # Heuristic ranges for speech
            if spec_centroid_mean < 300 or spec_centroid_mean > 8000:
                return False, "non_speech_spectrum"
                
            if abs(mfcc_mean) > 50:  # Extreme MFCC values often indicate non-speech
                return False, "non_speech_mfcc"
                
        except:
            pass  # If feature extraction fails, continue with basic checks
        
        return True, "valid"
    
    def get_emotion_confidence_score(self, current_emotion, current_confidence, recent_context):
        """Calculate adjusted confidence based on temporal context"""
        
        if len(recent_context) == 0:
            return current_confidence
        
        # Get recent emotions and confidences
        recent_emotions = [pred['emotion'] for pred in recent_context]
        recent_confidences = [pred['confidence'] for pred in recent_context]
        
        # Base confidence adjustment
        adjusted_confidence = current_confidence
        
        # If current prediction is very different from recent trend, reduce confidence
        if len(recent_emotions) >= 2:
            # Count how many recent predictions match current
            matches = sum(1 for e in recent_emotions[-3:] if e == current_emotion)
            consistency_bonus = matches / min(3, len(recent_emotions))
            
            # Apply transition probability
            last_emotion = recent_emotions[-1]
            if last_emotion in self.transition_weights and current_emotion in self.transition_weights[last_emotion]:
                transition_prob = self.transition_weights[last_emotion][current_emotion]
                adjusted_confidence *= (0.5 + 0.5 * transition_prob)  # Scale between 0.5-1.0
            
            # Consistency adjustment
            adjusted_confidence = (adjusted_confidence + consistency_bonus * 0.3) / 1.3
        
        return min(adjusted_confidence, 1.0)
    
    def predict_with_context(self, audio_chunk, sr):
        """Predict emotion with temporal context awareness"""
        
        # Check if it's valid speech
        is_valid, reason = self.is_valid_speech(audio_chunk, sr)
        if not is_valid:
            return None, 0.0, reason
        
        # Get model prediction
        try:
            inputs = self.feature_extractor(audio_chunk, sampling_rate=sr, return_tensors="pt", padding=True)
            
            with torch.no_grad():
                logits = self.model(**inputs).logits
            
            probs = torch.softmax(logits.squeeze(0), dim=-1)
            
            # Get top 3 predictions for more nuanced decision making
            top3_probs, top3_ids = torch.topk(probs, 3)
            top3_emotions = [self.model.config.id2label[idx.item()] for idx in top3_ids]
            
            primary_emotion = top3_emotions[0]
            primary_confidence = top3_probs[0].item()
            
            # Adjust confidence based on context
            context_confidence = self.get_emotion_confidence_score(
                primary_emotion, 
                primary_confidence, 
                list(self.recent_predictions)
            )
            
            # If primary prediction has low confidence, consider alternatives
            if context_confidence < self.medium_confidence_threshold and len(top3_emotions) > 1:
                # Check if second choice is more consistent with recent context
                secondary_emotion = top3_emotions[1]
                secondary_confidence = top3_probs[1].item()
                
                if len(self.recent_predictions) >= 2:
                    recent_emotions = [pred['emotion'] for pred in list(self.recent_predictions)[-3:]]
                    secondary_matches = sum(1 for e in recent_emotions if e == secondary_emotion)
                    primary_matches = sum(1 for e in recent_emotions if e == primary_emotion)
                    
                    # If secondary emotion is much more consistent, use it
                    if secondary_matches > primary_matches and secondary_confidence > 0.3:
                        return secondary_emotion, secondary_confidence, "secondary_choice"
            
            return primary_emotion, context_confidence, "primary"
            
        except Exception as e:
            return None, 0.0, f"prediction_error: {e}"
    
    def process_audio(self, audio_file):
        """Process entire audio file with adaptive temporal modeling"""
        
        # Load and preprocess audio
        waveform, rate = librosa.load(audio_file, sr=16000)
        waveform = librosa.util.normalize(waveform)
        waveform, _ = librosa.effects.trim(waveform, top_db=20)
        
        total_duration = len(waveform) / rate
        print(f"Processing audio: {total_duration:.2f} seconds")
        
        # Adaptive chunking - smaller chunks for more granular detection
        chunk_duration = 1  # Shorter chunks
        overlap_duration = 0.5
        chunk_size = int(chunk_duration * rate)
        overlap_size = int(overlap_duration * rate)
        
        chunks = []
        chunk_times = []
        
        step_size = chunk_size - overlap_size
        num_chunks = math.ceil((len(waveform) - chunk_size) / step_size) + 1
        
        for i in range(num_chunks):
            start_sample = i * step_size
            end_sample = min(start_sample + chunk_size, len(waveform))
            
            if end_sample - start_sample < chunk_size // 3:  # More lenient minimum chunk size
                break
            
            chunk = waveform[start_sample:end_sample]
            
            if len(chunk) < chunk_size:
                chunk = np.pad(chunk, (0, chunk_size - len(chunk)), mode='constant')
            
            chunks.append(chunk)
            chunk_times.append((start_sample / rate, min(end_sample / rate, total_duration)))
        
        # Process chunks
        results = []
        
        for i, (start_time, end_time, chunk) in enumerate(zip([t[0] for t in chunk_times], 
                                                             [t[1] for t in chunk_times], 
                                                             chunks)):
            
            emotion, confidence, reason = self.predict_with_context(chunk, rate)
            
            if emotion is None:
                print(f"{start_time:6.1f}-{end_time:6.1f}s: [SKIP: {reason}]")
                continue
            
            # Add to recent predictions for context
            prediction = {
                'emotion': emotion,
                'confidence': confidence,
                'start': start_time,
                'end': end_time,
                'reason': reason
            }
            
            self.recent_predictions.append(prediction)
            self.confidence_history.append(confidence)
            
            # Display with context info
            context_info = ""
            if len(self.recent_predictions) > 1:
                recent_emotions = [p['emotion'] for p in list(self.recent_predictions)[-3:]]
                if len(set(recent_emotions)) == 1:
                    context_info = " [CONSISTENT]"
                else:
                    context_info = f" [CHANGE from {self.recent_predictions[-2]['emotion']}]"
            
            print(f"{start_time:6.1f}-{end_time:6.1f}s: {emotion:12} (conf: {confidence:.3f}){context_info}")
            
            results.append(prediction)
        
        return results

# -----------------------------
# Usage
# -----------------------------
def analyze_audio(audio_file):
    recognizer = AdaptiveEmotionRecognizer()
    results = recognizer.process_audio(audio_file)
    
    if not results:
        print("No valid speech segments detected.")
        return
    
    print("\n" + "="*60)
    print("TEMPORAL EMOTION ANALYSIS")
    print("="*60)
    
    # Segment the timeline into emotion phases
    emotion_phases = []
    current_phase = None
    
    for result in results:
        if current_phase is None or current_phase['emotion'] != result['emotion']:
            if current_phase is not None:
                emotion_phases.append(current_phase)
            
            current_phase = {
                'emotion': result['emotion'],
                'start': result['start'],
                'end': result['end'],
                'confidences': [result['confidence']],
                'duration': 0
            }
        else:
            current_phase['end'] = result['end']
            current_phase['confidences'].append(result['confidence'])
    
    if current_phase is not None:
        emotion_phases.append(current_phase)
    
    # Calculate durations and display phases
    print("\nEMOTION PHASES:")
    for phase in emotion_phases:
        phase['duration'] = phase['end'] - phase['start']
        avg_confidence = np.mean(phase['confidences'])
        print(f"{phase['start']:6.1f}-{phase['end']:6.1f}s ({phase['duration']:4.1f}s): {phase['emotion']:12} (avg conf: {avg_confidence:.3f})")
    
    # Overall statistics
    total_duration = results[-1]['end'] - results[0]['start']
    emotion_durations = {}
    
    for phase in emotion_phases:
        emotion = phase['emotion']
        if emotion not in emotion_durations:
            emotion_durations[emotion] = 0
        emotion_durations[emotion] += phase['duration']
    
    print(f"\nOVERALL EMOTION DISTRIBUTION:")
    for emotion, duration in sorted(emotion_durations.items(), key=lambda x: x[1], reverse=True):
        percentage = (duration / total_duration) * 100
        print(f"{emotion:12}: {duration:5.1f}s ({percentage:5.1f}%)")
    
    return results

# Usage
if __name__ == "__main__":
    audio_file = "/Users/madhusiddharthsuthagar/Downloads/input_8.mp3"
    results = analyze_audio(audio_file)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at r-f/wav2vec-english-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing audio: 7.36 seconds
   0.0-   1.0s: sad          (conf: 0.154)
   0.5-   1.5s: sad          (conf: 0.153) [CONSISTENT]
   1.0-   2.0s: angry        (conf: 0.081) [CHANGE from sad]
   1.5-   2.5s: fear         (conf: 0.117) [CHANGE from angry]
   2.0-   3.0s: angry        (conf: 0.157) [CHANGE from fear]
   2.5-   3.5s: fear         (conf: 0.192) [CHANGE from angry]
   3.0-   4.0s: fear         (conf: 0.272) [CHANGE from fear]
   3.5-   4.5s: angry        (conf: 0.158) [CHANGE from fear]
   4.0-   5.0s: sad          (conf: 0.088) [CHANGE from angry]
   4.5-   5.5s: sad          (conf: 0.193) [CHANGE from sad]
   5.0-   6.0s: sad          (conf: 0.271) [CONSISTENT]
   5.5-   6.5s: angry        (conf: 0.079) [CHANGE from sad]
   6.0-   7.0s: sad          (conf: 0.243) [CHANGE from angry]
   6.5-   7.4s: sad          (conf: 0.272) [CHANGE from sad]

TEMPORAL EMOTION ANALYSIS

EMOTION PHASES:
   0.0-   1.5s ( 1.5s): sad          (avg conf: 0.153)
   1.0-   2.0s ( 1.0s): angry    

In [54]:
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torch
import librosa
import math
import numpy as np
from collections import Counter, deque
import warnings
warnings.filterwarnings('ignore')

class EnsembleEmotionRecognizer:
    def __init__(self, model_name="r-f/wav2vec-english-speech-emotion-recognition", num_runs=5):
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
        self.model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
        self.model.eval()
        self.num_runs = num_runs
        
        # Confidence thresholds
        self.high_confidence_threshold = 0.7
        self.medium_confidence_threshold = 0.5
    
    def is_valid_speech(self, audio_chunk, sr):
        """Enhanced speech validation"""
        rms = np.sqrt(np.mean(audio_chunk**2))
        if rms < 0.005:
            return False
        
        zcr = np.mean(librosa.feature.zero_crossing_rate(audio_chunk)[0])
        if zcr > 0.4:
            return False
        
        try:
            spectral_centroid = librosa.feature.spectral_centroid(y=audio_chunk, sr=sr)[0]
            spec_centroid_mean = np.mean(spectral_centroid)
            
            if spec_centroid_mean < 300 or spec_centroid_mean > 8000:
                return False
        except:
            pass
        
        return True
    
    def predict_single_chunk(self, audio_chunk, sr):
        """Single prediction for a chunk"""
        if not self.is_valid_speech(audio_chunk, sr):
            return None, 0.0
        
        try:
            inputs = self.feature_extractor(audio_chunk, sampling_rate=sr, return_tensors="pt", padding=True)
            
            with torch.no_grad():
                logits = self.model(**inputs).logits
            
            probs = torch.softmax(logits.squeeze(0), dim=-1)
            top_prob, predicted_id = torch.max(probs, dim=-1)
            
            emotion = self.model.config.id2label[predicted_id.item()]
            confidence = top_prob.item()
            
            return emotion, confidence
            
        except Exception as e:
            return None, 0.0
    
    def predict_chunk_ensemble(self, audio_chunk, sr):
        """Run multiple predictions and return majority vote"""
        predictions = []
        confidences = []
        
        # Run multiple predictions
        for _ in range(self.num_runs):
            emotion, confidence = self.predict_single_chunk(audio_chunk, sr)
            if emotion is not None:
                predictions.append(emotion)
                confidences.append(confidence)
        
        if not predictions:
            return None, 0.0
        
        # Majority voting
        emotion_counts = Counter(predictions)
        most_common = emotion_counts.most_common(1)[0]
        final_emotion = most_common[0]
        vote_count = most_common[1]
        
        # Calculate ensemble confidence
        # Weight by how many runs agreed and their average confidence
        emotion_indices = [i for i, e in enumerate(predictions) if e == final_emotion]
        emotion_confidences = [confidences[i] for i in emotion_indices]
        
        # Ensemble confidence = (vote_ratio * avg_confidence_of_votes)
        vote_ratio = vote_count / len(predictions)
        avg_confidence = np.mean(emotion_confidences)
        ensemble_confidence = vote_ratio * avg_confidence
        
        return final_emotion, ensemble_confidence
    
    def process_audio(self, audio_file):
        """Process audio with ensemble predictions"""
        print(f"Running ensemble emotion recognition ({self.num_runs} runs per chunk)...")
        
        # Load and preprocess audio
        waveform, rate = librosa.load(audio_file, sr=16000)
        waveform = librosa.util.normalize(waveform)
        waveform, _ = librosa.effects.trim(waveform, top_db=20)
        
        total_duration = len(waveform) / rate
        print(f"Processing audio: {total_duration:.2f} seconds")
        
        # Chunking parameters
        chunk_duration = 1.0
        overlap_duration = 0.5
        chunk_size = int(chunk_duration * rate)
        overlap_size = int(overlap_duration * rate)
        
        chunks = []
        chunk_times = []
        
        step_size = chunk_size - overlap_size
        num_chunks = math.ceil((len(waveform) - chunk_size) / step_size) + 1
        
        for i in range(num_chunks):
            start_sample = i * step_size
            end_sample = min(start_sample + chunk_size, len(waveform))
            
            if end_sample - start_sample < chunk_size // 3:
                break
            
            chunk = waveform[start_sample:end_sample]
            
            if len(chunk) < chunk_size:
                chunk = np.pad(chunk, (0, chunk_size - len(chunk)), mode='constant')
            
            chunks.append(chunk)
            chunk_times.append((start_sample / rate, min(end_sample / rate, total_duration)))
        
        # Process chunks with ensemble
        results = []
        total_chunks = len(chunks)
        
        for i, (start_time, end_time, chunk) in enumerate(zip([t[0] for t in chunk_times], 
                                                             [t[1] for t in chunk_times], 
                                                             chunks)):
            
            print(f"\rProcessing chunk {i+1}/{total_chunks}... ", end='', flush=True)
            
            emotion, confidence = self.predict_chunk_ensemble(chunk, rate)
            
            if emotion is None:
                continue
            
            result = {
                'emotion': emotion,
                'confidence': confidence,
                'start': start_time,
                'end': end_time
            }
            
            results.append(result)
        
        print("\nEnsemble processing complete!")
        return results

def analyze_audio_ensemble(audio_file, num_runs=5):
    """Main function to analyze audio with ensemble approach"""
    recognizer = EnsembleEmotionRecognizer(num_runs=num_runs)
    results = recognizer.process_audio(audio_file)
    
    if not results:
        print("No valid speech segments detected.")
        return None
    
    print("\n" + "="*60)
    print("ENSEMBLE EMOTION ANALYSIS RESULTS")
    print("="*60)
    
    # Group consecutive same emotions into phases
    emotion_phases = []
    current_phase = None
    
    for result in results:
        if current_phase is None or current_phase['emotion'] != result['emotion']:
            if current_phase is not None:
                emotion_phases.append(current_phase)
            
            current_phase = {
                'emotion': result['emotion'],
                'start': result['start'],
                'end': result['end'],
                'confidences': [result['confidence']],
            }
        else:
            current_phase['end'] = result['end']
            current_phase['confidences'].append(result['confidence'])
    
    if current_phase is not None:
        emotion_phases.append(current_phase)
    
    # Display emotion phases
    print(f"\nDetected {len(emotion_phases)} emotion phases:")
    print("-" * 60)
    
    for i, phase in enumerate(emotion_phases, 1):
        duration = phase['end'] - phase['start']
        avg_confidence = np.mean(phase['confidences'])
        
        print(f"Phase {i}: {phase['start']:6.1f}-{phase['end']:6.1f}s ({duration:4.1f}s)")
        print(f"         Emotion: {phase['emotion']:12} (avg confidence: {avg_confidence:.3f})")
        print()
    
    # Overall emotion distribution
    total_duration = results[-1]['end'] - results[0]['start']
    emotion_durations = {}
    
    for phase in emotion_phases:
        emotion = phase['emotion']
        duration = phase['end'] - phase['start']
        if emotion not in emotion_durations:
            emotion_durations[emotion] = 0
        emotion_durations[emotion] += duration
    
    print("OVERALL EMOTION DISTRIBUTION:")
    print("-" * 40)
    for emotion, duration in sorted(emotion_durations.items(), key=lambda x: x[1], reverse=True):
        percentage = (duration / total_duration) * 100
        print(f"{emotion:12}: {duration:5.1f}s ({percentage:5.1f}%)")
    
    # Summary statistics
    all_confidences = [r['confidence'] for r in results]
    print(f"\nSUMMARY STATISTICS:")
    print(f"Total analyzed duration: {total_duration:.1f}s")
    print(f"Number of emotion phases: {len(emotion_phases)}")
    print(f"Average ensemble confidence: {np.mean(all_confidences):.3f}")
    print(f"Most dominant emotion: {max(emotion_durations.items(), key=lambda x: x[1])[0]}")
    
    return {
        'phases': emotion_phases,
        'distribution': emotion_durations,
        'total_duration': total_duration,
        'avg_confidence': np.mean(all_confidences)
    }

# Usage
if __name__ == "__main__":
    audio_file = "/Users/madhusiddharthsuthagar/Downloads/input_1.wav"
    
    # Run ensemble analysis (5 predictions per chunk, majority vote)
    results = analyze_audio_ensemble(audio_file, num_runs=5)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at r-f/wav2vec-english-speech-emotion-recognition and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running ensemble emotion recognition (5 runs per chunk)...
Processing audio: 5.86 seconds
Processing chunk 11/11... 
Ensemble processing complete!

ENSEMBLE EMOTION ANALYSIS RESULTS

Detected 8 emotion phases:
------------------------------------------------------------
Phase 1:    0.0-   1.0s ( 1.0s)
         Emotion: angry        (avg confidence: 0.152)

Phase 2:    0.5-   1.5s ( 1.0s)
         Emotion: neutral      (avg confidence: 0.155)

Phase 3:    1.0-   2.0s ( 1.0s)
         Emotion: disgust      (avg confidence: 0.161)

Phase 4:    1.5-   3.0s ( 1.5s)
         Emotion: angry        (avg confidence: 0.156)

Phase 5:    2.5-   4.0s ( 1.5s)
         Emotion: disgust      (avg confidence: 0.159)

Phase 6:    3.5-   5.0s ( 1.5s)
         Emotion: happy        (avg confidence: 0.151)

Phase 7:    4.5-   5.5s ( 1.0s)
         Emotion: neutral      (avg confidence: 0.152)

Phase 8:    5.0-   5.9s ( 0.9s)
         Emotion: angry        (avg confidence: 0.155)

OVERALL EMOTION DISTRIBUT