In [1]:
from speechbrain.inference.separation import SepformerSeparation as separator
import torchaudio

# Fetch the pretrained SepFormer (WHAMR!) separator; its files are stored under savedir.
model = separator.from_hparams(
    source="speechbrain/sepformer-whamr",
    savedir='pretrained_models/sepformer-whamr',
)

# Separate the mixture; point `path` at another file to process custom audio.
est_sources = model.separate_file(
    path='/Users/admin/Downloads/vox_mixtures/train/mix_0000/mixture.wav'
)

# Each index of the last axis is one estimated source; save both at 8000 Hz.
for source_idx, out_name in enumerate(("source1hat.wav", "source2hat.wav")):
    torchaudio.save(out_name, est_sources[:, :, source_idx].detach().cpu(), 8000)


  from .autonotebook import tqdm as notebook_tqdm
INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch masknet.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch encoder.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch decoder.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: masknet, e

Resampling the audio from 16000 Hz to 8000 Hz


In [2]:
from speechbrain.inference.separation import SepformerSeparation as separator
import torchaudio

# Fetch the pretrained SepFormer enhancement model; its files are stored under savedir.
# Note that this rebinds `model`, which the previous cell used for the separator.
model = separator.from_hparams(
    source="speechbrain/sepformer-wham-enhancement",
    savedir='pretrained_models/sepformer-wham-enhancement',
)

# Run the model on the first separated source; change `path` for a custom file.
est_sources = model.separate_file(path='source1hat.wav')

# Only index 0 of the last axis is written out, at 8000 Hz.
enhanced_wave = est_sources[:, :, 0].detach().cpu()
torchaudio.save("enhanced_wham.wav", enhanced_wave, 8000)


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if not cached
INFO:speechbrain.utils.fetching:Fetch encoder.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if not cached
INFO:speechbrain.utils.fetching:Fetch masknet.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if not cached
INFO:speechbrain.utils.fetching:Fetch decoder.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: encoder, masknet, decoder


In [6]:
import torch
import torch.nn as nn


class LSTMClassifier(nn.Module):
    """
    A PyTorch-based LSTM classifier for sequence data.

    The input sequence is passed through a stacked LSTM, the output of the
    last time step is kept, and two fully connected layers map it to class
    probabilities (hidden_size -> 1024 -> num_classes).

    Attributes:
        hidden_size (int): The number of features in the hidden state of the LSTM.
        num_layers (int): The number of stacked LSTM layers.
        lstm (nn.LSTM): The recurrent layer (batch_first=True).
        fc1 (nn.Sequential): Linear(hidden_size, 1024) followed by ReLU.
        fc2 (nn.Sequential): Linear(1024, num_classes) followed by Softmax.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        """
        Initializes the LSTMClassifier.

        Args:
            input_size (int): Number of features in the input sequence.
            hidden_size (int): Number of features in the hidden state.
            num_layers (int): Number of stacked LSTM layers.
            num_classes (int): Number of output classes.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # batch_first=True means input/output tensors have shape
        # (batch_size, seq_length, feature_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)

        # Projects the last hidden output to 1024 dimensions.
        self.fc1 = nn.Sequential(
            nn.Linear(hidden_size, 1024),
            nn.ReLU()
        )

        # Maps the 1024-dimensional vector to per-class probabilities.
        # NOTE: because Softmax is applied here, the module returns
        # probabilities rather than logits; a loss that expects raw logits
        # (e.g. nn.CrossEntropyLoss) would apply softmax a second time.
        self.fc2 = nn.Sequential(
            nn.Linear(1024, num_classes),
            nn.Softmax(dim=1)  # normalise along the class dimension
        )

    def forward(self, x):
        """
        Defines the forward pass of the LSTMClassifier.

        Args:
            x (Tensor): Input tensor of shape (batch_size, seq_length, input_size).

        Returns:
            Tensor: Output tensor of shape (batch_size, num_classes), representing class probabilities.
        """
        # Zero initial hidden/cell states of shape (num_layers, batch_size, hidden_size).
        # new_zeros allocates them directly with x's device AND dtype, so the
        # module also works when the model/input are not float32.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        # out holds the features of every time step: (batch_size, seq_length, hidden_size).
        out, _ = self.lstm(x, (h0, c0))

        # Keep only the last time step: (batch_size, hidden_size).
        out = out[:, -1, :]

        out = self.fc1(out)
        return self.fc2(out)

In [26]:
import os
import torch
import torchaudio
from IPython.display import Audio, display
import librosa
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler,OneHotEncoder

from speechbrain.inference.separation import SepformerSeparation as separator

# -------- DEVICE SETUP --------
# Run the PyTorch models on CUDA when it is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def zcr(data, frame_length, hop_length):
    """Return librosa's zero-crossing rate of `data` with singleton axes squeezed out."""
    crossing_rate = librosa.feature.zero_crossing_rate(
        data, frame_length=frame_length, hop_length=hop_length
    )
    return np.squeeze(crossing_rate)
def rmse(data, frame_length=2048, hop_length=512):
    """Return librosa's RMS feature of `data` with singleton axes squeezed out."""
    frame_energy = librosa.feature.rms(
        y=data, frame_length=frame_length, hop_length=hop_length
    )
    return np.squeeze(frame_energy)
def mfcc(data, sr, frame_length=2048, hop_length=512, flatten:bool=True):
    """
    Compute MFCCs of `data` and return them frame-major.

    Args:
        data: Audio samples passed to librosa as `y`.
        sr: Sample rate of `data`.
        frame_length: Analysis window size, forwarded to librosa as `n_fft`.
        hop_length: Hop between frames, forwarded to librosa.
        flatten: When True (default) return a 1-D vector (frame by frame);
            otherwise return the transposed matrix with singleton axes squeezed.

    Returns:
        np.ndarray: The MFCC features.
    """
    # frame_length/hop_length used to be accepted but ignored; they are now
    # forwarded. Their defaults (2048/512) equal librosa's own defaults, so
    # calls that rely on the defaults produce the same features as before.
    mfcc_feat = librosa.feature.mfcc(
        y=data, sr=sr, n_fft=frame_length, hop_length=hop_length
    )
    frames_first = mfcc_feat.T
    return np.ravel(frames_first) if flatten else np.squeeze(frames_first)

def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
    """Concatenate the zcr, rmse and mfcc features of `data` into one flat vector."""
    parts = [
        zcr(data, frame_length, hop_length),
        rmse(data, frame_length, hop_length),
        mfcc(data, sr, frame_length, hop_length),
    ]
    # The leading empty float array mirrors the accumulator the features are appended to.
    return np.hstack([np.array([]), *parts])
# -------- LOAD GENDER MODEL (PyTorch) --------
# weights_only=False lets torch.load unpickle arbitrary Python objects, so only
# load this file from a trusted source.
# NOTE(review): the loaded object is used as a module (.to/.eval are called on
# it), so the file presumably stores a whole pickled model — confirm that its
# class is defined before this line runs.
gender_model = torch.load("sound_model.pth", map_location=device, weights_only=False)
gender_model.to(device)
gender_model.eval()

# -------- LOAD EMOTION MODEL (TensorFlow) --------
# Keras model read from an HDF5 file in the working directory.
emotion_model = load_model('New_emotion_model.h5')
def get_predict_feat(path):
    """
    Build the scaled feature tensor the emotion model is fed with.

    Loads 2.5 s of audio starting 0.6 s into the file, extracts the
    zcr/rmse/mfcc feature vector, scales it with `scaler2` and appends a
    trailing axis.

    Args:
        path: Path of the audio file to read.

    Returns:
        np.ndarray: Array of shape (1, 2376, 1).

    Raises:
        ValueError: If the extracted vector does not contain exactly 2376
            values (for example because the clip is too short).
    """
    expected_features = 2376  # feature-vector length the reshape below requires

    d, s_rate = librosa.load(path, duration=2.5, offset=0.6)
    # Pass the actual sample rate instead of relying on two defaults matching.
    res = np.asarray(extract_features(d, sr=s_rate))

    if res.size != expected_features:
        raise ValueError(
            f"Expected {expected_features} features but extracted {res.size} "
            f"from {path!r}; the clip may be shorter than the 2.5 s window."
        )

    # Positional shape: the `newshape=` keyword of np.reshape is deprecated.
    result = res.reshape(1, expected_features)
    i_result = scaler2.transform(result)
    final_result = np.expand_dims(i_result, axis=2)

    return final_result


# Unfitted placeholders, kept only so the names `scaler`/`encoder` stay defined.
# They are deliberately NOT written to disk: dumping them used to overwrite
# 'scaler2.pickle' / 'encoder2.pickle' with unfitted objects, destroying the
# saved preprocessing state that scaler2.transform / encoder2.inverse_transform
# depend on.
scaler = StandardScaler()
encoder = OneHotEncoder()

# Load the previously saved scaler and encoder (presumably fitted during training).
# pickle.load can execute arbitrary code, so only open trusted files.
with open('scaler2.pickle', 'rb') as f:
    scaler2 = pickle.load(f)

with open('encoder2.pickle', 'rb') as f:
    encoder2 = pickle.load(f)

    
# Progress marker printed once the statements above have run.
print("Done")  
# Emotion names keyed by integer codes 1-8.
emotions1={1:'Neutral', 2:'Calm', 3:'Happy', 4:'Sad', 5:'Angry', 6:'Fear', 7:'Disgust',8:'Surprise'}
def predict_emotion(path1):
    """
    Predict the emotion label for an audio file from its extracted features.

    Args:
        path1: Path of the audio file.

    Returns:
        The label decoded by `encoder2` for the first prediction; it is also
        printed.

    NOTE: a later definition of `predict_emotion` in this same cell rebinds
    the name, so this version is only reachable before that definition runs.
    """
    res = get_predict_feat(path1)
    # Use the emotion classifier. The bare name `model` is bound to the
    # SepFormer separator in earlier cells, which is not the emotion model.
    predictions = emotion_model.predict(res)
    y_pred = encoder2.inverse_transform(predictions)
    label = y_pred[0][0]
    print(label)
    return label
# -------- HELPER FUNCTIONS --------

def preprocess_audio(file_path, sample_rate=16000, n_mfcc=20, max_frames=62):
    """
    Turn an audio file into a fixed-length MFCC tensor on `device`.

    The MFCC matrix is cut or zero-padded along the frame axis to exactly
    `max_frames` frames, then returned as a float32 tensor of shape
    (1, max_frames, n_mfcc).
    """
    audio, sr = librosa.load(file_path, sr=sample_rate)
    coeffs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    n_frames = coeffs.shape[1]
    if n_frames <= max_frames:
        # Zero-pad on the right of the frame axis.
        coeffs = np.pad(coeffs, ((0, 0), (0, max_frames - n_frames)), mode="constant")
    else:
        coeffs = coeffs[:, :max_frames]

    frames_first = torch.tensor(coeffs, dtype=torch.float32).T
    return frames_first.unsqueeze(0).to(device)

def predict_gender(file_path):
    """Classify the speaker of `file_path`: class index 0 is "Female", any other index "Male"."""
    features = preprocess_audio(file_path)
    with torch.no_grad():  # inference only, no gradients needed
        scores = gender_model(features)
    class_index = torch.argmax(scores, dim=1).item()
    if class_index == 0:
        return "Female"
    return "Male"

def load_and_prepare_audio(file_path):
    """
    Load a file as 16 kHz audio of exactly INPUT_LENGTH samples.

    The signal is resampled to 16000 Hz when needed, zero-padded at the end
    or truncated to INPUT_LENGTH samples, and returned with a leading and a
    trailing singleton axis.

    NOTE(review): INPUT_LENGTH is read from module scope and is not defined
    anywhere in the visible notebook — confirm where it comes from.
    """
    target_sr = 16000
    audio, sample_rate = librosa.load(file_path, sr=None)
    if sample_rate != target_sr:
        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=target_sr)

    missing = INPUT_LENGTH - len(audio)
    if missing > 0:
        audio = np.pad(audio, (0, missing), mode='constant')
    else:
        audio = audio[:INPUT_LENGTH]

    # Add one axis in front and one at the end.
    return audio[np.newaxis, ..., np.newaxis]

def predict_emotion(file_path):
    """
    Predict an emotion label from the prepared audio of `file_path`.

    Returns the `label_map` entry for the highest-scoring output index, or
    'Unknown' when that index has no entry.

    NOTE(review): this redefines `predict_emotion` from earlier in the cell,
    and `label_map` is not defined anywhere in the visible notebook — confirm
    where it comes from.
    """
    scores = emotion_model.predict(load_and_prepare_audio(file_path))
    best_index = np.argmax(scores)
    return label_map.get(best_index, 'Unknown')

# -------- LOAD MODELS --------
# Pretrained SepFormer models: sep_model is used to separate the mixture into
# sources, enh_model to enhance each separated source. Fetched files are stored
# under the given savedir.
sep_model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr')
enh_model = separator.from_hparams(source="speechbrain/sepformer-wham-enhancement", savedir='pretrained_models/sepformer-wham-enhancement')

# -------- PROCESS AUDIO --------
# NOTE: absolute, machine-specific path; change it to process another mixture.
input_mix = '/Users/admin/Downloads/vox_mixtures/train/mix_0000/mixture.wav'
print('🎧 Input Mixed Audio Sample:\n')
display(Audio(input_mix))
# (A stray `IPython.display.Audio(input_mix)` expression was removed here: the
# notebook never imports the name `IPython`, and a bare expression in the
# middle of a cell is not displayed anyway.)

# Separate the mixture; each index of the last axis is one estimated source.
est_sources = sep_model.separate_file(path=input_mix)
os.makedirs("temp_sep", exist_ok=True)
sep_paths = []

for i in range(est_sources.shape[2]):
    path = f"temp_sep/source_{i+1}.wav"
    # Saved at 8000 Hz, the rate the separator reports resampling its input to.
    torchaudio.save(path, est_sources[:, :, i].detach().cpu(), 8000)
    print(f'🎧 Speaker {i+1} Audio:\n')
    display(Audio(path))
    sep_paths.append(path)

# -------- ENHANCE, DETECT GENDER & EMOTION --------
os.makedirs("enhanced_outputs", exist_ok=True)
for i, src_path in enumerate(sep_paths):
    # Run the enhancement model on the separated source and keep index 0 of its output.
    enhanced_path = f"enhanced_outputs/enhanced_source{i+1}.wav"
    enhanced_sources = enh_model.separate_file(path=src_path)
    first_source = enhanced_sources[:, :, 0].detach().cpu()
    torchaudio.save(enhanced_path, first_source, 8000)

    # Both classifiers read the enhanced file from disk.
    gender = predict_gender(enhanced_path)
    emotion = predict_emotion(enhanced_path)

    print(f"🎤 Speaker {i+1}: Gender = {gender}, Emotion = {emotion}")


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached


Done


INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch masknet.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch encoder.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch decoder.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: masknet, encoder, decoder
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if not cached
INFO:speechbrain.utils.fetching:Fetch encoder.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if 

🎧 Input Mixed Audio Sample:



Resampling the audio from 16000 Hz to 8000 Hz
🎧 Speaker 1 Audio:



🎧 Speaker 2 Audio:



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step
🎤 Speaker 1: Gender = Male, Emotion = angry
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
🎤 Speaker 2: Gender = Male, Emotion = angry


In [None]:
# ------------ Improved version of the pipeline above ------------

In [31]:
import os
import torch
import torchaudio
import librosa
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from speechbrain.inference.separation import SepformerSeparation as separator
from IPython.display import Audio, display

# -------- DEVICE SETUP --------
# Run the PyTorch models on CUDA when it is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------- HELPER FUNCTIONS FOR FEATURE EXTRACTION --------
def zcr(data, frame_length, hop_length):
    """Return librosa's zero-crossing rate of `data` with singleton axes squeezed out."""
    crossing_rate = librosa.feature.zero_crossing_rate(
        data, frame_length=frame_length, hop_length=hop_length
    )
    return np.squeeze(crossing_rate)

def rmse(data, frame_length=2048, hop_length=512):
    """Return librosa's RMS feature of `data` with singleton axes squeezed out."""
    frame_energy = librosa.feature.rms(
        y=data, frame_length=frame_length, hop_length=hop_length
    )
    return np.squeeze(frame_energy)

def mfcc(data, sr, frame_length=2048, hop_length=512, flatten=True):
    """
    Compute MFCCs of `data` and return them frame-major.

    Args:
        data: Audio samples passed to librosa as `y`.
        sr: Sample rate of `data`.
        frame_length: Analysis window size, forwarded to librosa as `n_fft`.
        hop_length: Hop between frames, forwarded to librosa.
        flatten: When True (default) return a 1-D vector (frame by frame);
            otherwise return the transposed matrix with singleton axes squeezed.

    Returns:
        np.ndarray: The MFCC features.
    """
    # frame_length/hop_length used to be accepted but ignored; they are now
    # forwarded. Their defaults (2048/512) equal librosa's own defaults, so
    # calls that rely on the defaults produce the same features as before.
    mfcc_feat = librosa.feature.mfcc(
        y=data, sr=sr, n_fft=frame_length, hop_length=hop_length
    )
    frames_first = mfcc_feat.T
    return np.ravel(frames_first) if flatten else np.squeeze(frames_first)

def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
    """Concatenate the zcr, rmse and mfcc features of `data` into one flat vector."""
    crossing_part = zcr(data, frame_length, hop_length)
    energy_part = rmse(data, frame_length, hop_length)
    cepstral_part = mfcc(data, sr, frame_length, hop_length)
    return np.hstack([crossing_part, energy_part, cepstral_part])

# -------- LOAD MODELS --------
# weights_only=False lets torch.load unpickle arbitrary Python objects, so only
# load this file from a trusted source.
# NOTE(review): the loaded object is used as a module (.to/.eval are called on
# it), so the file presumably stores a whole pickled model — confirm that its
# class is defined before this line runs.
gender_model = torch.load("sound_model.pth", map_location=device, weights_only=False)
gender_model.to(device)
gender_model.eval()

# Keras emotion model read from an HDF5 file in the working directory.
emotion_model = load_model('New_emotion_model.h5')

# Load the saved feature scaler. pickle.load can execute arbitrary code, so only
# open trusted files.
with open('scaler2.pickle', 'rb') as f:
    scaler2 = pickle.load(f)

# Load the saved label encoder. It is not referenced by any function defined
# in this cell; labels are looked up in emotions1 instead.
with open('encoder2.pickle', 'rb') as f:
    encoder2 = pickle.load(f)

# Emotion names keyed by integer codes 1-8.
emotions1 = {1: 'Neutral', 2: 'Calm', 3: 'Happy', 4: 'Sad', 5: 'Angry', 6: 'Fear', 7: 'Disgust', 8: 'Surprise'}

# -------- FEATURE PREP FOR EMOTION --------
def get_predict_feat(path):
    """
    Build the scaled feature tensor the emotion model is fed with.

    Reads 2.5 s of audio starting 0.6 s into the file, extracts the feature
    vector, reshapes it to one row of 2376 values, scales it with `scaler2`
    and appends a trailing axis, giving shape (1, 2376, 1).
    """
    signal, _sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    feature_row = np.array(extract_features(signal)).reshape(1, 2376)
    scaled_row = scaler2.transform(feature_row)
    return np.expand_dims(scaled_row, axis=2)

# -------- EMOTION PREDICTION --------
def predict_emotion(path):
    """
    Predict the emotion name for the audio file at `path`.

    Returns the `emotions1` entry for the highest-scoring output column, or
    "Unknown" when no entry exists.

    NOTE(review): this assumes the model's output columns follow the numeric
    order of `emotions1`; the loaded `encoder2` is not consulted — confirm the
    column order against the encoder's categories.
    """
    class_scores = emotion_model.predict(get_predict_feat(path))
    best_column = np.argmax(class_scores, axis=1)[0]
    # emotions1 keys start at 1 while column indices start at 0.
    return emotions1.get(best_column + 1, "Unknown")

# -------- GENDER PREDICTION --------
def preprocess_audio(file_path, sample_rate=16000, n_mfcc=20, max_frames=62):
    """
    Turn an audio file into a fixed-length MFCC tensor on `device`.

    The MFCC matrix is cut or zero-padded along the frame axis to exactly
    `max_frames` frames, then returned as a float32 tensor of shape
    (1, max_frames, n_mfcc).
    """
    audio, sr = librosa.load(file_path, sr=sample_rate)
    coeffs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    n_frames = coeffs.shape[1]
    if n_frames > max_frames:
        coeffs = coeffs[:, :max_frames]
    else:
        # Zero-pad on the right of the frame axis (a no-op when already exact).
        coeffs = np.pad(coeffs, ((0, 0), (0, max_frames - n_frames)), mode="constant")

    frames_first = torch.tensor(coeffs, dtype=torch.float32).T
    return frames_first.unsqueeze(0).to(device)

def predict_gender(file_path):
    """Classify the speaker of `file_path`: class index 0 is "Female", any other index "Male"."""
    features = preprocess_audio(file_path)
    with torch.no_grad():  # inference only, no gradients needed
        scores = gender_model(features)
    class_index = torch.argmax(scores, dim=1).item()
    if class_index == 0:
        return "Female"
    return "Male"

# -------- LOAD SEPARATION & ENHANCEMENT MODELS --------
# Pretrained SepFormer models: sep_model is used to separate the mixture into
# sources, enh_model to enhance each separated source. Fetched files are stored
# under the given savedir.
sep_model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr')
enh_model = separator.from_hparams(source="speechbrain/sepformer-wham-enhancement", savedir='pretrained_models/sepformer-wham-enhancement')

# -------- PROCESS AUDIO FILE --------
def process_mixture(mix_path, out_dir="temp_sep", sample_rate=8000):
    """
    Separate a mixture into individual sources and save each one as a wav file.

    The mixture and every separated source are also displayed as audio
    players in the notebook.

    Args:
        mix_path: Path of the mixed audio file.
        out_dir: Directory the separated sources are written to (created if
            missing). Defaults to "temp_sep".
        sample_rate: Sample rate used when saving the sources. Defaults to
            8000, the rate the separator reports resampling its input to.

    Returns:
        list[str]: Paths of the saved sources, "<out_dir>/source_<k>.wav"
        with k starting at 1.
    """
    print('🎧 Input Mixed Audio Sample:\n')
    display(Audio(mix_path))

    est_sources = sep_model.separate_file(path=mix_path)
    os.makedirs(out_dir, exist_ok=True)
    sep_paths = []

    # Each index of the last axis of est_sources is one estimated source.
    for i in range(est_sources.shape[2]):
        sep_path = f"{out_dir}/source_{i+1}.wav"
        torchaudio.save(sep_path, est_sources[:, :, i].detach().cpu(), sample_rate)
        print(f'🎧 Speaker {i+1} Audio:\n')
        display(Audio(sep_path))
        sep_paths.append(sep_path)

    return sep_paths

def enhance_and_predict(paths):
    """
    Enhance every separated source, then print its predicted gender and emotion.

    Each enhanced signal is written to
    "enhanced_outputs/enhanced_source<k>.wav" (k starting at 1) at 8000 Hz,
    and both classifiers read that file. Nothing is returned.
    """
    os.makedirs("enhanced_outputs", exist_ok=True)
    for i, src_path in enumerate(paths):
        enhanced_path = f"enhanced_outputs/enhanced_source{i+1}.wav"
        # Keep index 0 of the enhancement model's output.
        first_source = enh_model.separate_file(path=src_path)[:, :, 0].detach().cpu()
        torchaudio.save(enhanced_path, first_source, 8000)

        gender, emotion = predict_gender(enhanced_path), predict_emotion(enhanced_path)
        print(f"🎤 Speaker {i+1}: Gender = {gender}, Emotion = {emotion}")

# -------- RUN THE PIPELINE --------
# NOTE: absolute, machine-specific path; change it to process another mixture.
input_mix = '/Users/admin/Downloads/vox_mixtures/train/mix_0000/mixture.wav'
# Separate the mixture, then enhance and classify every separated source.
sep_paths = process_mixture(input_mix)
enhance_and_predict(sep_paths)


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch masknet.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch encoder.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.fetching:Fetch decoder.ckpt: Fetching from HuggingFace Hub 'speechbrain/sepformer-whamr' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: masknet, encoder, decoder
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/sepformer-wham-enhancement' if not cac

🎧 Input Mixed Audio Sample:



Resampling the audio from 16000 Hz to 8000 Hz
🎧 Speaker 1 Audio:



🎧 Speaker 2 Audio:



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
🎤 Speaker 1: Gender = Male, Emotion = Fear
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
🎤 Speaker 2: Gender = Male, Emotion = Happy


In [37]:
import os
import torch
import torchaudio
import librosa
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from speechbrain.inference.separation import SepformerSeparation as separator
from speechbrain.pretrained import SpeakerRecognition
from IPython.display import Audio, display
import glob

# -------- DEVICE SETUP --------
# Run the PyTorch models on CUDA when it is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------- HELPER FUNCTIONS FOR FEATURE EXTRACTION --------
def zcr(data, frame_length, hop_length):
    """Return librosa's zero-crossing rate of `data` with singleton axes squeezed out."""
    crossing_rate = librosa.feature.zero_crossing_rate(
        data, frame_length=frame_length, hop_length=hop_length
    )
    return np.squeeze(crossing_rate)

def rmse(data, frame_length=2048, hop_length=512):
    """Return librosa's RMS feature of `data` with singleton axes squeezed out."""
    frame_energy = librosa.feature.rms(
        y=data, frame_length=frame_length, hop_length=hop_length
    )
    return np.squeeze(frame_energy)

def mfcc(data, sr, frame_length=2048, hop_length=512, flatten=True):
    """
    Compute MFCCs of `data` and return them frame-major.

    Args:
        data: Audio samples passed to librosa as `y`.
        sr: Sample rate of `data`.
        frame_length: Analysis window size, forwarded to librosa as `n_fft`.
        hop_length: Hop between frames, forwarded to librosa.
        flatten: When True (default) return a 1-D vector (frame by frame);
            otherwise return the transposed matrix with singleton axes squeezed.

    Returns:
        np.ndarray: The MFCC features.
    """
    # frame_length/hop_length used to be accepted but ignored; they are now
    # forwarded. Their defaults (2048/512) equal librosa's own defaults, so
    # calls that rely on the defaults produce the same features as before.
    mfcc_feat = librosa.feature.mfcc(
        y=data, sr=sr, n_fft=frame_length, hop_length=hop_length
    )
    frames_first = mfcc_feat.T
    return np.ravel(frames_first) if flatten else np.squeeze(frames_first)

def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
    """Concatenate the zcr, rmse and mfcc features of `data` into one flat vector."""
    crossing_part = zcr(data, frame_length, hop_length)
    energy_part = rmse(data, frame_length, hop_length)
    cepstral_part = mfcc(data, sr, frame_length, hop_length)
    return np.hstack([crossing_part, energy_part, cepstral_part])

# -------- LOAD MODELS --------
# weights_only=False lets torch.load unpickle arbitrary Python objects, so only
# load this file from a trusted source.
# NOTE(review): the loaded object is used as a module (.to/.eval are called on
# it), so the file presumably stores a whole pickled model — confirm that its
# class is defined before this line runs.
gender_model = torch.load("sound_model.pth", map_location=device, weights_only=False)
gender_model.to(device)
gender_model.eval()

# Keras emotion model read from an HDF5 file in the working directory.
emotion_model = load_model('New_emotion_model.h5')

# Load the saved feature scaler. pickle.load can execute arbitrary code, so only
# open trusted files.
with open('scaler2.pickle', 'rb') as f:
    scaler2 = pickle.load(f)

# Load the saved label encoder. It is not referenced by any function defined
# in this cell; labels are looked up in emotions1 instead.
with open('encoder2.pickle', 'rb') as f:
    encoder2 = pickle.load(f)

# Emotion names keyed by integer codes 1-8.
emotions1 = {1: 'Neutral', 2: 'Calm', 3: 'Happy', 4: 'Sad', 5: 'Angry', 6: 'Fear', 7: 'Disgust', 8: 'Surprise'}

# -------- FEATURE PREP FOR EMOTION --------
def get_predict_feat(path):
    """
    Build the scaled feature tensor the emotion model is fed with.

    Reads 2.5 s of audio starting 0.6 s into the file, extracts the feature
    vector, reshapes it to one row of 2376 values, scales it with `scaler2`
    and appends a trailing axis, giving shape (1, 2376, 1).
    """
    signal, _sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    feature_row = np.array(extract_features(signal)).reshape(1, 2376)
    scaled_row = scaler2.transform(feature_row)
    return np.expand_dims(scaled_row, axis=2)

# -------- EMOTION PREDICTION --------
def predict_emotion(path):
    """
    Predict the emotion name for the audio file at `path`.

    Returns the `emotions1` entry for the highest-scoring output column, or
    "Unknown" when no entry exists.

    NOTE(review): this assumes the model's output columns follow the numeric
    order of `emotions1`; the loaded `encoder2` is not consulted — confirm the
    column order against the encoder's categories.
    """
    class_scores = emotion_model.predict(get_predict_feat(path))
    best_column = np.argmax(class_scores, axis=1)[0]
    # emotions1 keys start at 1 while column indices start at 0.
    return emotions1.get(best_column + 1, "Unknown")

# -------- GENDER PREDICTION --------
def preprocess_audio(file_path, sample_rate=16000, n_mfcc=20, max_frames=62):
    """
    Turn an audio file into a fixed-length MFCC tensor on `device`.

    The MFCC matrix is cut or zero-padded along the frame axis to exactly
    `max_frames` frames, then returned as a float32 tensor of shape
    (1, max_frames, n_mfcc).
    """
    audio, sr = librosa.load(file_path, sr=sample_rate)
    coeffs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    n_frames = coeffs.shape[1]
    if n_frames > max_frames:
        coeffs = coeffs[:, :max_frames]
    else:
        # Zero-pad on the right of the frame axis (a no-op when already exact).
        coeffs = np.pad(coeffs, ((0, 0), (0, max_frames - n_frames)), mode="constant")

    frames_first = torch.tensor(coeffs, dtype=torch.float32).T
    return frames_first.unsqueeze(0).to(device)

def predict_gender(file_path):
    """Classify the speaker of `file_path`: class index 0 is "Female", any other index "Male"."""
    features = preprocess_audio(file_path)
    with torch.no_grad():  # inference only, no gradients needed
        scores = gender_model(features)
    class_index = torch.argmax(scores, dim=1).item()
    if class_index == 0:
        return "Female"
    return "Male"

# -------- SPEAKER MATCHING --------
# Pretrained ECAPA speaker-verification model; fetched files are stored under savedir.
# NOTE(review): SpeakerRecognition is imported from `speechbrain.pretrained`
# at the top of this cell while the separator comes from
# `speechbrain.inference`; the verifier presumably also lives under
# `speechbrain.inference.speaker` in current SpeechBrain — confirm and align.
speaker_verifier = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb"
)

def get_matching_speaker(source_path, reference_folder, threshold=0.20):
    """
    Compare one audio file against every reference wav and report the best match.

    Args:
        source_path: Path of the audio file to identify.
        reference_folder: Folder whose "*.wav" files are the reference speakers.
        threshold: Score the best reference must exceed to count as a match.

    Returns:
        str: A message naming the best-scoring reference when its score is
        above `threshold`, otherwise a "No match found" message (including
        when the folder holds no reference wav files).
    """
    # glob returns files in arbitrary order; sort so the comparison order and
    # the winner of an exact tie are reproducible between runs.
    ref_files = sorted(glob.glob(os.path.join(reference_folder, "*.wav")))
    if not ref_files:
        # Nothing to compare against; avoid reporting a placeholder score.
        return f"❌ No match found (no reference .wav files in {reference_folder})"

    best_match = None
    best_score = float("-inf")
    for ref_file in ref_files:
        score, _ = speaker_verifier.verify_files(ref_file, source_path)
        score_value = score.item()  # convert the tensor to a float once
        ref_name = os.path.basename(ref_file)
        print(f"🔎 Comparing with {ref_name} — Score: {score_value:.4f}")
        if score_value > best_score:
            best_score = score_value
            best_match = ref_name

    if best_score > threshold:
        return f"✅ Matched with {best_match} (Score: {best_score:.4f})"
    return f"❌ No match found (Highest Score: {best_score:.4f})"

# -------- LOAD SEPARATION & ENHANCEMENT MODELS --------
# Pretrained SepFormer models: sep_model is used to separate the mixture into
# sources, enh_model to enhance each separated source. Fetched files are stored
# under the given savedir.
sep_model = separator.from_hparams(source="speechbrain/sepformer-whamr", savedir='pretrained_models/sepformer-whamr')
enh_model = separator.from_hparams(source="speechbrain/sepformer-wham-enhancement", savedir='pretrained_models/sepformer-wham-enhancement')

# -------- PROCESS AUDIO FILE --------
def process_mixture(mix_path, out_dir="temp_sep", sample_rate=8000):
    """
    Separate a mixture into individual sources and save each one as a wav file.

    The mixture and every separated source are also displayed as audio
    players in the notebook.

    Args:
        mix_path: Path of the mixed audio file.
        out_dir: Directory the separated sources are written to (created if
            missing). Defaults to "temp_sep".
        sample_rate: Sample rate used when saving the sources. Defaults to
            8000, the rate the separator reports resampling its input to.

    Returns:
        list[str]: Paths of the saved sources, "<out_dir>/source_<k>.wav"
        with k starting at 1.
    """
    print('🎧 Input Mixed Audio Sample:\n')
    display(Audio(mix_path))

    est_sources = sep_model.separate_file(path=mix_path)
    os.makedirs(out_dir, exist_ok=True)
    sep_paths = []

    # Each index of the last axis of est_sources is one estimated source.
    for i in range(est_sources.shape[2]):
        sep_path = f"{out_dir}/source_{i+1}.wav"
        torchaudio.save(sep_path, est_sources[:, :, i].detach().cpu(), sample_rate)
        print(f'🎧 Speaker {i+1} Audio:\n')
        display(Audio(sep_path))
        sep_paths.append(sep_path)

    return sep_paths

# -------- ENHANCE, PREDICT, MATCH --------
def enhance_and_predict(paths, reference_folder="reference_speakers"):
    """
    Enhance every separated source, then print its gender, emotion and speaker match.

    Each enhanced signal is written to
    "enhanced_outputs/enhanced_source<k>.wav" (k starting at 1) at 8000 Hz;
    the classifiers and the speaker matcher all read that file. The matcher
    compares it against the wav files in `reference_folder`. Nothing is
    returned.
    """
    os.makedirs("enhanced_outputs", exist_ok=True)
    for i, src_path in enumerate(paths):
        enhanced_path = f"enhanced_outputs/enhanced_source{i+1}.wav"
        # Keep index 0 of the enhancement model's output.
        first_source = enh_model.separate_file(path=src_path)[:, :, 0].detach().cpu()
        torchaudio.save(enhanced_path, first_source, 8000)

        # Run the three analyses in a fixed order; each may print progress.
        gender = predict_gender(enhanced_path)
        emotion = predict_emotion(enhanced_path)
        match_result = get_matching_speaker(enhanced_path, reference_folder)

        print(f"\n🎤 Speaker {i+1} Analysis:")
        print(f"   - Gender: {gender}")
        print(f"   - Emotion: {emotion}")
        print(f"   - Match: {match_result}\n")

# -------- RUN THE PIPELINE --------
# NOTE: absolute, machine-specific path; change it to process another mixture.
input_mix = '/Users/admin/Downloads/vox_mixtures/train/mix_0000/mixture.wav'
reference_speaker_folder = 'reference_speakers'  # folder of reference *.wav files; update this path if needed

# Separate the mixture, then enhance, classify and speaker-match every source.
sep_paths = process_mixture(input_mix)
enhance_and_predict(sep_paths, reference_folder=reference_speaker_folder)


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
INFO:speechbrain.utils.fetching:Fetch hyperpara

🎧 Input Mixed Audio Sample:



Resampling the audio from 16000 Hz to 8000 Hz
🎧 Speaker 1 Audio:



🎧 Speaker 2 Audio:



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step
🔎 Comparing with Audio_4.wav — Score: 0.0702
🔎 Comparing with Audio_2.wav — Score: 0.3178
🔎 Comparing with Audio_3.wav — Score: 0.0267
🔎 Comparing with Audio_1.wav — Score: 0.3059

🎤 Speaker 1 Analysis:
   - Gender: Male
   - Emotion: Fear
   - Match: ✅ Matched with Audio_2.wav (Score: 0.3178)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
🔎 Comparing with Audio_4.wav — Score: 0.1229
🔎 Comparing with Audio_2.wav — Score: 0.2991
🔎 Comparing with Audio_3.wav — Score: 0.0830
🔎 Comparing with Audio_1.wav — Score: 0.2875

🎤 Speaker 2 Analysis:
   - Gender: Male
   - Emotion: Happy
   - Match: ✅ Matched with Audio_2.wav (Score: 0.2991)

