In [1]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import soundfile as sf
import torch
from tqdm import tqdm
import kaldiio
import numpy as np

# Pretrained wav2vec 2.0 checkpoint shared by the processor and the encoder.
checkpoint = "facebook/wav2vec2-large-960h-lv60-self"

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Wav2Vec2Processor.from_pretrained(checkpoint)

# Load the bare encoder, configure it to return the hidden states of every
# layer on each forward pass, and move its weights to the chosen device.
model = Wav2Vec2Model.from_pretrained(checkpoint)
model.config.output_hidden_states = True
model.to(device)

# Index into `outputs.hidden_states` of the layer whose activations are pooled.
# NOTE(review): per the Transformers docs, index 0 is the embedding output, so
# 1 would be the first transformer layer - confirm this is the intended layer.
target_layer = 1

# Text file listing one audio path per line.
file_path = 'path_test.txt'
with open(file_path, 'r') as file:
    stripped_lines = [line.strip() for line in file]

# Drop blank lines (e.g. a stray empty line at the end of the list); an empty
# path would otherwise only fail later, inside the extraction loop.
audio_files = [path for path in stripped_lines if path]

# Parallel lists: speaker_ids[i] is the label that belongs to layer_features[i].
speaker_ids = []
layer_features = []

for audio_file in tqdm(audio_files):
    # Speaker ID is the part of the file name before the first '-'.
    speaker_id = audio_file.split('/')[-1].split('-')[0]  # adjust based on your file path format

    audio_input, sample_rate = sf.read(audio_file)
    if audio_input.ndim > 1:
        # Multi-channel files come back as (frames, channels). The processor
        # would interpret a 2-D array as a batch of clips, so downmix to mono.
        audio_input = audio_input.mean(axis=1)

    # The processor validates the sampling rate against the one the checkpoint
    # expects, so a mismatching file fails loudly here instead of silently
    # producing wrong features.
    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)

    with torch.no_grad():
        # Request hidden states explicitly so this loop does not depend on the
        # model config having been mutated elsewhere.
        outputs = model(input_values, output_hidden_states=True)

    all_hidden_states = outputs.hidden_states
    # Drop the batch axis of the selected layer and move it to host memory.
    layer_feature = all_hidden_states[target_layer].squeeze(0).cpu().numpy()

    # Average over axis 0 (presumably time frames) to get one fixed-size
    # vector per utterance.
    pooled_feature = np.mean(layer_feature, axis=0)

    # Append both together, only after the file was processed successfully, so
    # the two lists can never get out of step.
    speaker_ids.append(speaker_id)
    layer_features.append(pooled_feature)


# Persist the pooled embeddings and their speaker labels as two parallel
# arrays; row i of the feature matrix belongs to entry i of the label array.
pooled_matrix = np.array(layer_features)
speaker_array = np.array(speaker_ids)

np.save('pooled_features.npy', pooled_matrix)
np.save('speaker_ids.npy', speaker_array)


Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2Model: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [07:03<00:00, 84.74s/it]


In [2]:
# Sanity check: report the shape of the first pooled vector, then of every
# pooled vector, to confirm they all share the same dimensionality.
print(f"Feature size: {layer_features[0].shape}")

for idx in range(len(layer_features)):
    feature = layer_features[idx]
    print(f"Feature size for sample {idx}: {feature.shape}")


Feature size: (1024,)
Feature size for sample 0: (1024,)
Feature size for sample 1: (1024,)
Feature size for sample 2: (1024,)
Feature size for sample 3: (1024,)
Feature size for sample 4: (1024,)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load the pre-computed train/test feature matrices and their labels.
# NOTE(review): these file names differ from the ones written by the
# extraction cell ('pooled_features.npy' / 'speaker_ids.npy') - confirm how
# the train/test files are produced.
train_features, train_labels = np.load('train_features.npy'), np.load('train_labels.npy')
test_features, test_labels = np.load('test_features.npy'), np.load('test_labels.npy')

# Learn the label -> integer mapping from the training labels only, then apply
# the same mapping to both splits.
label_encoder = LabelEncoder().fit(train_labels)
train_labels_encoded = label_encoder.transform(train_labels)
test_labels_encoded = label_encoder.transform(test_labels)


In [None]:
from sklearn.svm import SVC

# RBF-kernel SVM trained on the pooled features with integer-encoded labels.
svm_model = SVC(kernel='rbf', C=10.0, gamma='scale', random_state=42)

svm_model.fit(train_features, train_labels_encoded)

svm_predictions = svm_model.predict(test_features)

svm_accuracy = accuracy_score(test_labels_encoded, svm_predictions)
print(f"SVM Accuracy: {svm_accuracy:.4f}")
print("SVM Classification Report:")

# Pass the full list of encoded class indices explicitly. Otherwise the report
# infers the classes from the test labels and predictions, and raises when
# that set is smaller than `target_names` (e.g. a speaker absent from the test
# split). Names are cast to str because the report formats them as text.
class_indices = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]
print(classification_report(
    test_labels_encoded,
    svm_predictions,
    labels=class_indices,
    target_names=class_names,
    zero_division=0,  # classes with no support/predictions score 0 without warnings
))


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# Conv1D expects (samples, steps, channels). The stored features are 2-D
# (samples, feature_dim), so add a trailing channel axis of size 1. Defining
# the arrays here keeps the cell runnable on a fresh kernel.
train_features_cnn = train_features[..., np.newaxis]
test_features_cnn = test_features[..., np.newaxis]

# 1-D CNN that treats each pooled feature vector as a single-channel sequence:
# two conv + max-pool stages, then a dense classifier head over the speakers.
cnn_model = Sequential([
    Conv1D(256, kernel_size=5, activation='relu', input_shape=(train_features_cnn.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(512, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Labels are integer-encoded, hence the sparse variant of the cross-entropy.
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): Keras documents `validation_split` as taking the last fraction
# of the samples before shuffling; if the training arrays are ordered by
# speaker, the validation set may contain speakers unseen in training - confirm.
cnn_model.fit(train_features_cnn, train_labels_encoded, epochs=15, batch_size=32, validation_split=0.2)

cnn_test_loss, cnn_test_accuracy = cnn_model.evaluate(test_features_cnn, test_labels_encoded)
print(f"CNN Test Accuracy: {cnn_test_accuracy:.4f}")


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Fully connected classifier over the pooled feature vectors: three hidden
# ReLU layers (512, 512, 256 units), each followed by 50% dropout, and a
# softmax output with one unit per encoded speaker class.
feature_dim = train_features.shape[1]
num_classes = len(label_encoder.classes_)

ann_model = Sequential()
ann_model.add(Dense(512, activation='relu', input_shape=(feature_dim,)))
ann_model.add(Dropout(0.5))
ann_model.add(Dense(512, activation='relu'))
ann_model.add(Dropout(0.5))
ann_model.add(Dense(256, activation='relu'))
ann_model.add(Dropout(0.5))
ann_model.add(Dense(num_classes, activation='softmax'))

# Labels are integer-encoded, hence the sparse variant of the cross-entropy.
ann_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 15 epochs, holding out 20% of the training data for validation.
ann_model.fit(
    train_features,
    train_labels_encoded,
    epochs=15,
    batch_size=32,
    validation_split=0.2,
)

# `evaluate` returns the loss followed by the compiled metrics (accuracy here).
ann_metrics = ann_model.evaluate(test_features, test_labels_encoded)
ann_test_loss, ann_test_accuracy = ann_metrics
print(f"ANN Test Accuracy: {ann_test_accuracy:.4f}")
