In [27]:
from transformers import Wav2Vec2Processor, HubertModel
import soundfile as sf
import torch
from tqdm import tqdm
import numpy as np

import os  # local import: only needed to create the output directory below

# Extract one mean-pooled HuBERT-Large hidden-state vector per training file and
# save the vectors together with their speaker ids in a single .npz archive.
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
model.config.output_hidden_states = True

# Index into `outputs.hidden_states`. Per the Transformers docs, entry 0 is the
# embedding output, so index k is the output of the k-th transformer layer.
target_layer = 3
device = torch.device("cpu")
model.to(device)

# Create the output directory up front so a missing folder fails immediately
# instead of after several minutes of inference.
output_path = f'chunk/age/hubert_3s_train_{target_layer}.npz'
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

file_path = 'path_train.txt'
with open(file_path, 'r') as file:
    # Skip blank lines (e.g. a trailing newline) so we never try to read ''.
    audio_files = [line.strip() for line in file if line.strip()]

speaker_ids = []
layer_features = []

for audio_file in tqdm(audio_files):
    # Speaker id is the part of the file name before the first '-'.
    speaker_id = audio_file.split('/')[-1].split('-')[0]  # Adjust based on your file path format
    speaker_ids.append(speaker_id)

    audio_input, sample_rate = sf.read(audio_file)
    if audio_input.ndim > 1:
        # sf.read returns (frames, channels) for multi-channel files; the
        # processor expects a 1-D waveform, so downmix to mono.
        audio_input = audio_input.mean(axis=1)

    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)

    with torch.no_grad():
        outputs = model(input_values)

    all_hidden_states = outputs.hidden_states
    # Drop the batch dimension (batch size is 1) -> (frames, hidden_size).
    layer_feature = all_hidden_states[target_layer].squeeze(0).cpu().numpy()

    # Average over the time axis to get one fixed-size vector per utterance.
    pooled_feature = np.mean(layer_feature, axis=0)

    layer_features.append(pooled_feature)

layer_features = np.array(layer_features)
speaker_ids = np.array(speaker_ids)

np.savez(output_path, features=layer_features, speaker_ids=speaker_ids)


100%|██████████| 856/856 [04:49<00:00,  2.96it/s]


In [28]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import soundfile as sf
import torch
from tqdm import tqdm
import numpy as np

import os  # local import: only needed to create the output directory below
from transformers import HubertModel  # this cell's import block only brings in the Wav2Vec2 classes

# Extract one mean-pooled HuBERT-Large hidden-state vector per test file and
# save the vectors together with their speaker ids in a single .npz archive.
#
# The features are written to `hubert_3s_test_*.npz`, which pairs with the
# HuBERT train features, so they must come from the same HuBERT checkpoint.
# The cell previously loaded facebook/wav2vec2-large-960h here, which would
# store Wav2Vec2 features under a HuBERT file name.
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
model.config.output_hidden_states = True

# Index into `outputs.hidden_states`. Per the Transformers docs, entry 0 is the
# embedding output, so index k is the output of the k-th transformer layer.
target_layer = 3
device = torch.device("cpu")
model.to(device)

# Create the output directory up front so a missing folder fails immediately
# instead of after the whole inference loop has run.
output_path = f'chunk/age/hubert_3s_test_{target_layer}.npz'
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

file_path = 'path_test.txt'
with open(file_path, 'r') as file:
    # Skip blank lines (e.g. a trailing newline) so we never try to read ''.
    audio_files = [line.strip() for line in file if line.strip()]

speaker_ids = []
layer_features = []

for audio_file in tqdm(audio_files):
    # Speaker id is the part of the file name before the first '-'.
    speaker_id = audio_file.split('/')[-1].split('-')[0]  # Adjust based on your file path format
    speaker_ids.append(speaker_id)

    audio_input, sample_rate = sf.read(audio_file)
    if audio_input.ndim > 1:
        # sf.read returns (frames, channels) for multi-channel files; the
        # processor expects a 1-D waveform, so downmix to mono.
        audio_input = audio_input.mean(axis=1)

    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)

    with torch.no_grad():
        outputs = model(input_values)

    all_hidden_states = outputs.hidden_states
    # Drop the batch dimension (batch size is 1) -> (frames, hidden_size).
    layer_feature = all_hidden_states[target_layer].squeeze(0).cpu().numpy()

    # Average over the time axis to get one fixed-size vector per utterance.
    pooled_feature = np.mean(layer_feature, axis=0)

    layer_features.append(pooled_feature)

layer_features = np.array(layer_features)
speaker_ids = np.array(speaker_ids)

np.savez(output_path, features=layer_features, speaker_ids=speaker_ids)


100%|██████████| 129/129 [00:45<00:00,  2.82it/s]


In [5]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import soundfile as sf
import torch
from tqdm import tqdm
import numpy as np

import os  # local import: only needed to create the output directory below

# Extract one mean-pooled Wav2Vec2-Large hidden-state vector per training file
# and save the vectors together with their speaker ids in a single .npz archive.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
model.config.output_hidden_states = True

# Index into `outputs.hidden_states`. Per the Transformers docs, entry 0 is the
# embedding output, so index k is the output of the k-th transformer layer.
target_layer = 2
device = torch.device("cpu")
model.to(device)

# Create the output directory up front so a missing folder fails immediately
# instead of after several minutes of inference.
output_path = f'chunk/gender/wav2vec2_3s_train_{target_layer}.npz'
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

file_path = 'path_train.txt'
with open(file_path, 'r') as file:
    # Skip blank lines (e.g. a trailing newline) so we never try to read ''.
    audio_files = [line.strip() for line in file if line.strip()]

speaker_ids = []
layer_features = []

for audio_file in tqdm(audio_files):
    # Speaker id is the part of the file name before the first '-'.
    speaker_id = audio_file.split('/')[-1].split('-')[0]  # Adjust based on your file path format
    speaker_ids.append(speaker_id)

    audio_input, sample_rate = sf.read(audio_file)
    if audio_input.ndim > 1:
        # sf.read returns (frames, channels) for multi-channel files; the
        # processor expects a 1-D waveform, so downmix to mono.
        audio_input = audio_input.mean(axis=1)

    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)

    with torch.no_grad():
        outputs = model(input_values)

    all_hidden_states = outputs.hidden_states
    # Drop the batch dimension (batch size is 1) -> (frames, hidden_size).
    layer_feature = all_hidden_states[target_layer].squeeze(0).cpu().numpy()

    # Average over the time axis to get one fixed-size vector per utterance.
    pooled_feature = np.mean(layer_feature, axis=0)

    layer_features.append(pooled_feature)

layer_features = np.array(layer_features)
speaker_ids = np.array(speaker_ids)

np.savez(output_path, features=layer_features, speaker_ids=speaker_ids)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 856/856 [04:09<00:00,  3.43it/s]


In [3]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import soundfile as sf
import torch
from tqdm import tqdm
import numpy as np

import os  # local import: only needed to create the output directory below

# Extract one mean-pooled Wav2Vec2-Large hidden-state vector per test file and
# save the vectors together with their speaker ids in a single .npz archive.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
model.config.output_hidden_states = True

# Index into `outputs.hidden_states`. Per the Transformers docs, entry 0 is the
# embedding output, so index k is the output of the k-th transformer layer.
target_layer = 2
device = torch.device("cpu")
model.to(device)

# NOTE(review): this file name says "1s" while the matching train features are
# saved as "wav2vec2_3s_train_*". Confirm which chunk length path_test.txt
# pointed to when this was run; the name is kept as-is so existing consumers
# of the file keep working.
output_path = f'chunk/gender/wav2vec2_1s_test_{target_layer}.npz'
# Create the output directory up front so a missing folder fails immediately
# instead of after the whole inference loop has run.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

file_path = 'path_test.txt'
with open(file_path, 'r') as file:
    # Skip blank lines (e.g. a trailing newline) so we never try to read ''.
    audio_files = [line.strip() for line in file if line.strip()]

speaker_ids = []
layer_features = []

for audio_file in tqdm(audio_files):
    # Speaker id is the part of the file name before the first '-'.
    speaker_id = audio_file.split('/')[-1].split('-')[0]  # Adjust based on your file path format
    speaker_ids.append(speaker_id)

    audio_input, sample_rate = sf.read(audio_file)
    if audio_input.ndim > 1:
        # sf.read returns (frames, channels) for multi-channel files; the
        # processor expects a 1-D waveform, so downmix to mono.
        audio_input = audio_input.mean(axis=1)

    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)

    with torch.no_grad():
        outputs = model(input_values)

    all_hidden_states = outputs.hidden_states
    # Drop the batch dimension (batch size is 1) -> (frames, hidden_size).
    layer_feature = all_hidden_states[target_layer].squeeze(0).cpu().numpy()

    # Average over the time axis to get one fixed-size vector per utterance.
    pooled_feature = np.mean(layer_feature, axis=0)

    layer_features.append(pooled_feature)

layer_features = np.array(layer_features)
speaker_ids = np.array(speaker_ids)

np.savez(output_path, features=layer_features, speaker_ids=speaker_ids)


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 129/129 [00:22<00:00,  5.73it/s]
