In [1]:
import librosa
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import numpy as np
import pandas as pd

import os

In [2]:
cwd = os.getcwd()
parent_dir = os.path.dirname(cwd)

In [3]:
signals = []
file_list=[]

source_dir = f'{parent_dir}/Audio/'

for subdir, dirs, files in os.walk(source_dir):
  for file in files:
    if file.endswith('.wav'):
        y, sr = librosa.load(subdir+'/'+file, sr=16000)
        if len(y) < 16000:
          y=np.pad(y, (0, 16000-len(y)), 'constant', constant_values=0)
        
        signals.append(y)
        file_list.append(file)

In [4]:
model_name = "facebook/wav2vec2-large-xlsr-53"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2Model.from_pretrained(model_name)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [5]:
# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (1-4): 4 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2LayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=1024, bias=True)
    (dropout)

In [6]:
# Pre-allocate memory
feature_states = torch.empty(size=(len(signals), 512), device=device)
hidden_states = torch.empty(size=(len(signals), 1024), device=device)

j=0
print('starting feature extraction...')
# Process signals in batches
batch_size = 32  # Adjust batch size as needed
for i in range(0, len(signals), batch_size):
    j+=1
    print(f'{j} of {len(range(0, len(signals), batch_size))}')
    batch_signals = signals[i:i+batch_size]

    inputs = feature_extractor(batch_signals, return_tensors="pt", sampling_rate=16000, padding=True, device=device)
    input_values = inputs.input_values.to(device)

    with torch.no_grad():
        outputs = model(input_values=input_values)

    sig_feature_state = torch.mean(outputs.extract_features, axis=1)
    
    feature_states[i:i+batch_size] = sig_feature_state

starting feature extraction...
1 of 63


model.safetensors:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

2 of 63
3 of 63
4 of 63
5 of 63
6 of 63
7 of 63
8 of 63
9 of 63
10 of 63
11 of 63
12 of 63
13 of 63
14 of 63
15 of 63
16 of 63
17 of 63
18 of 63
19 of 63
20 of 63
21 of 63
22 of 63
23 of 63
24 of 63
25 of 63
26 of 63
27 of 63
28 of 63
29 of 63
30 of 63
31 of 63
32 of 63
33 of 63
34 of 63
35 of 63
36 of 63
37 of 63
38 of 63
39 of 63
40 of 63
41 of 63
42 of 63
43 of 63
44 of 63
45 of 63
46 of 63
47 of 63
48 of 63
49 of 63
50 of 63
51 of 63
52 of 63
53 of 63
54 of 63
55 of 63
56 of 63
57 of 63
58 of 63
59 of 63
60 of 63
61 of 63
62 of 63
63 of 63


In [7]:
print(feature_states.shape)

torch.Size([2000, 512])


In [8]:
df = pd.DataFrame(feature_states.numpy())
df['file']=file_list

In [9]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,file
0,0.198858,-0.671096,-0.5538,0.430372,0.689427,0.451664,0.513925,-0.669328,3.029443,-0.140925,...,-0.51932,-0.467423,0.029339,0.75236,-0.57808,1.039529,0.202896,1.36336,0.883992,Atrophy-00002mg.wav
1,0.545076,-0.047678,-1.163094,0.268825,0.509433,-0.136339,0.201685,-0.742711,-0.094478,0.041153,...,-1.307732,1.091318,-0.360932,0.273519,-0.935269,-0.079642,-0.262874,-0.492179,0.851082,Atrophy-0001297.wav
2,0.298088,-0.969536,-1.34267,0.438397,-0.010687,0.756729,0.694833,2.41233,0.193468,2.678017,...,-0.375085,-1.58867,-1.013133,0.818641,4.83507,-0.30983,0.503137,1.045043,0.54652,Atrophy-0001apo.wav
3,0.29754,-0.991699,2.728217,-0.110462,0.275403,0.368062,-0.597716,-0.435606,6.930978,-0.512924,...,-0.504315,-1.465303,-1.550031,0.370241,0.287871,1.318348,0.779946,2.578025,0.35936,Atrophy-0001qd3.wav
4,0.569071,-0.708247,-0.53242,0.829588,0.291864,-0.042466,-0.483136,-1.057416,2.885131,0.608033,...,-0.854743,1.912285,1.891305,0.523399,-0.637222,0.400236,0.318714,-0.948447,0.026111,Atrophy-0002ipt.wav


In [15]:
df.to_csv(f'{parent_dir}/Audio Features/FeatureStates_FEMH.csv')