In [3]:
# This file is to extract the features from the audio files into embeddings 

import os
import json
import librosa
import torch
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import numpy as np


In [1]:
import os
import torch
import librosa
import numpy as np
from tqdm import tqdm
import json
from transformers import ASTFeatureExtractor, ASTModel

# Load AST model and feature extractor
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
model = ASTModel.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

path_name = 'audio/final'
files = os.listdir(path_name)

# Hyperparameters for windowing
window_size = 2  # Window size in seconds
stride = 1  # Stride in seconds

result = {}

for file in tqdm(files):
    audio_file = os.path.join(path_name, file)
    file_id = file.split('.')[0]
    
    # Load audio file
    input_audio, sample_rate = librosa.load(audio_file, sr=16000)
    total_duration = librosa.get_duration(y=input_audio, sr=sample_rate)
    
    # Compute window parameters
    window_length = window_size * sample_rate
    stride_length = stride * sample_rate
    
    # Split audio into windows
    windows = []
    for start in range(0, len(input_audio) - window_length + 1, stride_length):
        end = start + window_length
        windows.append(input_audio[start:end])
    
    embeddings = []
    
    # Process each window
    for window in windows:
        # Extract features using AST feature extractor
        inputs = feature_extractor(
            window, 
            sampling_rate=sample_rate,
            return_tensors="pt"
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Extract features using the AST model
        with torch.no_grad():
            outputs = model(**inputs)
            
            # Get the mean-pooled embedding for this window
            # AST outputs last_hidden_state of shape (batch_size, sequence_length, hidden_size)
            window_embedding = torch.mean(outputs.last_hidden_state, dim=1).squeeze().cpu().numpy()
            embeddings.append(window_embedding)

    # Mean pool across all window embeddings for a global feature vector
    mean_pooled_embedding = np.mean(embeddings, axis=0).tolist()
    
    # Store the result for this audio file
    result[file_id] = mean_pooled_embedding

# Save the extracted features
with open('audio_features_ast.json', 'w') as f:
    json.dump(result, f, indent=4)

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 690/690 [18:52<00:00,  1.64s/it]


In [1]:
import os
import librosa
import numpy as np
from tqdm import tqdm
import json
import opensmile

# Initialize openSMILE with a desired feature set and level.
# Here we use the ComParE_2016 feature set with Functionals (which returns a single vector per signal).
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals,
)

path_name = 'audio/final'
files = os.listdir(path_name)

# Hyperparameters for windowing
window_size = 2  # Window size in seconds
stride = 1       # Stride in seconds

result = {}

for file in tqdm(files):
    audio_file = os.path.join(path_name, file)
    file_id = os.path.splitext(file)[0]  # file name without extension
    
    # Load audio file (resampled to 16kHz)
    input_audio, sample_rate = librosa.load(audio_file, sr=16000)
    total_duration = librosa.get_duration(y=input_audio, sr=sample_rate)
    
    # Calculate window and stride lengths in samples
    window_length = int(window_size * sample_rate)
    stride_length = int(stride * sample_rate)
    
    # Split audio into overlapping windows
    windows = []
    for start in range(0, len(input_audio) - window_length + 1, stride_length):
        end = start + window_length
        windows.append(input_audio[start:end])
    
    embeddings = []
    
    # Process each window with openSMILE
    for window in windows:
        # Process the window to extract features; returns a DataFrame with one row
        features_df = smile.process_signal(window, sample_rate)
        # Extract the feature vector from the DataFrame
        window_features = features_df.iloc[0].values.astype(float)
        embeddings.append(window_features)
    
    # Aggregate window embeddings (mean pooling) into a single global feature vector
    if embeddings:
        mean_pooled_embedding = np.mean(embeddings, axis=0).tolist()
    else:
        mean_pooled_embedding = None  # or handle files shorter than window_size appropriately
    
    # Store the result for this audio file
    result[file_id] = mean_pooled_embedding

# Save the extracted features to a JSON file
with open('audio_features_opensmile.json', 'w') as f:
    json.dump(result, f, indent=4)


100%|██████████| 690/690 [03:01<00:00,  3.79it/s]


In [5]:
with open('audio_features_opensmile.json', 'w') as f:
    json.dump(result, f, indent=4)

In [6]:
for key, val in result.items():
    if len(val) != 6373:
        print(key, len(val))