In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.compose import ColumnTransformer

import joblib

import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
import librosa

In [2]:
cwd = os.getcwd()

In [3]:
parent_dir = os.path.dirname(cwd)

In [4]:
trained_pipeline = joblib.load(f'{parent_dir}/FeatureSelection/SVM_FeatureStates_Rec_Only.pkl')

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
class Wav2Vec2BatchFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, model_name="facebook/wav2vec2-large-xlsr-53", batch_size=32):
        self.model_name = model_name
        self.batch_size = batch_size
        self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(self.model_name)
        self.model = Wav2Vec2Model.from_pretrained(self.model_name)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        # Extract file paths from the specified DataFrame column
        file_paths = X

        num_samples = len(file_paths)
        feature_states = torch.empty(size=(num_samples, 512), device=self.device)

        # Process files in batches
        for i in range(0, num_samples, self.batch_size):
            batch_paths = file_paths[i:i + self.batch_size]

            # Load audio files
            batch_signals = []
            for file_path in batch_paths:
                signal, _ = librosa.load(file_path, sr=16000)
                batch_signals.append(signal)

            # Process signals with wav2vec2
            inputs = self.feature_extractor(batch_signals, return_tensors="pt", sampling_rate=16000, padding=True)
            input_values = inputs.input_values.to(self.device)

            with torch.no_grad():
                outputs = self.model(input_values=input_values)

            sig_feature_state = torch.mean(outputs.extract_features, axis=1)

            feature_states[i:i + self.batch_size] = sig_feature_state

        # Convert to a DataFrame with column names corresponding to the features
        columns = list(map(str, list(range(512))))
        feature_df = pd.DataFrame(feature_states, dtype=np.float64, columns=columns)

        return feature_df

In [7]:
new_pipeline = Pipeline([
    ('features', Wav2Vec2BatchFeatureExtractor()),
    ('existing', trained_pipeline)  
])

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
audio_dir = f'{parent_dir}/Audio/'
test_audio = [f'{audio_dir}/Atrophy-0A00e04.wav', f'{audio_dir}/Laryngeal cancer-0702gr9.wav']
# test_audio = [f'{audio_dir}/Atrophy-0A00e04.wav']

In [12]:
y_pred = new_pipeline.predict(test_audio)

In [13]:
y_pred

array(['Benign'], dtype=object)