In [1]:
import pandas as pd
import numpy as np
from transformers import AutoFeatureExtractor
import librosa
import soundfile as sf
import resampy
from pydub import AudioSegment
from tqdm.notebook import tqdm


In [2]:
# Load the utterance table that the rest of the notebook works on.
# NOTE(review): unpickling executes arbitrary code, so only load this file
# if it comes from a trusted source.
df = pd.read_pickle(filepath_or_buffer='AnnoMI-full-with-audio-cleaned-text.pkl')

In [3]:
# Feature extractor shared by compute_embeddings(); it is configured for
# 16 kHz input and asked to return an attention mask.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    "facebook/wav2vec2-large",
    sampling_rate=16000,
    return_attention_mask=True,
)

In [4]:
# audio, _ = librosa.load('current_wav2vec.wav', sr=16000)
    
    
# inputs = feature_extractor(audio, sampling_rate=feature_extractor.sampling_rate, max_length= 16000, truncation=True)

# inputs['input_values']

# np.array(inputs['input_values']).shape

In [5]:
def resample_audio(filename, target_sr):
    """Resample an audio file to ``target_sr`` and overwrite it in place.

    Parameters
    ----------
    filename : str
        Path of the audio file; it is both read and rewritten.
    target_sr : int
        Sample rate (Hz) the file should have afterwards.
    """
    # sr=None keeps the file's own sample rate instead of librosa's default.
    waveform, native_sr = librosa.load(filename, sr=None)
    converted = resampy.resample(waveform, sr_orig=native_sr, sr_new=target_sr)
    # Same path as the input: the original file is replaced.
    sf.write(filename, converted, target_sr)

def compute_embeddings(audio_list, source_sr=44100, target_sr=16000,
                       wav_path='current_wav2vec.wav', verbose=True):
    """Run one utterance's audio through the module-level ``feature_extractor``.

    The samples are written to ``wav_path``, resampled on disk to
    ``target_sr`` by ``resample_audio``, loaded back and handed to
    ``feature_extractor``. An empty utterance is replaced by one second of
    silence so that a result is still produced for it.

    NOTE(review): despite the name, the return value is the feature
    extractor's output (the scratch cell in this notebook reads
    ``inputs['input_values']`` from it), not the hidden states of a model.
    Confirm this is what downstream code expects.

    Parameters
    ----------
    audio_list : array-like or None
        Raw samples of the utterance. Assumed to be recorded at
        ``source_sr`` -- TODO confirm against the code that built the column.
    source_sr : int, default 44100
        Sample rate (Hz) the samples in ``audio_list`` are written with.
    target_sr : int, default 16000
        Sample rate (Hz) the audio is resampled to before feature extraction.
    wav_path : str, default 'current_wav2vec.wav'
        Scratch WAV file used for the round trip; it is overwritten on
        every call.
    verbose : bool, default True
        Print the sample count and durations (the original behaviour).

    Returns
    -------
    The object returned by ``feature_extractor``, or ``None`` when
    ``audio_list`` is ``None``.
    """
    # Missing audio: nothing to compute.
    if audio_list is None:
        return None

    if len(audio_list) == 0:
        # Empty audio: substitute a silent clip. It is exported at pydub's
        # default frame rate and brought to target_sr by the single
        # resampling step below (the previous code resampled it twice).
        pad_ms = 1000  # milliseconds of silence needed
        AudioSegment.silent(duration=pad_ms).export(wav_path, format='wav')
    else:
        # Accept both NumPy arrays and plain Python sequences.
        samples = audio_list.tolist() if hasattr(audio_list, 'tolist') else list(audio_list)

        if verbose:
            print(f"Length of audio list: {len(samples)}")
            print(f"Original duration: {len(samples) / source_sr} seconds")

        # Save the audio data with its original sampling rate.
        sf.write(wav_path, samples, source_sr)

    # Resample the scratch file to the rate the feature extractor works at.
    resample_audio(wav_path, target_sr)

    # Load the resampled audio.
    audio, _ = librosa.load(wav_path, sr=target_sr)
    if verbose:
        print(f"Resampled duration: {len(audio) / target_sr} seconds")

    # Pass the rate the audio really has; with the defaults this equals the
    # 16000 Hz the extractor was configured with.
    return feature_extractor(audio, sampling_rate=target_sr)

In [6]:
# Create two new columns for embeddings and initialize with None
df['client_wav2vec_emb'] = None
df['therapist_wav2vec_emb'] = None

# Iterate through the DataFrame
for index, row in tqdm(df.iterrows(), total=df.shape[0], desc='Processing'):
    # Determine the role (client or therapist) from the interlocutor column
    role = row['interlocutor']
    video_title = row['video_title']
    utterance = row['utterance_id']
    
    print(f'Processing {role} {utterance} {video_title}...')

    audio_list = row[f'{role}_audio_utterance']

    # Compute the embeddings for the given role
    embeddings = compute_embeddings(audio_list)

    # Save the embeddings in the DataFrame
    df.at[index, f'{role}_wav2vec_emb'] = embeddings

Processing:   0%|          | 0/12778 [00:00<?, ?it/s]

Processing therapist 0 NEW VIDEO: Brief intervention: "Barbara"...
Length of audio list: 485100
Original duration: 11.0 seconds
Resampled duration: 11.0 seconds
Processing client 1 NEW VIDEO: Brief intervention: "Barbara"...
Length of audio list: 44100
Original duration: 1.0 seconds
Resampled duration: 1.0 seconds
Processing therapist 2 NEW VIDEO: Brief intervention: "Barbara"...
Length of audio list: 396900
Original duration: 9.0 seconds
Resampled duration: 9.0 seconds
Processing client 3 NEW VIDEO: Brief intervention: "Barbara"...
Resampled duration: 1.0 seconds
Processing therapist 4 NEW VIDEO: Brief intervention: "Barbara"...
Length of audio list: 220500
Original duration: 5.0 seconds
Resampled duration: 5.0 seconds
Processing client 5 NEW VIDEO: Brief intervention: "Barbara"...
Length of audio list: 132300
Original duration: 3.0 seconds
Resampled duration: 3.0 seconds
Processing therapist 6 NEW VIDEO: Brief intervention: "Barbara"...
Length of audio list: 176400
Original duration:

In [7]:
# Persist the table, including the two new *_wav2vec_emb columns.
df.to_pickle(path='AnnoMI-wav2vec-new.pkl')