In [1]:
import os
import random
import librosa
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity


In [9]:
# Sampling rate passed to librosa.load; also used to convert seconds to samples.
sr = 44_100
# How many files compare_audio_folders draws at random from each folder.
num_files = 4

In [3]:
def compare_audio_folders(folder1, folder2, max_second):
    """Compare randomly paired audio files drawn from two folders.

    Up to ``num_files`` regular files are sampled at random from each folder
    and paired in sampling order. Every clip is loaded at the module-level
    sampling rate ``sr`` and truncated or zero-padded to ``max_second``
    seconds, after which two metrics are computed per pair:

    * Euclidean distance between the raw waveforms.
    * Cosine distance (``1 - cosine similarity``) between the flattened
      Mel spectrograms.

    Note:
        The two folders are sampled independently. When ``folder1`` and
        ``folder2`` are the same directory, a file can be paired with itself,
        which yields a distance of 0 for that row.

    Args:
        folder1: Directory holding the first set of audio files.
        folder2: Directory holding the second set of audio files.
        max_second: Clip length in seconds that every file is padded or
            truncated to. May be fractional.

    Returns:
        pandas.DataFrame with one row per pair and the columns
        ``audio_path1``, ``audio_path2``, ``euclidean_distance`` and
        ``cosine_distance``. The frame is empty if either folder has no files.
    """
    # Slicing and np.pad need an integer sample count, so a fractional
    # max_second (e.g. 1.5) must be converted explicitly.
    max_length = int(sr * max_second)

    def list_files(folder):
        # Keep regular files only; sub-directories cannot be loaded as audio.
        return [
            name for name in os.listdir(folder)
            if os.path.isfile(os.path.join(folder, name))
        ]

    # Draw up to num_files random files from each folder. Clamping the sample
    # size avoids random.sample raising ValueError on small folders.
    candidates1 = list_files(folder1)
    candidates2 = list_files(folder2)
    files1 = random.sample(candidates1, min(num_files, len(candidates1)))
    files2 = random.sample(candidates2, min(num_files, len(candidates2)))

    def load_audio(file_path):
        # Load at the shared sampling rate, then force a fixed length so the
        # two waveforms of a pair can be subtracted element-wise.
        y, _ = librosa.load(file_path, sr=sr)
        if len(y) > max_length:
            y = y[:max_length]
        else:
            y = np.pad(y, (0, max_length - len(y)), 'constant')
        return y

    data = []

    # zip stops at the shorter list, so the number of pairs equals the
    # smaller of the two sample sizes.
    for file1, file2 in zip(files1, files2):
        path1 = os.path.join(folder1, file1)
        path2 = os.path.join(folder2, file2)

        y1 = load_audio(path1)
        y2 = load_audio(path2)

        # Euclidean distance on the raw waveform
        euclidean_dist = np.linalg.norm(y1 - y2)

        # Mel spectrograms, flattened to 1D so cosine similarity can be applied
        mel1_flat = librosa.feature.melspectrogram(y=y1, sr=sr).flatten()
        mel2_flat = librosa.feature.melspectrogram(y=y2, sr=sr).flatten()

        # Cosine distance on the Mel spectrogram
        cosine_dist = 1 - cosine_similarity([mel1_flat], [mel2_flat])[0, 0]

        data.append([path1, path2, euclidean_dist, cosine_dist])

    # Build the result once from the collected rows
    df = pd.DataFrame(
        data,
        columns=['audio_path1', 'audio_path2', 'euclidean_distance', 'cosine_distance'],
    )

    return df


In [44]:
from pydub import AudioSegment
from pydub.silence import split_on_silence
import os

def remove(input_file, output_file, min_silence_len=200, silence_thresh=-45):
    """Cut the silent stretches out of an audio file and save the rest as WAV.

    The file is split with pydub's ``split_on_silence`` and the resulting
    chunks are concatenated back together in their original order.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path the processed audio is exported to (WAV format).
        min_silence_len: Forwarded to ``split_on_silence``
            (presumably milliseconds, per pydub's docs - confirm).
        silence_thresh: Forwarded to ``split_on_silence``
            (presumably dBFS, per pydub's docs - confirm).
    """
    source_audio = AudioSegment.from_file(input_file)
    non_silent_chunks = split_on_silence(
        source_audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )
    # Start from an empty segment so a file with no chunks still exports.
    joined = sum(non_silent_chunks, AudioSegment.empty())
    joined.export(output_file, format='wav')


def process_audio(source_folder,destination_folder,audio_name):
    """Strip silence from every file in a folder and write numbered WAV files.

    Each regular file in ``source_folder`` is passed through ``remove`` and
    written to ``destination_folder`` as ``{audio_name}_{idx}.wav``, where
    ``idx`` is the file's position in the alphabetically sorted listing.

    Note:
        If ``source_folder`` and ``destination_folder`` are the same
        directory, an output file overwrites any existing file with the same
        name, including a source file that has not been processed yet.
        Writing to a separate destination folder avoids this.

    Args:
        source_folder: Directory containing the input audio files.
        destination_folder: Directory receiving the processed files; created
            if it does not exist.
        audio_name: Prefix used for the output file names.
    """
    # Snapshot the inputs first: sorting makes the idx -> file mapping
    # deterministic, and sub-directories are skipped because they cannot be
    # loaded as audio.
    source_files = sorted(
        name for name in os.listdir(source_folder)
        if os.path.isfile(os.path.join(source_folder, name))
    )

    # Make sure the export target exists before writing anything.
    os.makedirs(destination_folder, exist_ok=True)

    for idx, file_name in enumerate(source_files):
        source_file_path = os.path.join(source_folder, file_name)
        destination_file_path = os.path.join(destination_folder, f'{audio_name}_{idx}.wav')
        remove(source_file_path, destination_file_path)


# Input and output point at the same directory, so the processed files are
# written next to the original recordings.
source_folder = destination_folder = r"D:\data_code\data\recording\system\hilfe_hilfe"
process_audio(source_folder, destination_folder, 'Hilfe_hilfe')

In [30]:

# Example usage: both sides are sampled from the same folder, using 2-second clips
max_second = 2
folder1 = folder2 = r'D:\data_code\data\adele\2 percentile'
df = compare_audio_folders(folder1, folder2, max_second)
df.describe()

Unnamed: 0,euclidean_distance,cosine_distance
count,4.0,4.0
mean,40.654564,0.908634
std,16.969013,0.071855
min,19.399591,0.818691
25%,31.490073,0.882336
50%,42.519388,0.910943
75%,51.683878,0.937241
max,58.179882,0.993956


In [43]:

# Example usage: both sides are sampled from the same folder, using 2-second clips
max_second = 2
folder1 = folder2 = r"D:\data_code\data\recording\system\adele"
df = compare_audio_folders(folder1, folder2, max_second)
df.describe()

Unnamed: 0,euclidean_distance,cosine_distance
count,4.0,4.0
mean,7.044243,0.3024013
std,6.902939,0.2514607
min,0.0,-2.384186e-07
25%,2.270155,0.1973933
50%,6.386243,0.2989496
75%,11.160331,0.4039575
max,15.404487,0.6117063
