# Audio Clustering

```
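Here, we cluster each subject's WAV files by acoustic similarity. Every file is summarized by its frame-averaged MFCCs, spectral centroid, and spectral rolloff, and the files are then grouped with agglomerative clustering.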
```

In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
import librosa
import os

In [2]:
# Root data folder; later cells list its sub-directories and look for a
# 'wavfiles' folder inside each one.
data_root = os.path.join('..', '..', 'data', 'anesthetized')
# Stimulus metadata table; the 'subject' and 'stim_number' columns are used
# further down to decide which WAV files belong to each subject.
aa_4 = pd.read_csv('aa-4.csv')
# Preview the first rows.
aa_4.head()

Unnamed: 0,subject,site,single_unit,sortType,stim_number,original_wavfile,callerAge,callid,stim_duration,stim_source,stim_source_sex,stim_type,brain_region,hemisphere,vocid,auditory_cortex
0,BlaBro09xxF,Site1,Site1_L1500R1500_e10_s0_ss1,single,100,WhiWhi1415_FAF_Ne_3-8-9.wav,A,Ne,2.5,familiar,f,call,NCM,L,Ne,A2
1,BlaBro09xxF,Site1,Site1_L1500R1500_e10_s0_ss1,single,101,WhiWhi1415_FAF_Ne_6-4-5.wav,A,Ne,2.5,familiar,f,call,NCM,L,Ne,A2
2,BlaBro09xxF,Site1,Site1_L1500R1500_e10_s0_ss1,single,102,WhiWhi1415_FAF_Ne_7-2-10.wav,A,Ne,2.5,familiar,f,call,NCM,L,Ne,A2
3,BlaBro09xxF,Site1,Site1_L1500R1500_e10_s0_ss1,single,103,WhiWhi1415_FAF_Te_10-9-3.wav,A,Te,2.5,familiar,f,call,NCM,L,Te,A2
4,BlaBro09xxF,Site1,Site1_L1500R1500_e10_s0_ss1,single,104,WhiWhi1415_FAF_Te_4-6-2.wav,A,Te,2.5,familiar,f,call,NCM,L,Te,A2


In [8]:
# Scratch check of dict insertion with a string key.
# NOTE(review): `d` is not referenced anywhere else in the visible notebook;
# this looks like a leftover experiment - confirm whether the cell can go.
d = dict(a=1)
d.update({'2': 2})
d

{'a': 1, '2': 2}

In [13]:
def agglomerative_clustering(audio_folder, valid_filenames, n_clusters=10):
    """Cluster the WAV files of one subject by acoustic similarity.

    Every file is reduced to one feature vector: the per-coefficient mean of
    20 MFCCs followed by the mean spectral centroid and the mean spectral
    rolloff (means are taken along axis 1 of the librosa feature matrices).
    The vectors are then grouped with ``AgglomerativeClustering``.

    Parameters
    ----------
    audio_folder : str
        Folder that holds the WAV files. The subject name is the name of the
        directory containing this folder, e.g.
        ``.../BlaBro09xxF/wavfiles`` -> ``'BlaBro09xxF'``.
    valid_filenames : iterable of str
        File names (relative to ``audio_folder``) to cluster. Entries that do
        not end in ``'.wav'`` are ignored.
    n_clusters : int, default 10
        Number of clusters passed to ``AgglomerativeClustering``.

    Returns
    -------
    list of dict
        One record per clustered file, in input order, with the keys
        ``'subject'``, ``'stim_number'`` (the file name as given) and
        ``'cluster'`` (the label assigned to that file). Empty when no
        ``.wav`` file name was supplied.
    """
    # Take the subject from the path structure instead of fixed character
    # offsets, so subject names of any length and trailing separators work.
    subject = os.path.basename(os.path.dirname(os.path.normpath(audio_folder)))

    def extract_features(file_path):
        """Return the averaged MFCC / centroid / rolloff vector of one file."""
        # sr=None keeps the file's native sampling rate.
        y, sr = librosa.load(file_path, sr=None)
        feature_matrices = (
            librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20),
            librosa.feature.spectral_centroid(y=y, sr=sr),
            librosa.feature.spectral_rolloff(y=y, sr=sr),
        )
        features = []
        for matrix in feature_matrices:
            features.extend(np.mean(matrix, axis=1))
        return features

    # Filter once and reuse the filtered list for both feature extraction and
    # label assignment; this keeps row i of X aligned with label i even when
    # non-WAV names are present in valid_filenames.
    wav_filenames = [name for name in valid_filenames if name.endswith('.wav')]
    if not wav_filenames:
        # Nothing to cluster; fitting on an empty array would raise.
        return []

    X = np.array([
        extract_features(os.path.join(audio_folder, name))
        for name in wav_filenames
    ])

    agg_clustering = AgglomerativeClustering(n_clusters=n_clusters)
    agg_clustering.fit(X)

    return [
        {'subject': subject, 'stim_number': name, 'cluster': label}
        for name, label in zip(wav_filenames, agg_clustering.labels_)
    ]

In [14]:
# One '<data_root>/<subject>/wavfiles' path per subject directory.
# os.listdir returns entries in arbitrary order, so sort them to make the row
# order of `data` reproducible across runs and machines.
folder_paths = [
    os.path.join(data_root, folder, 'wavfiles')
    for folder in sorted(os.listdir(data_root))
    if os.path.isdir(os.path.join(data_root, folder))
]

data = []
for folder_path in folder_paths:
    # The subject is the directory that contains the 'wavfiles' folder; read it
    # from the path structure rather than from fixed character offsets.
    subject = os.path.basename(os.path.dirname(folder_path))

    print(subject)
    # NOTE(review): subjects starting with 'WhiBlu' are skipped; the reason is
    # not recorded in this notebook - confirm and document it.
    if subject.startswith('WhiBlu'):
        continue

    # Only cluster the stimuli listed for this subject in the metadata table.
    df = aa_4[aa_4['subject'] == subject]
    valid_filenames = ['stim' + str(s) + '.wav' for s in df['stim_number'].unique()]

    subject_data = agglomerative_clustering(folder_path, valid_filenames, n_clusters=10)
    data.extend(subject_data)
print('All done!')

BlaBro09xxF
GreBlu9508M
LblBlu2028M
WhiBlu5396M
WhiWhi4522M




YelBlu6903F
All done!


In [22]:
# Collect the per-file records (subject, stim_number, cluster) into one table.
cluster_df = pd.DataFrame(data)
# Preview a few random rows. The fixed random_state makes the preview identical
# on every run, and capping n avoids an error when fewer than 5 rows exist.
cluster_df.sample(n=min(5, len(cluster_df)), random_state=0)

Unnamed: 0,subject,stim_number,cluster
256,LblBlu2028M,stim198.wav,5
515,YelBlu6903F,stim149.wav,9
74,BlaBro09xxF,stim44.wav,1
463,WhiWhi4522M,stim85.wav,3
220,GreBlu9508M,stim80.wav,1


In [31]:
# Reduce 'stim<N>.wav' file names to the integer N.
# Casting to str first makes the cell idempotent: if it is run again after the
# column already holds integers, the values pass through unchanged instead of
# raising on integer slicing.
cluster_df['stim_number'] = (
    cluster_df['stim_number']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)
stim_numbers = cluster_df['stim_number'].tolist()

In [32]:
# Write the cluster assignments to disk; index=False leaves the DataFrame's
# row index out of the CSV.
cluster_df.to_csv('cluster_df.csv', index=False)