# Clustering Audio Files Using Imagebind LLM Embeddings

## Installation
Cloning ImageBind repo and installing dependencies

In [1]:
!git clone https://github.com/facebookresearch/ImageBind.git
!pip install git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d timm==0.6.7 ftfy regex einops fvcore decord==0.6.0

Cloning into 'ImageBind'...
remote: Enumerating objects: 117, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 117 (delta 45), reused 34 (delta 33), pack-reused 51[K
Receiving objects: 100% (117/117), 2.64 MiB | 22.15 MiB/s, done.
Resolving deltas: 100% (52/52), done.
Collecting git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
  Cloning https://github.com/facebookresearch/pytorchvideo.git (to revision 28fe037d212663c6a24f373b94cc5d478c8c1a1d) to /tmp/pip-req-build-t_4k1jde
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/pytorchvideo.git /tmp/pip-req-build-t_4k1jde
  Running command git rev-parse -q --verify 'sha^28fe037d212663c6a24f373b94cc5d478c8c1a1d'
  Running command git fetch -q https://github.com/facebookresearch/pytorchvideo.git 28fe037d212663c6a24f373b94cc5d478c8c1a1d
  Running command git checkout -q 28fe037d212

## Listening To A Audio File

In [2]:
%cd /content/ImageBind

/content/ImageBind


In [4]:
import IPython
from PIL import Image

IPython.display.Audio(f".assets/car_audio.wav")

## Finding Embeddings For Audio Files Using ImageBind

In [6]:
import os
# Listing all the files in the directory
assets_dir = ".assets/"
all_files = os.listdir(assets_dir)
all_files

['car_image.jpg',
 'dog_image.jpg',
 'bird_image.jpg',
 'dog_audio.wav',
 'bird_audio.wav',
 'car_audio.wav']

In [8]:
from imagebind import data
import torch
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType

# Selecting only audio files
audio_paths=[os.path.join(assets_dir, file) for file in all_files if os.path.splitext(file)[1].lower() in [".wav"]]

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Instantiate model
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# Load data
inputs = {
    ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, device),
}

with torch.no_grad():
    embeddings = model(inputs)

print(
    "Audio : ",
    torch.softmax(embeddings[ModalityType.AUDIO], dim=-1)
)

Audio :  tensor([[0.0008, 0.0005, 0.0022,  ..., 0.0011, 0.0007, 0.0004],
        [0.0010, 0.0006, 0.0012,  ..., 0.0006, 0.0009, 0.0009],
        [0.0016, 0.0010, 0.0003,  ..., 0.0013, 0.0006, 0.0013]],
       device='cuda:0')


## Clustering

In [9]:
from sklearn.cluster import KMeans
audio_embeddings = torch.softmax(embeddings[ModalityType.AUDIO], dim=-1)

# Convert the embeddings to a numpy array and detach them from the GPU
audio_text_embeddings_np = audio_embeddings.cpu().detach().numpy()

# Choose the number of clusters
n_clusters = 3

# Perform K-means clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(audio_text_embeddings_np)

# Get the cluster labels for each data point
cluster_labels = kmeans.labels_




In [10]:
cluster_labels

array([0, 2, 1], dtype=int32)