## Import testing dataset

The test dataset will be fetched from Kaggle.

See [freesound-audio-tagging-2019 dataset](https://www.kaggle.com/c/freesound-audio-tagging-2019).

To run the code below you need a `kaggle.json` API token: log in to your Kaggle account, go to Settings, and choose "Generate API Key". Put the downloaded `kaggle.json` in the notebook's working directory.

In [None]:
!pip install jovian --upgrade --quiet

In [None]:
import jovian

In [None]:
!mkdir ~/.kaggle && \
cp kaggle.json ~/.kaggle/ && \
chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the freesound-audio-tagging-2019 competition data with the kaggle CLI.
# The next cell expects the result as freesound-audio-tagging-2019.zip in the
# current working directory.
!kaggle competitions download -c freesound-audio-tagging-2019

In [None]:
## Unzip dataset files
from zipfile import ZipFile

def extract_zip(file, extract_dir = ".", only_first_X_files=None):
    """Unpack a zip archive into ``extract_dir``.

    With ``only_first_X_files`` left at ``None`` the whole archive is unpacked.
    Otherwise only that many leading entries of the archive's name list are
    unpacked (the count must be non-negative).
    """
    with ZipFile(file, 'r') as archive:
        entry_names = archive.namelist()

        # Full extraction is the simple case; handle it first and leave.
        if only_first_X_files is None:
            print('Extracting all files...')
            archive.extractall(path=extract_dir)
            print('Done!')
            return

        assert only_first_X_files >= 0
        leading_entries = entry_names[:only_first_X_files]

        print('Extracting the first', only_first_X_files, 'files...')
        # extractall with an explicit member list unpacks exactly those entries.
        archive.extractall(path=extract_dir, members=leading_entries)
        print('Done!')

# Unpack the complete competition archive into the current directory.
all_zips = "freesound-audio-tagging-2019.zip"
extract_zip(file=all_zips, extract_dir=".")

In [None]:
!mkdir AudioSep/data/noisy AudioSep/data/curated

In [None]:
# Unpack only the first 100 archive entries of each split
# (presumably to keep the experiment small - adjust the count as needed).
zip_noisy = "train_noisy.zip"
extract_zip(zip_noisy, "AudioSep/data/noisy", 100)

# The curated archive gets its own name instead of overwriting `zip_noisy`.
zip_curated = "train_curated.zip"
extract_zip(zip_curated, "AudioSep/data/curated", 100)

In [None]:
!rm -rf freesound-audio-tagging-2019.zip test.zip sample_submission.csv train_curated.zip && rm -rf train_noisy.zip

In [None]:
import pandas as pd

# Label tables for the two training splits.
curated_df = pd.read_csv("train_curated.csv")
noisy_df = pd.read_csv("train_noisy.csv")

# Directories the audio clips of each split were extracted into.
data_dir_curated = "AudioSep/data/curated"
data_dir_noisy = "AudioSep/data/noisy"

# Preview the noisy label table.
noisy_df.head()

In [None]:
# Collect the distinct labels of each split. The `labels` column holds
# comma-separated labels, so split and explode before de-duplicating.
classes_noisy = noisy_df['labels'].str.split(',').explode().unique()
classes_curated = curated_df['labels'].str.split(',').explode().unique()

# Sort in place so the class order used downstream is stable.
# ndarray.sort() returns None, so it must not be called inside the comparison
# below (None == None would always report True).
classes_noisy.sort()
classes_curated.sort()

print("Noisy: \n\n", classes_noisy)
print("\n\nCurated: \n\n", classes_curated)
print("\n\n\nCurated == Noisy? ", set(classes_noisy) == set(classes_curated))

classes = classes_noisy

print("\n\n\nNum of classes ", len(classes))

In [None]:
import numpy as np
import os

def list_files_in_directory(directory):
    """Return the paths of all files found under ``directory`` (recursively) as a NumPy array."""
    return np.array([
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
    ])


# Resolve the data directories to absolute paths while the working directory is
# still the one the archives were extracted into. A later cell runs `%cd` into
# the AudioSep checkout, after which relative paths would no longer resolve.
data_dir_noisy = os.path.abspath("AudioSep/data/noisy")
data_dir_curated = os.path.abspath("AudioSep/data/curated")

# Keep at most 50 clips per split. Sort first: os.walk yields files in an
# arbitrary order, so an unsorted slice could pick different clips on each run.
files_curated = np.sort(list_files_in_directory(data_dir_curated))[:50]
files_noisy = np.sort(list_files_in_directory(data_dir_noisy))[:50]

## Separation
with *Separate Anything You Describe*

In [None]:
from pathlib import Path

repo_path = Path("/content/AudioSep")
if not repo_path.exists():
    !git clone https://github.com/Audio-AGI/AudioSep.git

%cd /content/AudioSep

Cloning into 'AudioSep'...
remote: Enumerating objects: 273, done.[K
remote: Counting objects: 100% (115/115), done.[K
remote: Compressing objects: 100% (57/57), done.[K
remote: Total 273 (delta 83), reused 58 (delta 58), pack-reused 158[K
Receiving objects: 100% (273/273), 16.52 MiB | 14.74 MiB/s, done.
Resolving deltas: 100% (122/122), done.
/content/AudioSep


In [None]:
!pip install torchlibrosa==0.1.0 gradio==3.47.1 gdown lightning transformers==4.28.1 ftfy braceexpand webdataset soundfile wget h5py

Collecting torchlibrosa==0.1.0
  Downloading torchlibrosa-0.1.0-py3-none-any.whl (11 kB)
Collecting gradio==3.47.1
  Downloading gradio-3.47.1-py3-none-any.whl (20.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.3/20.3 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightning
  Downloading lightning-2.2.5-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.1
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ftfy
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting braceexpand
  Downloading braceexpand-0.1.7-py2.py3-none-any.whl (5.9 kB)
Collecting we

In [None]:
checkpoints_dir = Path("checkpoint")
checkpoints_dir.mkdir(exist_ok=True)

models = (
    (
        "https://huggingface.co/spaces/badayvedat/AudioSep/resolve/main/checkpoint/audiosep_base_4M_steps.ckpt",
        checkpoints_dir / "audiosep_base_4M_steps.ckpt"
    ),
    (
        "https://huggingface.co/spaces/badayvedat/AudioSep/resolve/main/checkpoint/music_speech_audioset_epoch_15_esc_89.98.pt",
        checkpoints_dir / "music_speech_audioset_epoch_15_esc_89.98.pt"
    )
)

for model_url, model_path in models:
    if not model_path.exists():
        !wget {model_url} -O {model_path}

--2024-06-04 11:38:23--  https://huggingface.co/spaces/badayvedat/AudioSep/resolve/main/checkpoint/audiosep_base_4M_steps.ckpt
Resolving huggingface.co (huggingface.co)... 3.163.189.90, 3.163.189.37, 3.163.189.114, ...
Connecting to huggingface.co (huggingface.co)|3.163.189.90|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/repos/25/5f/255f11fd2743cf4de0c72f1eba5cdf82634d22af3ce4fcc5b16dbd7b7feddaca/f8cda01bfd0ebd141eef45d41db7a3ada23a56568465840d3cff04b8010ce82c?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27audiosep_base_4M_steps.ckpt%3B+filename%3D%22audiosep_base_4M_steps.ckpt%22%3B&Expires=1717760303&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxNzc2MDMwM319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9yZXBvcy8yNS81Zi8yNTVmMTFmZDI3NDNjZjRkZTBjNzJmMWViYTVjZGY4MjYzNGQyMmFmM2NlNGZjYzViMTZkYmQ3YjdmZWRkYWNhL2Y4Y2RhMDFiZmQwZWJkMTQxZWVmNDVkNDFkYjdhM2F

In [None]:
!mkdir ../output ../output/curated ../output/noisy

In [None]:
import torch
from pipeline import build_audiosep, separate_audio

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Build the AudioSep model from the first downloaded checkpoint.
model = build_audiosep(
    config_yaml='config/audiosep_base.yaml',
    checkpoint_path=str(models[0][1]),
    device=device,
)


def _separate_for_all_classes(audio_files, output_prefix):
    """Query AudioSep with every class label for each clip and save each result.

    Each output is written to ``output_prefix + <class label> + "_" + <clip file name>``.
    """
    for audio_file in audio_files:
        clip_name = audio_file.split("/")[-1]
        for class_label in classes:
            output_file = output_prefix + class_label + "_" + clip_name
            # AudioSep processes the audio at 32 kHz sampling rate
            separate_audio(model, audio_file, class_label, output_file, device)
            print(f"The separated audio is saved to: '{output_file}' file.")


# Curated split: only the first clip is processed.
_separate_for_all_classes([files_curated[0]], "../output/curated/")

# Noisy split: every selected clip is processed.
_separate_for_all_classes(files_noisy, "../output/noisy/")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another

[1;30;43mStrumieniowane dane wyjściowe obcięte do 5000 ostatnich wierszy.[0m
Separated audio written to [output/noisy/Cheering_4604bbc5.wav]
The separated audio is saved to: 'output/noisy/Cheering_4604bbc5.wav' file.
Separating audio from [data/noisy/4604bbc5.wav] with textual query: [Chewing_and_mastication]
Separated audio written to [output/noisy/Chewing_and_mastication_4604bbc5.wav]
The separated audio is saved to: 'output/noisy/Chewing_and_mastication_4604bbc5.wav' file.
Separating audio from [data/noisy/4604bbc5.wav] with textual query: [Child_speech_and_kid_speaking]
Separated audio written to [output/noisy/Child_speech_and_kid_speaking_4604bbc5.wav]
The separated audio is saved to: 'output/noisy/Child_speech_and_kid_speaking_4604bbc5.wav' file.
Separating audio from [data/noisy/4604bbc5.wav] with textual query: [Chink_and_clink]
Separated audio written to [output/noisy/Chink_and_clink_4604bbc5.wav]
The separated audio is saved to: 'output/noisy/Chink_and_clink_4604bbc5.wav' f

## Classification
with MS CLAP

In [None]:
!pip install msclap

Collecting msclap
  Downloading msclap-1.3.3-py3-none-any.whl (31 kB)
Collecting scikit-learn<2.0.0,>=1.3.1 (from msclap)
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
Collecting torchvision<0.17.0,>=0.16.0 (from msclap)
  Downloading torchvision-0.16.2-cp310-cp310-manylinux1_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<5.0.0,>=4.34.0 (from msclap)
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m94.8 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
Collecting torchvision<0.17

In [None]:
"""
Source of this code: https://github.com/microsoft/CLAP/tree/main/examples

This is an example using CLAP for zero-shot inference.
"""
from msclap import CLAP
import torch.nn.functional as F

def clap_predict( classes, audio_files, clap_model ):
  """Score the first audio file against every class with CLAP.

  Each class name is turned into the text prompt 'this is a sound of <class>'.
  The audio/text similarities are soft-maxed over the classes, and the scores
  of the first entry of ``audio_files`` are returned ranked from highest to lowest.

  Returns a ``(values, indices)`` pair: the ranked probabilities and the
  positions in ``classes`` they belong to.
  """
  text_queries = ['this is a sound of ' + label for label in classes]

  # Embed the prompts and the audio with the CLAP model.
  text_emb = clap_model.get_text_embeddings(text_queries)
  audio_emb = clap_model.get_audio_embeddings(audio_files, resample=True)

  # Similarity of each audio file to each prompt, normalised over the prompts.
  class_probs = F.softmax(clap_model.compute_similarity(audio_emb, text_emb), dim=1)

  # Rank all classes for the first audio file only.
  first_file_probs = class_probs[0]
  values, indices = first_file_probs.topk(len(classes))
  return values, indices


In [None]:
import pandas as pd
import numpy as np

# Column layout of the result tables: source file name, the class AudioSep was
# asked to isolate, then one CLAP score column per class.
# list.append() returns None, so the column list is built by concatenation.
columns = ['fname', 'separated_class'] + list(classes)


def _parse_separated_name(audio_file):
    """Split '<dir>/<class>_<fname>' into (fname, class).

    Class names themselves contain underscores (e.g. 'Race_car_and_auto_racing'),
    so only the LAST underscore separates the class from the file name.
    Assumes the original clip name has no underscore (the clips seen in the
    recorded outputs are hex ids such as '4604bbc5.wav') - TODO confirm.
    """
    base_name = audio_file.split("/")[-1]
    separated_class, _, fname = base_name.rpartition("_")
    return fname, separated_class


def _classify_files(audio_files):
    """Run CLAP on every separated file and return one result row per file."""
    records = []
    for audio_file in audio_files:
        values, indices = clap_predict(classes, [audio_file], clap_model)
        scores = {class_name: 0 for class_name in classes}
        for value, index in zip(values, indices):
            scores[classes[index]] = round(value.item(), 4)
        fname, separated_class = _parse_separated_name(audio_file)
        records.append({'fname': fname, 'separated_class': separated_class, **scores})
        print(f"{audio_file} classified succesfully.")
    # Build the frame once instead of concatenating row by row inside the loop.
    return pd.DataFrame(records, columns=columns)


# The separation cell writes to ../output/... relative to the working directory,
# so read the separated files from the same place.
separated_noisy = list_files_in_directory("../output/noisy")
separated_curated = list_files_in_directory("../output/curated")

# Load and initialize CLAP; use the GPU only when CUDA is actually available.
clap_model = CLAP(version = '2023', use_cuda=torch.cuda.is_available())
print("Clap loaded!")

noisy_results_df = _classify_files(separated_noisy)
curated_results_df = _classify_files(separated_curated)



Clap loaded!
AudioSep/output/noisy/Raindrop_4604bbc5.wav classified succesfully.
AudioSep/output/noisy/Mechanical_fan_7dc80beb.wav classified succesfully.
AudioSep/output/noisy/Meow_3119d5b6.wav classified succesfully.
AudioSep/output/noisy/Squeak_a53b412b.wav classified succesfully.
AudioSep/output/noisy/Race_car_and_auto_racing_354934ea.wav classified succesfully.
AudioSep/output/noisy/Child_speech_and_kid_speaking_54260b02.wav classified succesfully.
AudioSep/output/noisy/Knock_eb268626.wav classified succesfully.
AudioSep/output/noisy/Gasp_92f9a71b.wav classified succesfully.
AudioSep/output/noisy/Hi-hat_eb72d661.wav classified succesfully.
AudioSep/output/noisy/Car_passing_by_3119d5b6.wav classified succesfully.
AudioSep/output/noisy/Acoustic_guitar_eb72d661.wav classified succesfully.
AudioSep/output/noisy/Sigh_ad9690a8.wav classified succesfully.
AudioSep/output/noisy/Screaming_eb268626.wav classified succesfully.
AudioSep/output/noisy/Gurgling_abedfe71.wav classified succesfull

In [None]:
# Preview the first rows of the noisy-split results.
noisy_results_df.head(n=5)

Unnamed: 0,fname,separated_class,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,4604bbc5.wav,Raindrop,0.0,0.0,0.0,0.0,0.0248,0.0,0.0,0.0015,...,0.0007,0.0001,0.0,0.0053,0.0039,0.0,0.0048,0.0044,0.0001,0.0049
1,7dc80beb.wav,Mechanical,0.0014,0.0,0.0,0.0,0.0001,0.0,0.0,0.0008,...,0.0004,0.0429,0.0,0.0005,0.0001,0.0,0.0062,0.0006,0.0,0.0014
2,3119d5b6.wav,Meow,0.0233,0.0,0.0016,0.0008,0.0002,0.0,0.0001,0.006,...,0.001,0.0048,0.0003,0.002,0.0151,0.0006,0.0007,0.0008,0.0,0.0392
3,a53b412b.wav,Squeak,0.0001,0.0,0.0,0.0,0.0,0.0,0.0,0.0714,...,0.0027,0.0044,0.0,0.002,0.2008,0.0001,0.0009,0.0,0.0,0.0
4,354934ea.wav,Race,0.0031,0.0,0.0001,0.0,0.0002,0.0,0.0,0.0014,...,0.0002,0.0001,0.0,0.002,0.0011,0.0,0.0001,0.1311,0.0,0.0034


In [None]:
# Preview the first rows of the curated-split results.
curated_results_df.head(n=5)

Unnamed: 0,fname,separated_class,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,a3c8a463.wav,Cheering,0.0006,0.001,0.0003,0.0002,0.0016,0.0,0.0,0.0994,...,0.0087,0.0007,0.0,0.0058,0.0167,0.1447,0.0331,0.0011,0.0,0.0028
1,a3c8a463.wav,Car,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0001,0.0,0.0,0.0,0.9886,0.0,0.0,0.0,0.0
2,a3c8a463.wav,Bicycle,0.0027,0.0,0.0001,0.0001,0.0021,0.0,0.0,0.0775,...,0.0029,0.0001,0.0,0.0034,0.0206,0.0004,0.0477,0.0033,0.0,0.0169
3,a3c8a463.wav,Zipper,0.001,0.0,0.0001,0.0,0.0003,0.0,0.0,0.0916,...,0.0015,0.0001,0.0001,0.0043,0.0474,0.002,0.0078,0.0002,0.0,0.0034
4,a3c8a463.wav,Run,0.0008,0.0,0.0001,0.0,0.0014,0.0,0.0,0.0151,...,0.0004,0.0006,0.0001,0.0126,0.0016,0.0564,0.2027,0.0017,0.0,0.0143


In [None]:
import os
# Write both result tables to CSV files, leaving out the index column.
for results_frame, csv_name in (
    (curated_results_df, 'curated_results.csv'),
    (noisy_results_df, 'noisy_results.csv'),
):
    results_frame.to_csv(csv_name, index=False)