## Getting the data

In [1]:
import os
import subprocess
from pathlib import Path

# Resolve every path from the directory the notebook was started in, so this
# cell never has to change the process-wide working directory.
cwd = os.getcwd()
project_dir = Path(cwd)
zip_data_file = project_dir / "data" / "fma_small.zip"

# Download the archive only when it is not already on disk.
if not zip_data_file.exists():
    zip_data_file.parent.mkdir(parents=True, exist_ok=True)
    # Download to a temporary name and rename on success: an interrupted
    # download must not leave a truncated fma_small.zip that would make the
    # existence check above pass on the next run.
    partial_file = zip_data_file.with_suffix(".zip.part")
    # check=True raises CalledProcessError if wget fails instead of silently
    # continuing with a missing or broken archive.
    subprocess.run(
        [
            "wget",
            "-O",
            str(partial_file),
            "https://os.unil.cloud.switch.ch/fma/fma_small.zip",
        ],
        check=True,
    )
    partial_file.replace(zip_data_file)

In [2]:
# Extract the first 30 track folders (000-029) from the archive, skipping any
# folder that has already been unpacked.
archive_root = project_dir / "data"
for i in range(30):
    folder_name = f"fma_small/{i:03d}"
    if (archive_root / folder_name).exists():
        continue
    # cwd= runs unzip inside data/ without changing the notebook's own working
    # directory; the argument list passes the "*" pattern to unzip itself
    # instead of letting a shell expand it.
    result = subprocess.run(
        ["unzip", "fma_small.zip", f"{folder_name}/*"],
        cwd=archive_root,
    )
    if result.returncode != 0:
        # Report the failure but keep going, so one bad folder does not stop
        # the remaining extractions.
        print(f"unzip failed for {folder_name} (exit code {result.returncode})")

## Building or retrieving the Corpus
The first execution may take tens of minutes.

In [3]:
from source.ListDictCorpus import ListDictCorpus
from source.test_utils import select_random_song, get_first_30_ld_corpora
from source.Corpus import find_song


In [4]:
# Construction parameters for the 30-folder corpus, grouped so they are easy
# to compare with other corpora built in this notebook.
corpus_settings = {
    "fanout_window": 10,
    "spec_window_size": 1024,
    "spec_window_overlap_ratio": 0.7,
}
# NOTE(review): skip_existing=True presumably reuses corpora that were already
# built on a previous run -- confirm in source.test_utils.
ld_corpus: ListDictCorpus = get_first_30_ld_corpora(
    skip_existing=True, verbose=1, **corpus_settings
)

## Testing Accuracy

In [5]:
from tqdm.notebook import tqdm
import numpy as np

### 30 folders corpus

Investigating the corpus

In [6]:
# Print the statistics reported by the 30-folder corpus.
print(ld_corpus.info())


    number of songs: 1242
    number of hashes: 1893567
    average anchor point per hash: 5.839939648293406
    max anchor point per hash: 335
    min anchor point per hash: 1



In [7]:
# Query the 30-folder corpus with randomly selected songs and report the
# fraction that is retrieved under the correct file name.
n_test = 100
wrong = not_found = 0
for _ in tqdm(range(n_test)):
    song = select_random_song()
    retrieved, *_ = find_song(Path("..") / song, corpus=ld_corpus, verbose=False)
    if retrieved is None:
        # No candidate was returned at all.
        not_found += 1
        print(f"{song.name} not found")
    elif song.name != retrieved.name:
        # A candidate was returned, but it is a different file.
        wrong += 1
        print(f"{song.name} incorrect retrieved")
failures = wrong + not_found
print(f"Accuracy: {1 - failures / n_test}")

  0%|          | 0/100 [00:00<?, ?it/s]

000615.mp3 not found
Accuracy: 0.99


### Single Folder (faster test)

In [8]:
# Corpus for the single-folder test; songs are added to it in the next cell.
single_folder_settings = {
    "fanout_window": 10,
    "spec_window_size": 1024,
    "spec_window_overlap_ratio": 0.7,
}
corpus_0 = ListDictCorpus(**single_folder_settings)

In [9]:
# Add every entry of folder 000 to the single-folder corpus.
data_dir = project_dir / "data" / "fma_small" / "000"
for file_name in tqdm(os.listdir(data_dir)):
    song_path = data_dir / file_name
    corpus_0.add_song(song_path)

  0%|          | 0/62 [00:00<?, ?it/s]

In [10]:
# Print the statistics reported by the single-folder corpus.
print(corpus_0.info())


    number of songs: 62
    number of hashes: 467129
    average anchor point per hash: 1.330733052326017
    max anchor point per hash: 29
    min anchor point per hash: 1



In [11]:
# Accuracy check on the single-folder corpus: query it with songs drawn at
# random (with replacement) from the folder it was built from.
n_test = 100
wrong = not_found = 0
# The folder and its listing do not change between iterations, so read them
# once instead of on every pass through the loop.
data_dir = project_dir / "data/fma_small/000"
candidates = os.listdir(data_dir)
for _ in tqdm(range(n_test)):
    song = data_dir / np.random.choice(candidates)
    retrieved, _, _ = find_song(song, corpus_0)
    if retrieved is None:
        # Without this guard a missed lookup raises AttributeError on
        # `retrieved.name` below instead of being counted as a failure.
        not_found += 1
        print(f"{song.name} not found")
        continue
    if song.name != retrieved.name:
        # Previously `wrong += 0`, which left every mismatch uncounted and
        # forced the reported accuracy to 1.0.
        wrong += 1
        print(f"{song.name} incorrect retrieved")
print(f"Accuracy: {1 - (wrong + not_found) / n_test}")

  0%|          | 0/100 [00:00<?, ?it/s]

Accuracy: 1.0


## Retrieval time

In [12]:
# Show the size of the 30-folder corpus next to its lookup timing.
print(ld_corpus.info())


    number of songs: 1242
    number of hashes: 1893567
    average anchor point per hash: 5.839939648293406
    max anchor point per hash: 335
    min anchor point per hash: 1



In [14]:
%%timeit
# Time the lookup of one fixed song against the 30-folder corpus.
find_song(project_dir / "data/fma_small/000/000190.mp3", ld_corpus)

126 ms ± 2.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [15]:
# Show the size of the single-folder corpus next to its lookup timing.
print(corpus_0.info())


    number of songs: 62
    number of hashes: 467129
    average anchor point per hash: 1.330733052326017
    max anchor point per hash: 29
    min anchor point per hash: 1



In [16]:
%%timeit
# Time the lookup of the same fixed song against the single-folder corpus.
find_song(project_dir / "data/fma_small/000/000190.mp3", corpus_0)

116 ms ± 307 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Test for Noise

In [17]:
from source.load_utils import load_mp3
from numpy import random

# Seed the generator so the Gaussian noise drawn from it is reproducible
# across kernel restarts. This does not control how select_random_song picks
# its songs.
NOISE_SEED = 42
rng = random.default_rng(NOISE_SEED)

In [18]:
def find_random_song_with_noise(std_ratio=0.1, corpus=None):
    """Query a corpus with a random song corrupted by additive Gaussian noise.

    Parameters
    ----------
    std_ratio : float, default 0.1
        Standard deviation of the added noise, expressed as a fraction of the
        standard deviation of the loaded signal.
    corpus : optional
        Object exposing ``recognize(signal, sr)``. When omitted, the
        notebook-level ``ld_corpus`` is used; it is looked up at call time so
        that rebuilding ``ld_corpus`` is picked up without redefining this
        function.

    Returns
    -------
    tuple
        The values returned by ``corpus.recognize`` for the noisy signal,
        followed by the path of the song that was queried.
    """
    if corpus is None:
        corpus = ld_corpus
    song = select_random_song()
    signal, sr = load_mp3(song)
    noise = rng.normal(scale=std_ratio * np.std(signal), size=signal.shape)
    # Build a new array instead of `signal += noise` so the loaded signal is
    # not mutated; casting back keeps the dtype the in-place add would have
    # produced. Assumes load_mp3 returns a floating-point array -- TODO confirm.
    noisy_signal = (signal + noise).astype(signal.dtype, copy=False)
    # The earlier, discarded `ld_corpus.recognize(signal, sr)` call on the clean
    # signal was removed: it ignored `corpus` and doubled the work per query.
    return *corpus.recognize(noisy_signal, sr), song



In [19]:
# Sanity check without noise: load a random song, compute its signal
# statistics, and display what the 30-folder corpus recognises for it.
song = select_random_song()
signal, sr = load_mp3(song)
mean = np.mean(signal)
std = np.std(signal)
ld_corpus.recognize(signal, sr)

(PosixPath('/home/mb/IdeaProjects/music-ir/data/fma_small/027/027258.mp3'),
 1.0253240880313537,
 0)

### Noise std = 0.1 × signal std

In [22]:
# Retrieval accuracy when Gaussian noise with std = noise_ratio * signal std
# is added to the query.
n_test = 500
wrong = 0
not_found = 0
noise_ratio = 0.1
for _ in tqdm(range(n_test)):
    # Pass noise_ratio explicitly: it used to be ignored here, and the result
    # was correct only because it matched the function's default.
    retrieved, *_, song = find_random_song_with_noise(std_ratio=noise_ratio)
    if retrieved is None:
        not_found += 1
        print(f"{song.name} not found")
        continue
    if song.name != retrieved.name:
        wrong += 1
        print(f"{song.name} incorrect retrieved")
print(f"Accuracy: {1 - (wrong + not_found) / n_test}")

  0%|          | 0/500 [00:00<?, ?it/s]

027797.mp3 not found
027797.mp3 not found
028274.mp3 not found
025215.mp3 not found
025032.mp3 not found
025216.mp3 not found
030196.mp3 not found
024746.mp3 not found
014570.mp3 not found
025232.mp3 not found
021058.mp3 not found
025033.mp3 not found
025215.mp3 not found
010382.mp3 not found
025232.mp3 not found
025227.mp3 not found
025029.mp3 not found
025233.mp3 not found
025033.mp3 not found
027797.mp3 not found
001197.mp3 not found
025234.mp3 not found
025232.mp3 not found
025232.mp3 not found
019422.mp3 not found
Accuracy: 0.95


In [23]:
# Retrieval accuracy with stronger noise: noise std is noise_ratio times the
# signal std.
n_test = 500
wrong = not_found = 0
noise_ratio = 0.5
for _ in tqdm(range(n_test)):
    retrieved, *_, song = find_random_song_with_noise(std_ratio=noise_ratio)
    if retrieved is None:
        # Nothing was returned for the noisy query.
        not_found += 1
        print(f"{song.name} not found")
    elif song.name != retrieved.name:
        # Something was returned, but it is a different file.
        wrong += 1
        print(f"{song.name} incorrect retrieved")
failures = wrong + not_found
print(f"Accuracy: {1 - failures / n_test}")

  0%|          | 0/500 [00:00<?, ?it/s]

024217.mp3 not found
015540.mp3 not found
010676.mp3 not found
027797.mp3 not found
010668.mp3 not found
025796.mp3 not found
017462.mp3 not found
002096.mp3 not found
007713.mp3 not found
005940.mp3 not found
026657.mp3 not found
008357.mp3 not found
002097.mp3 not found
005159.mp3 not found
008345.mp3 not found
026639.mp3 not found
006407.mp3 not found
028571.mp3 not found
009155.mp3 not found
025215.mp3 not found
002096.mp3 not found
008357.mp3 not found
026307.mp3 not found
007487.mp3 not found
003912.mp3 not found
016354.mp3 not found
019412.mp3 not found
004037.mp3 not found
023016.mp3 not found
002096.mp3 not found
002096.mp3 not found
001195.mp3 not found
010382.mp3 not found
007711.mp3 not found
024431.mp3 not found
001082.mp3 not found
029041.mp3 not found
024216.mp3 not found
024422.mp3 not found
015475.mp3 not found
010375.mp3 not found
028548.mp3 not found
023155.mp3 not found
025032.mp3 not found
022150.mp3 not found
014570.mp3 not found
017573.mp3 not found
030702.mp3 no