In [None]:
import glob
import os
import random
import shutil

src_dir = '/home/melan/supervised-vs-SSL/data/fma_data/fma_small'
dst_dir = '/home/melan/supervised-vs-SSL/data/fma_data/1900MB_subset'
target_size = 1900 * 1024 * 1024  # 1900 MB in bytes

# Recursively find all .mp3 files
all_files = glob.glob(os.path.join(src_dir, '**', '*.mp3'), recursive=True)
random.shuffle(all_files)

selected = []
total_size = 0
for f in all_files:
    size = os.path.getsize(f)
    if total_size + size > target_size:
        break
    selected.append(f)
    total_size += size

os.makedirs(dst_dir, exist_ok=True)
for f in selected:
    shutil.copy2(f, dst_dir)

print(f"Moved {len(selected)} files totaling {total_size / (1024*1024):.2f} MB to {dst_dir}")

In [None]:
import glob
import os
import random
import shutil
from pydub import AudioSegment

# Paths
speech_dir = '/home/melan/supervised-vs-SSL/data/speech_data/speechcommands'
music_dir = '/home/melan/supervised-vs-SSL/data/fma_data/1900MB_subset'
dst_dir = '/home/melan/supervised-vs-SSL/data/preprocessed/500_wav_subset'
os.makedirs(dst_dir, exist_ok=True)

# Collect all wav and mp3 files
speech_files = glob.glob(os.path.join(speech_dir, '**', '*.wav'), recursive=True)
music_files = glob.glob(os.path.join(music_dir, '**', '*.mp3'), recursive=True)

# Randomly select 250 from each
speech_selected = random.sample(speech_files, 250)
music_selected = random.sample(music_files, 250)

# Combine and shuffle
all_selected = speech_selected + music_selected
random.shuffle(all_selected)

# Copy speech wav files and convert music mp3 files to wav
for f in all_selected:
    base = os.path.splitext(os.path.basename(f))[0]
    dst_path = os.path.join(dst_dir, f"{base}.wav")
    if f.endswith('.wav'):
        shutil.copy2(f, dst_path)
    elif f.endswith('.mp3'):
        audio = AudioSegment.from_mp3(f)
        audio.export(dst_path, format="wav")

print(f"Copied and converted {len(all_selected)} files to {dst_dir}")

In [None]:
import glob
import os
from pydub import AudioSegment

src_dir = '/home/melan/supervised-vs-SSL/data/fma_data/1900MB_subset'
dst_dir = '/home/melan/supervised-vs-SSL/data/fma_data/1900MB_subset_wav'
os.makedirs(dst_dir, exist_ok=True)

mp3_files = glob.glob(os.path.join(src_dir, '*.mp3'))

for mp3_path in mp3_files:
    base = os.path.splitext(os.path.basename(mp3_path))[0]
    wav_path = os.path.join(dst_dir, f"{base}.wav")
    try:
        audio = AudioSegment.from_mp3(mp3_path)
        audio.export(wav_path, format="wav")
        print(f"Converted: {mp3_path} -> {wav_path}")
    except Exception as e:
        print(f"Error converting {mp3_path}: {e}")

print(f"Finished converting {len(mp3_files)} files to {dst_dir}")

In [4]:
import os
import glob
import numpy as np
import soundfile as sf
import random

speech_dir = '/home/melan/supervised-vs-SSL/data/speech_data/speechcommands'
output_speech_dir = '/home/melan/supervised-vs-SSL/data/preprocessed/10s_speech'
os.makedirs(output_speech_dir, exist_ok=True)

target_sr = 16000
target_len = 10 * target_sr  # 10 seconds

# Group speech files by word
speech_files = glob.glob(os.path.join(speech_dir, '**', '*.wav'), recursive=True)
word_to_files = {}
for f in speech_files:
    word = os.path.basename(f).split('_')[0]
    word_to_files.setdefault(word, []).append(f)

# Concatenate 10 files per word, save as one 10s file
speech_out_paths = []
for word, files in word_to_files.items():
    random.shuffle(files)
    for i in range(0, len(files) - 9, 10):
        group = files[i:i+10]
        audio_concat = []
        for f in group:
            y, sr = sf.read(f)
            if len(y.shape) > 1:
                y = np.mean(y, axis=1)
            if sr != target_sr:
                import librosa
                y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
            # Pad/truncate to 1s
            if len(y) < target_sr:
                y = np.pad(y, (0, target_sr - len(y)), mode='constant')
            elif len(y) > target_sr:
                y = y[:target_sr]
            audio_concat.append(y)
        y_out = np.concatenate(audio_concat)
        out_name = f"{word}_{i//10:04d}.wav"
        out_path = os.path.join(output_speech_dir, out_name)
        sf.write(out_path, y_out, target_sr)
        speech_out_paths.append((out_path, word))
print(f"Created {len(speech_out_paths)} 10s speech files.")

Created 6459 10s speech files.


In [5]:
music_dir = '/home/melan/supervised-vs-SSL/data/fma_data/1900MB_subset_wav'
output_music_dir = '/home/melan/supervised-vs-SSL/data/preprocessed/10s_music'
os.makedirs(output_music_dir, exist_ok=True)

music_files = glob.glob(os.path.join(music_dir, '*.wav'))
music_out_paths = []
for f in music_files:
    y, sr = sf.read(f)
    if len(y.shape) > 1:
        y = np.mean(y, axis=1)
    if sr != target_sr:
        import librosa
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    total_len = len(y)
    num_chunks = total_len // target_len
    for i in range(num_chunks):
        chunk = y[i*target_len:(i+1)*target_len]
        out_name = f"{os.path.splitext(os.path.basename(f))[0]}_{i:04d}.wav"
        out_path = os.path.join(output_music_dir, out_name)
        sf.write(out_path, chunk, target_sr)
        music_out_paths.append((out_path, 'music'))
print(f"Created {len(music_out_paths)} 10s music files.")

Created 4842 10s music files.


In [None]:
import csv

label_csv_path = '/home/melan/supervised-vs-SSL/data/preprocessed/10s_labels_mapping.csv'
with open(label_csv_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['file_path', 'label'])
    for path, label in speech_out_paths + music_out_paths:
        writer.writerow([path, label])
print(f"Saved label CSV to {label_csv_path}")

Saved label CSV to /home/melan/supervised-vs-SSL/data/preprocessed/10s_labels_mapping.csv


In [7]:
import os
import glob
import numpy as np
import soundfile as sf

music_dir = '/home/melan/supervised-vs-SSL/data/fma_data/1900MB_subset_wav'
output_music_dir = '/home/melan/supervised-vs-SSL/data/preprocessed/10s_music'
os.makedirs(output_music_dir, exist_ok=True)

target_sr = 16000
target_len = 10 * target_sr  # 10 seconds

music_files = glob.glob(os.path.join(music_dir, '*.wav'))
music_out_paths = []
for f in music_files:
    y, sr = sf.read(f)
    if len(y.shape) > 1:
        y = np.mean(y, axis=1)
    if sr != target_sr:
        import librosa
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    total_len = len(y)
    num_chunks = total_len // target_len
    for i in range(num_chunks):
        chunk = y[i*target_len:(i+1)*target_len]
        out_name = f"{os.path.splitext(os.path.basename(f))[0]}_{i:04d}.wav"
        out_path = os.path.join(output_music_dir, out_name)
        sf.write(out_path, chunk, target_sr)
        music_out_paths.append(out_path)
print(f"Created {len(music_out_paths)} 10s music files.")

KeyboardInterrupt: 

In [None]:
import pandas as pd

music_metadata_path = '/home/melan/supervised-vs-SSL/data/fma_data/fma_metadata/tracks.csv'
genre_map_path = '/home/melan/supervised-vs-SSL/data/fma_data/fma_metadata/raw_genres.csv'

music_metadata = pd.read_csv(music_metadata_path, index_col=0, header=[0, 1])
genre_map = pd.read_csv(genre_map_path, index_col='genre_id')['genre_title'].to_dict()

def get_genre_label(filename):
    # FMA track id is the first 6 digits of the filename
    base = os.path.basename(filename)
    track_id_str = base[:6]
    try:
        track_id = int(track_id_str)
    except ValueError:
        return 'unknown'
    if track_id in music_metadata.index:
        genres_str = music_metadata.loc[track_id, ('track', 'genres')]
        genre_ids = eval(genres_str) if pd.notnull(genres_str) else []
        genres = [genre_map.get(gid, 'unknown') for gid in genre_ids]
        return '|'.join(genres) if genres else 'unknown'
    else:
        return 'unknown'

music_labeled = [(path, get_genre_label(path)) for path in music_out_paths]

import csv

label_csv_path = '/home/melan/supervised-vs-SSL/data/preprocessed/10s_labels_mapping.csv'
with open(label_csv_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['file_path', 'label'])
    # speech_out_paths is [(path, word)]
    for path, label in speech_out_paths:
        writer.writerow([path, label])
    # music_labeled is [(path, genre_label)]
    for path, label in music_labeled:
        writer.writerow([path, label])
print(f"Saved label CSV to {label_csv_path}")

In [9]:
import csv

label_csv_path = '/home/melan/supervised-vs-SSL/data/preprocessed/10s_labels_mapping.csv'
with open(label_csv_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['file_path', 'label'])
    # speech_out_paths is [(path, word)]
    for path, label in speech_out_paths:
        writer.writerow([path, label])
    # music_labeled is [(path, genre_label)]
    for path, label in music_labeled:
        writer.writerow([path, label])
print(f"Saved label CSV to {label_csv_path}")

Saved label CSV to /home/melan/supervised-vs-SSL/data/preprocessed/10s_labels_mapping.csv


In [None]:
import os
import pandas as pd
import glob
import csv

# --- Paths ---
speech_dir = '/home/melan/supervised-vs-SSL/data/preprocessed/noisy_passt_speech'
music_dir = '/home/melan/supervised-vs-SSL/data/preprocessed/noisy_passt_music'
music_metadata_path = '/home/melan/supervised-vs-SSL/data/fma_data/fma_metadata/tracks.csv'
genre_map_path = '/home/melan/supervised-vs-SSL/data/fma_data/fma_metadata/raw_genres.csv'
output_csv_path = '/home/melan/supervised-vs-SSL/data/passt_labels_mapping.csv'  # <-- Change this to wherever you want

# --- Load metadata ---
music_metadata = pd.read_csv(music_metadata_path, index_col=0, header=[0, 1])
genre_map = pd.read_csv(genre_map_path, index_col='genre_id')['genre_title'].to_dict()

# --- Collect all .npy files ---
speech_files = glob.glob(os.path.join(speech_dir, '**', '*.npy'), recursive=True)
music_files = glob.glob(os.path.join(music_dir, '**', '*.npy'), recursive=True)
all_files = speech_files + music_files

# --- Write CSV ---
with open(output_csv_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['file_path', 'label'])  # header

    for idx, spec_path in enumerate(all_files):
        fname = os.path.basename(spec_path)
        if spec_path.startswith(speech_dir):
            label = fname.split('_')[0]
        elif spec_path.startswith(music_dir):
            track_id_str = fname[3:6]
            try:
                track_id = int(track_id_str)
            except ValueError:
                track_id = None
            if track_id is not None and track_id in music_metadata.index:
                genres_str = music_metadata.loc[track_id, ('track', 'genres')]
                genre_ids = eval(genres_str) if pd.notnull(genres_str) else []
                genres = [genre_map.get(gid, 'unknown') for gid in genre_ids]
            else:
                genres = []
            label = '|'.join(genres)  # join multiple genres as a string
        else:
            label = 'unknown'
        writer.writerow([spec_path, label])
        if idx < 5 or idx % 100 == 0:
            print(f"Processed {idx+1} files...", flush=True)

print(f"Labels CSV saved to: {output_csv_path}")

In [1]:
ulimit -n

NameError: name 'ulimit' is not defined

In [3]:
import os
import numpy as np
import pandas as pd

# Paths
label_csv = 'passt_labels_mapping.csv'  # Your CSV with file_path and label
# If you want to use both folders directly:
speech_folder = '/noisy_passt_speech/'
music_folder = '/noisy_passt_music/'

# Load label CSV
df = pd.read_csv(label_csv)

# Prepare lists for features and labels
X = []
y = []

for idx, row in df.iterrows():
    npy_path = row['file_path']
    label = row['label']
    if not os.path.exists(npy_path):
        print(f"Missing file: {npy_path}")
        continue
    spec = np.load(npy_path)
    X.append(spec.flatten())  # Flatten for classifier; for PaSST you may want to keep original shape
    y.append(label)
    if idx < 5:
        print(f"Loaded {npy_path}, label: {label}")

X = np.array(X)
y = np.array(y)
print(f"Loaded {len(X)} samples.")

# Now you can train a classifier, or pass X/y to your fine-tuning script

ValueError: Cannot load file containing pickled data when allow_pickle=False

In [None]:
import os
import numpy as np

features_dir = '/home/melan/supervised-vs-SSL/data/results/byola/final_embeddings'
embedding_files = sorted([f for f in os.listdir(features_dir) if f.endswith('.npy')])

X = []
file_basenames = []
for i, emb_file in enumerate(embedding_files):
    emb_path = os.path.join(features_dir, emb_file)
    try:
        arr = np.load(emb_path)
        print(f"[{i+1}/{len(embedding_files)}] Loaded {emb_file}, shape: {arr.shape}")
        X.append(arr.flatten())
        file_basenames.append(os.path.splitext(emb_file)[0])
    except Exception as e:
        print(f"Error loading {emb_file}: {e}")

print("Finished loading all files. Now converting to numpy array...")
try:
    X = np.array(X)
    print(f"X shape: {X.shape}")
except Exception as e:
    print(f"Error converting X to numpy array: {e}")

Trying to load 000140_0000_bus01_12.wav.npy (1/55563)
Loaded 000140_0000_bus01_12.wav.npy, shape: (1, 24, 3072)
Trying to load 000140_0001_bus02_12.wav.npy (2/55563)
Loaded 000140_0001_bus02_12.wav.npy, shape: (1, 24, 3072)
Trying to load 000140_0002_bus03_12.wav.npy (3/55563)
Loaded 000140_0002_bus03_12.wav.npy, shape: (1, 24, 3072)
Trying to load 000140_0003_bus04_12.wav.npy (4/55563)
Loaded 000140_0003_bus04_12.wav.npy, shape: (1, 24, 3072)
Trying to load 000140_0004_bus05_12.wav.npy (5/55563)
Loaded 000140_0004_bus05_12.wav.npy, shape: (1, 24, 3072)
Trying to load 000140_0005_bus06_12.wav.npy (6/55563)
Loaded 000140_0005_bus06_12.wav.npy, shape: (1, 24, 3072)
Trying to load 000140_0006_bus07_12.wav.npy (7/55563)
Loaded 000140_0006_bus07_12.wav.npy, shape: (1, 24, 3072)
Trying to load 000140_0007_bus08_12.wav.npy (8/55563)
Loaded 000140_0007_bus08_12.wav.npy, shape: (1, 24, 3072)
Trying to load 000140_0008_bus09_12.wav.npy (9/55563)
Loaded 000140_0008_bus09_12.wav.npy, shape: (1, 24

KeyboardInterrupt: 

In [6]:
import os
import numpy as np

features_dir = '/home/melan/supervised-vs-SSL/data/results/byola/final_embeddings'
embedding_files = sorted([f for f in os.listdir(features_dir) if f.endswith('.npy')])

batch_size = 1000
num_batches = len(embedding_files) // batch_size + 1

for batch_idx in range(num_batches):
    batch_files = embedding_files[batch_idx*batch_size : (batch_idx+1)*batch_size]
    X_batch = []
    file_basenames_batch = []
    for emb_file in batch_files:
        emb_path = os.path.join(features_dir, emb_file)
        try:
            arr = np.load(emb_path)
            X_batch.append(arr.flatten())
            file_basenames_batch.append(os.path.splitext(emb_file)[0])
        except Exception as e:
            print(f"Error loading {emb_file}: {e}")
    X_batch = np.array(X_batch)
    np.save(f'batch_{batch_idx}.npy', X_batch)
    print(f"Saved batch {batch_idx+1}/{num_batches}, batch size: {len(X_batch)}")

Saved batch 1/69, batch size: 1000
Saved batch 2/69, batch size: 1000
Saved batch 3/69, batch size: 1000
Saved batch 4/69, batch size: 1000
Saved batch 5/69, batch size: 1000
Saved batch 6/69, batch size: 1000
Saved batch 7/69, batch size: 1000
Saved batch 8/69, batch size: 1000
Saved batch 9/69, batch size: 1000
Saved batch 10/69, batch size: 1000
Saved batch 11/69, batch size: 1000
Saved batch 12/69, batch size: 1000
Saved batch 13/69, batch size: 1000
Saved batch 14/69, batch size: 1000
Saved batch 15/69, batch size: 1000
Saved batch 16/69, batch size: 1000
Saved batch 17/69, batch size: 1000
Saved batch 18/69, batch size: 1000
Saved batch 19/69, batch size: 1000
Saved batch 20/69, batch size: 1000
Saved batch 21/69, batch size: 1000
Saved batch 22/69, batch size: 1000
Saved batch 23/69, batch size: 1000
Saved batch 24/69, batch size: 1000
Saved batch 25/69, batch size: 1000
Saved batch 26/69, batch size: 1000
Saved batch 27/69, batch size: 1000
Saved batch 28/69, batch size: 1000
S

In [None]:
import os
import numpy as np
import sys
sys.path.append('/home/melan/supervised-vs-SSL/data/results/byola')

features_dir = '/final_embeddings_batches'
embedding_files = sorted([f for f in os.listdir(features_dir) if f.startswith('batch_') and f.endswith('.npy') and not f.endswith('_basenames.npy')])

for batch_file in embedding_files:
    batch_path = os.path.join(features_dir, batch_file)
    # Find all .npy files that went into this batch
    # If you have a record of which files went into each batch, use that.
    # Otherwise, try to reconstruct basenames from the batch file itself:
    # If you saved batches from a list of files, you should have saved basenames at batch creation.
    # If not, you need to reconstruct them. Here's a placeholder for how you might do it:
    # For example, if you have a list of original files per batch:
    # file_basenames_batch = [...]  # List of basenames for this batch
    # np.save(os.path.join(features_dir, batch_file.replace('.npy', '_basenames.npy')), file_basenames_batch)
    print(f"TODO: Save basenames for {batch_file} as {batch_file.replace('.npy', '_basenames.npy')}")

FileNotFoundError: [Errno 2] No such file or directory: '/home/melan/supervised-vs-SSL/data/results/byola/final_embeddings_batches'