In [6]:
# Imports and Setup
import os
import numpy as np
import torch
import torchaudio
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm
from dotenv import load_dotenv

# Read environment variables through python-dotenv (if you have any)
load_dotenv()

# Select the compute device: CUDA when torch reports it as available, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Reproducibility: seed the RNGs used in this notebook so that a
# Restart & Run All gives repeatable shuffling, weight init and dropout.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Constants for audio processing
SAMPLE_RATE = 16000  # target sample rate in Hz; every file is resampled to this
N_MELS = 128  # number of mel bands (also the model's input channel count)
WINDOW_SIZE = 25  # in milliseconds
HOP_SIZE = 10  # in milliseconds
CHUNK_SIZE = 512  # reduced from 1024 to better fit RTX 2070 memory

# Hyperparameters
BATCH_SIZE = 8  # reduced from 16 to fit in GPU memory
LEARNING_RATE = 1e-4
NUM_EPOCHS = 2
D_MODEL = 128  # reduced from 256
NHEAD = 4  # reduced from 8
NUM_LAYERS = 2  # reduced from 4
DIM_FEEDFORWARD = 256  # reduced from 512

# Easy way to change hyperparameters: later cells read from this dict,
# so override values here rather than editing the cells below.
HYPERPARAMS = {
    'batch_size': BATCH_SIZE,
    'learning_rate': LEARNING_RATE,
    'num_epochs': NUM_EPOCHS,
    'd_model': D_MODEL,
    'nhead': NHEAD,
    'num_layers': NUM_LAYERS,
    'dim_feedforward': DIM_FEEDFORWARD
}

Using device: cuda


In [7]:
# Audio preprocessing function
def preprocess_audio(file_path):
    """Load an audio file and turn it into a log-mel spectrogram.

    The waveform is resampled to ``SAMPLE_RATE``, averaged over its first
    dimension to obtain a single (mono) channel, converted to a mel
    spectrogram with ``N_MELS`` bands using the ``WINDOW_SIZE`` /
    ``HOP_SIZE`` settings (given in milliseconds), and finally mapped to
    decibels.

    Args:
        file_path: Path of an audio file that ``torchaudio.load`` can read.

    Returns:
        numpy.ndarray: The log-mel spectrogram with the leading mono-channel
        axis removed (expected layout: mel bands x frames).
    """
    waveform, original_sample_rate = torchaudio.load(file_path)

    resample = torchaudio.transforms.Resample(orig_freq=original_sample_rate, new_freq=SAMPLE_RATE)
    waveform = resample(waveform)
    waveform = torch.mean(waveform, dim=0, keepdim=True)  # Convert to mono

    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_mels=N_MELS,
        # Window and hop are configured in milliseconds; convert to samples.
        win_length=int(WINDOW_SIZE / 1000 * SAMPLE_RATE),
        hop_length=int(HOP_SIZE / 1000 * SAMPLE_RATE)
    )
    log_mel_spec = torchaudio.transforms.AmplitudeToDB()(to_mel(waveform))

    # Drop only the mono-channel axis. A bare squeeze() would also remove the
    # frame axis for a clip so short that it produces a single frame, handing
    # a 1-D array to downstream code that expects two dimensions.
    return log_mel_spec.squeeze(0).numpy()

# Dataset class for loading audio files
class AudioDataset(Dataset):
    """Dataset over a list of audio file paths.

    Files are not read up front; each access runs ``preprocess_audio`` on the
    requested path and returns its result.
    """

    def __init__(self, file_paths):
        # Only the paths are stored; preprocessing happens in __getitem__.
        self.file_paths = file_paths

    def __len__(self):
        """Return how many file paths the dataset holds."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Preprocess and return the audio file stored at position ``idx``."""
        return preprocess_audio(self.file_paths[idx])

# Custom collate function to handle variable length audio chunks
def collate_fn(batch):
    """Stack spectrograms of different lengths into one zero-padded batch.

    ``pad_sequence`` pads along the first dimension of each item and requires
    every trailing dimension to match. The spectrograms are laid out as
    ``(n_mels, time)``, so padding them directly would target the mel axis
    and fail as soon as two clips differ in length. Each 2-D item is
    therefore flipped to ``(time, n_mels)`` for padding and flipped back
    afterwards.

    Args:
        batch: Sequence of array-likes, one per sample. 2-D items are treated
            as ``(n_mels, time)``.

    Returns:
        torch.Tensor: Float tensor of shape ``(N, n_mels, T_max)`` for 2-D
        items, zero-padded at the end of the time axis. If any item is not
        2-D, the items are padded along their first dimension instead (the
        previous behaviour).
    """
    tensors = [torch.tensor(item).float() for item in batch]

    if any(t.dim() != 2 for t in tensors):
        # Not (n_mels, time) spectrograms: keep the original padding behaviour.
        return nn.utils.rnn.pad_sequence(tensors, batch_first=True, padding_value=0)

    time_first = [t.transpose(0, 1) for t in tensors]  # (time, n_mels)
    padded = nn.utils.rnn.pad_sequence(time_first, batch_first=True, padding_value=0)
    # (N, T_max, n_mels) -> (N, n_mels, T_max), the layout Conv1d consumes.
    return padded.transpose(1, 2).contiguous()

# Load data: every entry in ./previews whose name ends in .mp3
file_paths = [
    os.path.join('./previews', name)
    for name in os.listdir('./previews')
    if name.endswith('.mp3')
]
dataset = AudioDataset(file_paths)
dataloader = DataLoader(
    dataset,
    batch_size=HYPERPARAMS['batch_size'],
    shuffle=True,
    collate_fn=collate_fn,
)

# Print dataset info
print(f"Total number of audio files: {len(dataset)}")
print(f"Number of batches: {len(dataloader)}")

Total number of audio files: 651
Number of batches: 82


In [8]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    A table of shape ``(max_len, 1, d_model)`` is precomputed once and stored
    as a buffer, so it moves with the module across devices and is saved in
    the state dict without being a trainable parameter. Even feature indices
    hold sines and odd indices hold cosines.

    Args:
        d_model: Size of the feature dimension. Odd values are supported.
        max_len: Number of positions to precompute. Inputs longer than this
            cannot be broadcast against the table in ``forward``.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns; slicing keeps the assignment shapes aligned. For even
        # d_model the slice covers every column.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (max_len, d_model) -> (max_len, 1, d_model): broadcasts over batch.
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions.

        Args:
            x: Tensor whose first dimension is the sequence position and whose
                last dimension has size ``d_model``.
        """
        return x + self.pe[:x.size(0), :]

# Model architecture
class AudioEmbeddingModel(nn.Module):
    """Convolution + Transformer encoder producing one embedding per clip.

    Two 1-D convolutions project the mel bands to ``d_model`` channels (the
    second one uses stride 2 and so shortens the time axis), positional
    encodings are added, a Transformer encoder processes the sequence, the
    time axis is averaged away, and a linear layer maps the result to the
    embedding size.

    Args:
        n_mels: Number of input channels (mel bands).
        d_model: Channel width used by the convolutions and the Transformer.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden size of each encoder layer's feed-forward part.
        max_len: Number of positions precomputed by the positional encoding.
        embedding_dim: Size of the returned embedding. Defaults to 768, the
            value that was previously hard-coded.
        dropout: Dropout probability inside the encoder layers. Defaults to
            0.1, the value that was previously hard-coded.
    """

    def __init__(self, n_mels, d_model, nhead, num_layers, dim_feedforward, max_len=5000,
                 embedding_dim=768, dropout=0.1):
        super().__init__()
        self.conv1 = nn.Conv1d(n_mels, d_model, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(d_model, d_model, kernel_size=3, stride=2, padding=1)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, dim_feedforward, dropout=dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(d_model, embedding_dim)  # Output embedding dimension

    def forward(self, x):
        """Embed a batch of spectrograms.

        Args:
            x: Tensor of shape ``(N, n_mels, L)``.

        Returns:
            Tensor of shape ``(N, embedding_dim)``.
        """
        x = F.gelu(self.conv1(x))
        x = F.gelu(self.conv2(x))  # stride 2 shortens the time axis
        x = x.permute(2, 0, 1)  # (N, C, L) -> (L, N, C)
        x = self.pos_encoder(x)
        x = self.transformer_encoder(x)
        # NOTE(review): no padding mask is passed to the encoder and the mean
        # runs over every frame, so padded frames in a batch contribute to the
        # embedding - confirm whether that is acceptable for padded batches.
        x = x.mean(dim=0)  # Global average pooling
        return self.fc(x)

# Build the model from the shared hyperparameters, then move it to the selected device
model = AudioEmbeddingModel(
    N_MELS,
    HYPERPARAMS['d_model'],
    HYPERPARAMS['nhead'],
    HYPERPARAMS['num_layers'],
    HYPERPARAMS['dim_feedforward'],
)
model = model.to(device)

# Print model summary: architecture followed by the trainable-parameter count
print(model)
print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

AudioEmbeddingModel(
  (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(2,), padding=(1,))
  (pos_encoder): PositionalEncoding()
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=256, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=256, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=768, bias=True)
)
Number of param



In [9]:
# Loss function: cosine embedding loss with its default settings spelled out
criterion = nn.CosineEmbeddingLoss(margin=0.0, reduction='mean')
# Optimizer: Adam over all model parameters at the configured learning rate
optimizer = optim.Adam(params=model.parameters(), lr=HYPERPARAMS['learning_rate'])

# Training loop
def train_model(model, dataloader, criterion, optimizer, num_epochs):
    """Train ``model`` by pairing the two halves of every batch.

    Each batch is embedded and split into a first and a second half; row
    ``i`` of the first half is paired with row ``i`` of the second half and
    every pair is labelled as positive (target ``1``) for ``criterion``.

    NOTE(review): the paired rows are different dataset items and no negative
    pairs are ever produced, so the loss can be driven towards zero by
    mapping all inputs to the same direction. Confirm this is the intended
    training signal.

    Args:
        model: Module mapping a batch tensor to ``(N, embedding_dim)``.
        dataloader: Iterable yielding batch tensors.
        criterion: Callable taking ``(emb1, emb2, target)`` and returning a
            scalar loss.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over ``dataloader``.

    Returns:
        The same ``model`` instance, left in training mode.
    """
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        num_steps = 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            # Split batch into two halves for contrastive learning
            half = batch.size(0) // 2
            if half == 0:
                # A single-sample batch cannot form a pair; skip it instead of
                # computing a loss over zero pairs.
                continue

            batch = batch.to(device)
            optimizer.zero_grad()
            embeddings = model(batch)

            # Take exactly `half` rows on each side. With an odd batch size the
            # last sample stays unpaired rather than making the halves unequal.
            emb1, emb2 = embeddings[:half], embeddings[half:2 * half]

            # Compute loss
            target = torch.ones(half, device=device)  # Positive pairs
            loss = criterion(emb1, emb2, target)

            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            num_steps += 1

        # Average over the steps actually taken; guards against an empty loader.
        print(f"Epoch {epoch+1}, Loss: {epoch_loss / max(num_steps, 1):.4f}")

    return model

# Train the model for the configured number of epochs
trained_model = train_model(
    model=model,
    dataloader=dataloader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=HYPERPARAMS['num_epochs'],
)

# Save the trained model: only the state dict is written, not the module object
torch.save(trained_model.state_dict(), 'audio_embedding_model.pth')
print("Model saved successfully.")


Epoch 1/10:   0%|          | 0/82 [00:00<?, ?it/s]

Epoch 1, Loss: 0.0055


Epoch 2/10:   0%|          | 0/82 [00:00<?, ?it/s]

Epoch 2, Loss: 0.0004


Epoch 3/10:   0%|          | 0/82 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def generate_embedding(model, audio_path):
    """Compute the embedding of a single audio file.

    The file is preprocessed with ``preprocess_audio``, given a leading batch
    axis of size one, moved to ``device`` and passed through ``model`` with
    gradients disabled. The model is switched to evaluation mode for the
    forward pass and returned to whatever mode it was in before the call, so
    calling this in the middle of training does not silently disable dropout
    for the remaining steps.

    Args:
        model: Module mapping a ``(1, ...)`` batch to an embedding tensor.
        audio_path: Path of the audio file to embed.

    Returns:
        numpy.ndarray: The model output for the one-item batch, copied to
        the CPU.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            log_mel_spec = preprocess_audio(audio_path)
            log_mel_spec = torch.tensor(log_mel_spec).unsqueeze(0).float().to(device)
            embedding = model(log_mel_spec)
    finally:
        # Restore the caller's train/eval mode even if the forward pass fails.
        model.train(was_training)
    return embedding.cpu().numpy()

# Example usage:
# new_audio_path = "path/to/new/audio.mp3"
# embedding = generate_embedding(trained_model, new_audio_path)
# print("Generated embedding shape:", embedding.shape)