In [1]:
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import os
from pydub import AudioSegment

import matplotlib.pyplot as plt
# Report which audio I/O backend torchaudio is currently using; as the cell's
# last expression, the returned string is rendered as the cell output.
str(torchaudio.get_audio_backend())

'soundfile'

In [2]:
# Choose the compute device once, up front: GPU 0 when CUDA is usable,
# otherwise fall back to the CPU. Later cells move tensors onto `device`.
if not torch.cuda.is_available():
    device = torch.device('cpu')
    print("CUDA is not available. Using CPU.")
else:
    # Make GPU index 0 the current CUDA device before creating the handle.
    torch.cuda.set_device(0)
    device = torch.device('cuda')
    print("Using GPU:", torch.cuda.get_device_name(0))

Using GPU: NVIDIA GeForce RTX 4070 Ti


In [3]:
# # Disabled one-off utility: convert every MP3 in genre_folder to a WAV file named <i>.wav, numbering from 10
# genre_folder = r""
# i = 10
#
# for song in os.listdir(genre_folder):
#     song_path = os.path.join(genre_folder, song)
#     audio = AudioSegment.from_mp3(song_path)
#     wav_file_path = os.path.join(genre_folder, str(i) + ".wav")
#     audio.export(wav_file_path, format="wav")
#     i += 1

In [5]:
# Build the dataset from ./assets/<genre>/*.wav:
#   * take the left channel of every file,
#   * cut it into non-overlapping segments of `segment_seconds` seconds,
#   * keep one sample out of every `downsample_size` within each segment.
# Results:
#   X  - (num_segments, input_size) float tensor, min-max scaled with the
#        global minimum / maximum over all segments
#   Y  - (num_segments, num_genres) one-hot float tensor
#   Y_ - plain list with the integer genre index of every segment
genre_to_i = {}
i_to_genre = {}
downsample_size = 2205  # keep one sample out of every 2205, i.e. 20 samples per second at 44.1 kHz
sample_rate = 44100  # NOTE(review): every file is assumed to use this rate; the rate returned by torchaudio.load is not checked
segment_seconds = 5
input_size = sample_rate // downsample_size * segment_seconds
# target_size = sample_rate // downsample_size
segment_len = sample_rate * segment_seconds  # raw samples per segment
X = []
Y_ = []

num_genres = 0
# sorted() keeps the genre -> index mapping (and the row order of X) the same
# on every run; os.listdir returns entries in arbitrary order.
for genre in sorted(os.listdir("./assets")):
    genre_dir = os.path.join("./assets", genre)
    if not os.path.isdir(genre_dir):
        # Stray files next to the genre folders are not genres: listing them
        # would raise, and counting them would add an empty class to Y.
        continue
    genre_to_i[genre] = num_genres
    i_to_genre[num_genres] = genre

    for song in sorted(os.listdir(genre_dir)):
        if song.endswith(".wav"):
            waveform = torchaudio.load(os.path.join(genre_dir, song))[0][0]  # get the left channel waveform
            # "+ 1" keeps the final segment when the file length is an exact
            # multiple of segment_len; a trailing partial segment is still dropped.
            for i in range(0, len(waveform) - segment_len + 1, segment_len):
                segment = waveform[i:i + segment_len]
                # Downsample on the tensor (no 220,500-element Python list) and
                # clone so the full waveform can be freed after this file.
                X.append(segment[::downsample_size].clone())
                Y_.append(num_genres)
    num_genres += 1

if not X:
    raise RuntimeError(
        "No audio segments were produced: expected ./assets/<genre>/*.wav files "
        "at least %d seconds long." % segment_seconds
    )

# One-hot encode the integer labels in a single vectorized call.
Y = F.one_hot(torch.tensor(Y_, dtype=torch.long), num_classes=num_genres).float()
X = torch.stack(X)

min_value = torch.min(X)
max_value = torch.max(X)

# Min-Max normalization
X = (X - min_value) / (max_value - min_value)

X = X.to(device)
Y = Y.to(device)

print("X shape:", X.shape)
print("Y shape:", Y.shape)

X shape: torch.Size([1620, 100])
Y shape: torch.Size([1620, 3])


In [6]:
# Spot-check a single row of the normalized feature matrix (segment index 150).
X[150]

tensor([0.4025, 0.1984, 0.7995, 0.6840, 0.1680, 0.3593, 0.7504, 0.6914, 0.3243,
        0.3810, 0.6560, 0.5887, 0.0990, 0.2997, 0.6490, 0.5467, 0.3599, 0.3265,
        0.6706, 0.4675, 0.1607, 0.0752, 0.7577, 0.8506, 0.2810, 0.3931, 0.7567,
        0.5117, 0.0972, 0.5995, 0.8241, 0.5106, 0.1247, 0.1384, 0.1971, 0.6540,
        0.6284, 0.2963, 0.5506, 0.7104, 0.6693, 0.0976, 0.4305, 0.9094, 0.9499,
        0.7188, 0.6255, 0.3276, 0.5101, 0.9013, 0.2999, 0.3214, 0.8105, 0.7192,
        0.0977, 0.5321, 0.6833, 0.4536, 0.1146, 0.8937, 0.5682, 0.3498, 0.1145,
        0.6464, 0.8816, 0.2583, 0.2930, 0.5750, 0.7405, 0.1999, 0.4391, 0.6253,
        0.5212, 0.4453, 0.4285, 0.6184, 0.3595, 0.4540, 0.3355, 0.7640, 0.2221,
        0.0895, 0.7133, 0.7298, 0.1614, 0.4300, 0.1336, 0.4785, 0.8461, 0.7617,
        0.0394, 0.5169, 0.7287, 0.2033, 0.2666, 0.6567, 0.7365, 0.2060, 0.6162,
        0.8513], device='cuda:0')

In [7]:
# split the dataset
from torch.utils.data import random_split

# X and Y are the feature and one-hot label tensors built in the data-loading
# cell; both have one row per audio segment.
# NOTE(review): the split is done per segment, so segments cut from the same
# song can end up in different splits — confirm this is acceptable for evaluation.

# Define the percentages for train, dev, and test sets
train_percent = 0.8
dev_percent = 0.1
test_percent = 0.1  # informational only: the test split receives whatever remains after train and dev

# Calculate the number of samples for each split
num_samples = X.shape[0]
num_train_samples = int(num_samples * train_percent)
num_dev_samples = int(num_samples * dev_percent)
num_test_samples = num_samples - num_train_samples - num_dev_samples

# Create a dataset from X and Y
dataset = torch.utils.data.TensorDataset(X, Y)

# Split the dataset into train, dev, and test sets.
# A dedicated, seeded generator makes the partition identical on every
# "Restart & Run All" instead of depending on the global RNG state.
split_seed = 0
train_dataset, dev_dataset, test_dataset = random_split(
    dataset,
    [num_train_samples, num_dev_samples, num_test_samples],
    generator=torch.Generator().manual_seed(split_seed),
)

# Create data loaders for each split
batch_size = 32  # Set an appropriate batch size for training
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

# Check the number of samples in each split
print("Number of samples in Xtr, Ytr:", len(train_dataset))
print("Number of samples in Xdev, Ydev:", len(dev_dataset))
print("Number of samples in Xte, Yte:", len(test_dataset))

def extract_input_target(dataset):
    """Stack the inputs and targets of a dataset into two tensors.

    Args:
        dataset: iterable of samples where ``sample[0]`` is the input tensor
            and ``sample[1]`` is the target tensor. It is traversed exactly
            once, so one-shot iterables are supported and each sample is
            fetched only a single time.

    Returns:
        Tuple ``(inputs, targets)``; each is the per-sample tensors stacked
        along a new leading dimension, in iteration order.

    Raises:
        RuntimeError: propagated from ``torch.stack`` when the dataset
            yields no samples.
    """
    inputs, targets = [], []
    for sample in dataset:
        inputs.append(sample[0])
        targets.append(sample[1])
    return torch.stack(inputs), torch.stack(targets)

# Materialize each split as a pair of stacked tensors:
#   train -> (Xtr, Ytr), dev/validation -> (Xdev, Ydev), test -> (Xte, Yte)
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = (
    extract_input_target(split)
    for split in (train_dataset, dev_dataset, test_dataset)
)

Number of samples in Xtr, Ytr: 1296
Number of samples in Xdev, Ydev: 162
Number of samples in Xte, Yte: 162


In [8]:
# Implementation of Transformer
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (self.head_dim * heads == embed_size), "Embed size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(self.heads * self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = values.reshape(N, value_len, self.heads, self.head_dim)