# 2

In [59]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
import torchvision.transforms as transforms
from datasets import load_dataset, Audio
import numpy as np

# For reproducibility
torch.manual_seed(42)

class AudioCNN(nn.Module):
    def __init__(self):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 32 * 32, 128)  # Adjust this based on input size
        self.fc2 = nn.Linear(128, 3)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        x = x.view(-1, 64 * 32 * 32)  # Flatten the tensor
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [60]:
sampling_rate = 16000
# Load the dataset
dataset = load_dataset("audiofolder", data_dir="padded")

dataset = dataset.cast_column('audio', Audio(sampling_rate=sampling_rate))


Resolving data files:   0%|          | 0/96 [00:00<?, ?it/s]

Found cached dataset audiofolder (/home/budos/.cache/huggingface/datasets/audiofolder/default-44ba5f79a7b0ac12/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)


  0%|          | 0/1 [00:00<?, ?it/s]

In [47]:
dataset['train']['audio'][0]['array']

array([0.00048205, 0.00092937, 0.00092832, ..., 0.        , 0.        ,
       0.        ])

In [61]:

# Example transformation: Convert waveform to spectrogram
def transform_audio(example):
    waveform = torch.tensor(example['audio']['array'], dtype=torch.float32)
    spectrogram = torchaudio.transforms.MelSpectrogram()(waveform)
    return {'spectrogram': spectrogram, 'label': example['label']}


In [62]:

# Apply the transformation
dataset = dataset.map(transform_audio)

# # Convert dataset to PyTorch tensors
# dataset.set_format(type='torch', columns=['spectrogram', 'label'])


Loading cached processed dataset at /home/budos/.cache/huggingface/datasets/audiofolder/default-44ba5f79a7b0ac12/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc/cache-45f66722699d5d5b.arrow


In [52]:
dataset['train']['audio'][0]

{'bytes': b'RIFF$\xe2\x04\x00WAVEfmt \x10\x00\x00\x00\x01\x00\x01\x00\x80>\x00\x00\x00}\x00\x00\x02\x00\x10\x00data\x00\xe2\x04\x00\x0f\x00\x1e\x00\x1e\x00\x1f\x00\x1d\x00&\x00\'\x00,\x00-\x00.\x002\x007\x00<\x00=\x00:\x00.\x00$\x00$\x00!\x00!\x00(\x00(\x00$\x00\x16\x00\x0e\x00\x0f\x00\x0e\x00\x07\x00\x05\x00\x05\x00\t\x00\x0e\x00\x0e\x00\x0e\x00\x10\x00\x0f\x00\t\x00\x0c\x00\x0e\x00\r\x00\x08\x00\x06\x00\x06\x00\x05\x00\x07\x00\n\x00\x07\x00\x06\x00\x08\x00\x0b\x00\t\x00\x07\x00\x05\x00\x00\x00\xf8\xff\xf1\xff\xf1\xff\xf3\xff\xf5\xff\xf9\xff\xfc\xff\xfe\xff\x00\x00\x04\x00\r\x00\x17\x00\x1a\x00\x14\x00\x10\x00\t\x00\t\x00\x08\x00\x06\x00\x0b\x00\x0c\x00\x10\x00\x14\x00\x15\x00\x18\x00\x14\x00\x15\x00 \x00\'\x00)\x00+\x00*\x00+\x00!\x00\x0f\x00\r\x00\r\x00\t\x00\x01\x00\xf8\xff\xf1\xff\xed\xff\xec\xff\xef\xff\xed\xff\xe8\xff\xe0\xff\xd9\xff\xd8\xff\xd0\xff\xca\xff\xcb\xff\xd1\xff\xd4\xff\xd2\xff\xd4\xff\xd2\xff\xcc\xff\xca\xff\xcd\xff\xd2\xff\xd8\xff\xe1\xff\xe2\xff\xe6\xff\xe6\xff\xe7

In [63]:

# Split dataset
train_size = int(0.8 * len(dataset['train']))
val_size = len(dataset['train']) - train_size
train_dataset, val_dataset = random_split(dataset['train'], [train_size, val_size])

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [22]:
train_dataset[32]

{'label': tensor(1),
 'spectrogram': tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [3.4594e-04, 1.2453e-04, 1.2268e-04,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [1.8626e-03, 6.7050e-04, 6.6054e-04,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         ...,
         [1.6070e-06, 7.5460e-07, 7.5117e-07,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [2.2947e-07, 1.9456e-07, 7.3602e-08,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00],
         [2.0491e-08, 1.0201e-08, 7.0944e-09,  ..., 0.0000e+00, 0.0000e+00,
          0.0000e+00]])}

In [55]:
type(train_loader)

torch.utils.data.dataloader.DataLoader

In [57]:
for inputs, labels in train_loader:
    type(inputs)

In [64]:

# Initialize model, loss function, and optimizer
model = AudioCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Ensure inputs are in the correct shape and dtype
        inputs = inputs.unsqueeze(1)  # Add channel dimension
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in val_loader:
        # Ensure inputs are in the correct shape and dtype
        inputs = inputs.unsqueeze(1)  # Add channel dimension
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy: {100 * correct / total}%')


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

In [58]:
# Initialize model, loss function, and optimizer
model = AudioCNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

# Evaluation
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy: {100 * correct / total}%')


TypeError: conv2d() received an invalid combination of arguments - got (str, Parameter, Parameter, tuple, tuple, tuple, int), but expected one of:
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, tuple of ints padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (!str!, !Parameter!, !Parameter!, !tuple of (int, int)!, !tuple of (int, int)!, !tuple of (int, int)!, int)
 * (Tensor input, Tensor weight, Tensor bias, tuple of ints stride, str padding, tuple of ints dilation, int groups)
      didn't match because some of the arguments have invalid types: (!str!, !Parameter!, !Parameter!, !tuple of (int, int)!, !tuple of (int, int)!, !tuple of (int, int)!, int)
