<a href="https://colab.research.google.com/github/masa512/audio_ML/blob/main/basic_audio_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Audio Deep Learning Basics

Reference: https://pytorch.org/tutorials/intermediate/speech_command_classification_with_torchaudio_tutorial.html

The architecture is described in:
https://arxiv.org/pdf/1610.00087.pdf

## Dataloader generation

In [None]:
import torchaudio 
import torch
import os
import matplotlib.pyplot as plt
import torch.nn as nn

In [None]:
# Pick the compute device: use CUDA when it is available, otherwise stay on the CPU
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

In [None]:
# Fetch the Speech Commands dataset into the current directory (download=True),
# then split it at random into 80% training and 20% test examples
SC_data = torchaudio.datasets.SPEECHCOMMANDS('.', download=True)

# Split sizes: the test split takes whatever remains after rounding the train size down
N_total = len(SC_data)
N_train = int(N_total*0.8)
N_test = N_total-N_train

train_data, test_data = torch.utils.data.random_split(SC_data, lengths=[N_train, N_test])


In [None]:
# Now the train, test dataloaders

def pad_collate(batch):
  """Collate dataset samples whose waveforms may differ in length.

  The default collation stacks the waveforms directly, which raises as soon as
  a batch contains clips of different lengths. Here every waveform is
  zero-padded at the end to the longest clip in the batch before stacking.

  Args:
    batch: list of samples, each laid out as
      (waveform, sample_rate, label, speaker_id, utterance_number).
      Assumes each waveform is a 2-D (channel, time) tensor - TODO confirm.

  Returns:
    A list [waveforms, sample_rates, labels, speaker_ids, utterance_numbers]
    where waveforms is a (batch, channel, longest_time) tensor, the two numeric
    fields are 1-D tensors and the two string fields are tuples.
  """
  waveforms, sample_rates, labels, speaker_ids, utterance_numbers = zip(*batch)

  # pad_sequence pads along the first dimension, so put time first, pad, then
  # move the channel dimension back in front of time.
  padded = torch.nn.utils.rnn.pad_sequence(
      [waveform.t() for waveform in waveforms],
      batch_first=True,
      padding_value=0.0)
  padded = padded.permute(0, 2, 1)

  return [padded,
          torch.tensor(sample_rates),
          labels,
          speaker_ids,
          torch.tensor(utterance_numbers)]


# Training batches are reshuffled every epoch
train_loader = torch.utils.data.DataLoader(dataset = train_data,
                                           batch_size=4,
                                           shuffle=True,
                                           collate_fn=pad_collate)

# Test batches keep a fixed order
test_loader = torch.utils.data.DataLoader(dataset = test_data,
                                           batch_size=4,
                                           shuffle=False,
                                           collate_fn=pad_collate)

In [None]:
# Inspect a single training example to see which fields every item carries

'''
train_data

1. audio (X)
2. Sample rate
3. Classification (Y)
4. Speaker ID
5. Utterance number
'''

# Leaving the sample as the last expression makes the notebook display it
first_sample = train_data[0]
first_sample


## Observe the Data a bit more visually

In [None]:
# Labels are the sub-folders of the dataset root whose name starts with a letter
# (entries starting with any other character, and plain files, are skipped)
dataset_root = "./SpeechCommands/speech_commands_v0.02"

label_names = [
    entry
    for entry in os.listdir(dataset_root)
    if entry[0].isalpha() and os.path.isdir(os.path.join(dataset_root, entry))
]

# How many labels this dataset has
print(len(label_names))

# Number of directory entries (audio files) found under each label folder
label_cnt = [
    len(os.listdir(os.path.join(dataset_root, folder)))
    for folder in label_names
]



In [None]:
# Draw the per-label counts as a pie chart on a 10x10 inch figure
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(label_cnt, labels=label_names)
ax.set_title('Label Distribution')
plt.show()

## The Model : M5

The model is based on an elementary 1D convolutional network.


In [None]:
class conv_module(nn.Module):
  """Basic building block: 1D convolution -> batch normalisation -> ReLU.

  Args:
    in_channel: number of channels of the input signal.
    out_channel: number of channels produced by the convolution.
    kernel_size: length of the convolution kernel.
    stride: step of the convolution. Defaults to 1 so callers that only need
      a plain convolution may omit it.
  """

  def __init__(self,in_channel,out_channel,kernel_size,stride=1):
    super().__init__()

    # Forward the requested stride; it used to be hard-coded to 1, which
    # silently ignored the caller's value.
    self.conv = nn.Conv1d(in_channel, out_channel, kernel_size, stride=stride)
    self.bn = nn.BatchNorm1d(out_channel)
    self.relu = nn.ReLU()

  def forward(self,x):
    """Apply convolution, batch normalisation and ReLU to x, in that order."""
    x = self.conv(x)
    x = self.bn(x)
    return self.relu(x)

class M5_model(nn.Module):
  """Small 1D convolutional classifier for raw audio.

  Four conv_module blocks, each followed by max-pooling, then a global average
  over the time axis and one linear layer that produces the class scores.

  Args:
    in_channel: number of channels of the input waveform.
    base_channel: channel width of the first two blocks; the last two blocks
      use twice as many channels.
    n_class: number of output classes.
  """

  def __init__(self,in_channel,base_channel,n_class):
    super().__init__()
    # The stride is passed explicitly to every block so construction does not
    # depend on conv_module providing a default for it.
    self.conv1 = conv_module(in_channel,base_channel,kernel_size=80, stride = 16)
    self.conv2 = conv_module(base_channel,base_channel,kernel_size=3, stride = 1)
    self.conv3 = conv_module(base_channel,2 * base_channel,kernel_size=3, stride = 1)
    self.conv4 = conv_module(2 * base_channel,2 * base_channel,kernel_size=3, stride = 1)

    # One pooling layer (window of 4) is reused after every block
    self.pool = nn.MaxPool1d(4)
    self.fc1 = nn.Linear(2 * base_channel, n_class)


  def forward(self,x):
    """Classify a batch of waveforms.

    Args:
      x: input tensor; expected as (batch, in_channel, samples) - TODO confirm
        against the data loader output.

    Returns:
      Tensor of shape (batch, n_class) holding log-probabilities.
    """

    x = self.conv1(x)
    x = self.pool(x)

    x = self.conv2(x)
    x = self.pool(x)

    x = self.conv3(x)
    x = self.pool(x)

    x = self.conv4(x)
    x = self.pool(x)

    # The FC layer at the end
    # Average over the remaining time steps so the linear layer sees one
    # feature vector per example regardless of the clip length.
    x = x.mean(dim=-1)
    x = self.fc1(x)

    # Log-probabilities work with nll_loss directly and are left unchanged by
    # the log-softmax inside CrossEntropyLoss.
    return nn.functional.log_softmax(x, dim=-1)
    


    


    
