<a href="https://colab.research.google.com/github/nhut-ngnn/Practice-Graph_Neural_Network/blob/main/Demonstrate_Feature_Fusion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [13]:
# CBAM Module
class CBAM(nn.Module):
    """Channel-then-spatial attention gate for 4-D feature maps.

    The input is first rescaled per channel by a gate computed from its
    globally average-pooled descriptor, then rescaled per spatial location by
    a gate computed from the channel-wise mean and max maps. The output has
    the same shape as the input.

    NOTE(review): the channel branch here uses average pooling only; the CBAM
    paper additionally feeds a max-pooled descriptor through the same MLP.
    Left as-is so existing weights and results stay valid.

    Args:
        channels: Number of channels of the tensors passed to ``forward``.
        reduction: Bottleneck ratio of the channel-attention MLP.
    """

    def __init__(self, channels, reduction=16):
        super().__init__()
        # Without the clamp, channels < reduction would yield a bottleneck
        # with zero channels, which cannot carry any signal.
        hidden_channels = max(1, channels // reduction)
        self.channel_attention = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, hidden_channels, 1, bias=False),
            nn.ReLU(),
            nn.Conv2d(hidden_channels, channels, 1, bias=False),
            nn.Sigmoid(),
        )
        # Two input planes: the per-pixel channel mean and channel max.
        self.spatial_attention = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size=7, padding=3, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Apply channel attention followed by spatial attention to ``x``."""
        # Channel attention: one sigmoid gate per channel, broadcast over H, W.
        channel_gate = self.channel_attention(x)
        x = x * channel_gate

        # Spatial attention: one sigmoid gate per location, broadcast over C.
        avg_out = torch.mean(x, dim=1, keepdim=True)
        max_out, _ = torch.max(x, dim=1, keepdim=True)
        spatial_gate = self.spatial_attention(torch.cat([avg_out, max_out], dim=1))
        return x * spatial_gate

class AlexNetCBAM(nn.Module):
    """AlexNet-style CNN for single-channel input with CBAM after each conv stage.

    The feature extractor output is adaptively average-pooled to 6x6 before
    the classifier, so inputs whose feature map is not exactly 6x6 (for
    example variable-length spectrograms) still match the first linear layer.
    The pooling layer has no parameters, so state-dict keys are unaffected.

    Args:
        num_classes: Output size of the final linear layer.
        verbose: When true (the default, which keeps the shape trace this
            model has always printed), ``forward`` prints the tensor shape
            after each stage. Pass ``False`` to silence it, e.g. for training.
    """

    # Index into ``self.features`` -> stage name used in the shape trace.
    # ReLU layers are left out because they never change the shape.
    _TRACE_LABELS = {
        0: "first Conv2d",
        2: "first MaxPool2d",
        3: "first CBAM",
        4: "second Conv2d",
        6: "second MaxPool2d",
        7: "second CBAM",
        8: "third Conv2d",
        10: "third CBAM",
        11: "fourth Conv2d",
        13: "fourth CBAM",
        14: "fifth Conv2d",
        16: "third MaxPool2d",
        17: "fifth CBAM",
    }

    def __init__(self, num_classes=10, verbose=True):
        super().__init__()
        self.verbose = verbose
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            CBAM(64),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            CBAM(192),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            CBAM(384),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            CBAM(256),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            CBAM(256),
        )
        # Fixes the spatial size fed to the classifier regardless of input size.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def _trace(self, label, x):
        """Print ``x.shape`` under ``label`` when shape tracing is enabled."""
        if self.verbose:
            print(f"{label}: {x.shape}")

    def forward(self, x):
        """Return class logits for a batch shaped (batch, 1, height, width)."""
        self._trace("Input shape", x)
        # Walk every feature layer rather than a hand-written index list, so
        # no layer can be skipped and the trace labels cannot drift.
        for index, layer in enumerate(self.features):
            x = layer(x)
            label = self._TRACE_LABELS.get(index)
            if label is not None:
                self._trace(f"After {label}", x)
        x = self.avgpool(x)
        self._trace("After adaptive pooling", x)
        x = torch.flatten(x, 1)
        self._trace("After flatten", x)
        return self.classifier(x)



In [25]:

def load_audio_file(file_path):
    """Load an audio file with librosa and return ``(samples, sampling_rate)``.

    ``sr=None`` is passed so the file keeps its native sampling rate.
    """
    waveform, native_rate = librosa.load(file_path, sr=None)
    return waveform, native_rate

def audio_to_melspectrogram(y, sr, n_mels=128):
    """Compute a mel spectrogram of ``y`` and convert it to decibels.

    Args:
        y: Audio samples passed to ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``y``.
        n_mels: Number of mel bands.

    Returns:
        The output of ``librosa.power_to_db`` with the spectrogram's own
        maximum as the reference value.
    """
    power_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(power_spectrogram, ref=np.max)

def extract_mfcc(y, sr, n_mfcc=13, n_mels=128, fmax=8000):
    """Return the MFCCs of ``y`` as computed by ``librosa.feature.mfcc``.

    Args:
        y: Audio samples.
        sr: Sampling rate of ``y``.
        n_mfcc: Number of coefficients to return.
        n_mels: Forwarded to ``librosa.feature.mfcc`` unchanged.
        fmax: Forwarded to ``librosa.feature.mfcc`` unchanged.
    """
    mel_options = {"n_mels": n_mels, "fmax": fmax}
    return librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, **mel_options)

def preprocess_mfcc(mfccs):
    """Standardize an MFCC matrix and return it as a float32 tensor.

    The whole array is shifted to zero mean and scaled to unit standard
    deviation (global statistics, not per coefficient), then two leading
    singleton axes are prepended.

    Args:
        mfccs: Array-like of MFCC values.

    Returns:
        A ``torch.float32`` tensor shaped ``(1, 1, *mfccs.shape)``.
    """
    values = np.asarray(mfccs)
    centered = values - np.mean(values)
    spread = np.std(values)
    # A constant input has zero spread; dividing would fill the tensor with
    # NaN, so such input is returned centered (all zeros) instead.
    if spread > 0:
        centered = centered / spread
    return torch.tensor(centered[np.newaxis, np.newaxis, ...], dtype=torch.float32)

def preprocess_spectrogram(spectrogram):
    """Standardize a spectrogram and return it as a float32 tensor.

    The whole array is shifted to zero mean and scaled to unit standard
    deviation (global statistics), then two leading singleton axes are
    prepended.

    Args:
        spectrogram: Array-like of spectrogram values.

    Returns:
        A ``torch.float32`` tensor shaped ``(1, 1, *spectrogram.shape)``.
    """
    values = np.asarray(spectrogram)
    centered = values - np.mean(values)
    spread = np.std(values)
    # A constant input (e.g. digital silence) has zero spread; dividing would
    # fill the tensor with NaN, so it is returned centered (all zeros) instead.
    if spread > 0:
        centered = centered / spread
    return torch.tensor(centered[np.newaxis, np.newaxis, ...], dtype=torch.float32)

# Demo: run one audio clip through both feature pipelines and print shapes.
# NOTE(review): the path is hard-coded under /content — presumably a Colab
# session upload; confirm it exists before running elsewhere.
file_path = '/content/Sound_Demo/common_voice_vi_21824030.mp3'
y, sr = load_audio_file(file_path)
# Mel-spectrogram branch: dB-scaled spectrogram -> standardized 4-D tensor.
mel_spectrogram_db = audio_to_melspectrogram(y, sr)
input_tensor = preprocess_spectrogram(mel_spectrogram_db)
model = AlexNetCBAM(num_classes=10)

# Only the convolutional feature extractor is called here; model.forward and
# the classifier head are not invoked.
output = model.features(input_tensor)
print(output.shape)

# MFCC branch: input_tensor is rebound to the MFCC tensor, which is only
# printed and never passed to the model in this cell.
mfcc = extract_mfcc(y, sr)
input_tensor = preprocess_mfcc(mfcc)
print(input_tensor.shape)

torch.Size([1, 256, 3, 22])
torch.Size([1, 1, 13, 741])
