# Install packages

In [1]:
%%capture
%pip install librosa
%pip install pandas
%pip install numpy

In [2]:
import librosa
import numpy as np

In [None]:
import torch
import torch.nn as nn
from transformers import EfficientNetImageProcessor, EfficientNetForImageClassification, EfficientNetConfig
import torchvision.models as models

# Template functions

In [3]:
def get_audio_time_series(filename:str):
    '''
    Load an audio file and return its samples together with the sampling rate.

    Decoding is delegated to ``librosa.load`` called with its default
    arguments.

    Args:
        filename: Path of the audio file to read (e.g. an mp3 file).

    Returns:
        A tuple ``(y, sr)`` where ``y`` is the audio time series and ``sr`` is
        the sampling rate of ``y``. Returns ``None`` when the file could not
        be loaded; in that case a ``RuntimeWarning`` naming the file and the
        underlying error is emitted, so the failure is not silent.
    '''
    import warnings

    try:
        y, sr = librosa.load(filename)
    except Exception as err:
        # Stay best-effort (callers get None), but say what went wrong and do
        # not swallow KeyboardInterrupt/SystemExit like a bare ``except`` would.
        warnings.warn(
            f"Could not load audio file {filename!r}: {err!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return None
    return y, sr

In [5]:
# Load one sample track to try out the loader.
filename_template = 'mp3_files/00pcolwO8c6vOxOUwpZ0QM.mp3'
loaded_audio = get_audio_time_series(filename = filename_template)
# The loader returns None when the file cannot be read; fail with a clear
# message instead of an opaque "cannot unpack NoneType" TypeError.
if loaded_audio is None:
    raise RuntimeError(f"Could not load audio from {filename_template!r}")
y, sr = loaded_audio

# Utils

# Dataloader

# EfficientNet_Audio

In [None]:
class Audio_Effnet(nn.Module):
    '''
    EfficientNet-B0 based classifier over a short sequence of image-like frames.

    Each time step of the input is passed through the same pretrained
    torchvision EfficientNet-B0 feature extractor. The resulting feature maps
    are global-average-pooled, averaged over the time steps, and fed to a
    linear layer followed by a softmax.
    '''

    def __init__(self, num_classes,train = True):
        '''
        Build the pretrained backbone and the new classification head.

        Args:
            num_classes: Number of output classes of the final linear layer.
            train: If truthy, the backbone weights stay trainable
                (fine-tuning). Otherwise they are frozen and only the new
                classifier receives gradients.
        '''
        super().__init__()

        # torchvision's builder has no ``include_top`` argument (that is a
        # Keras option) and raises TypeError when given one. The stock
        # classification head is removed explicitly below instead.
        self.backbone = models.efficientnet_b0(weights='EfficientNet_B0_Weights.DEFAULT')
        # Width of the feature vector produced by the backbone; read from the
        # stock head (Dropout, Linear) before discarding it, so the new linear
        # layer always matches the backbone.
        feature_dim = self.backbone.classifier[1].in_features
        self.backbone.classifier = nn.Identity()
        print('Backbone loaded')

        fine_tune = bool(train)
        if not fine_tune:
            print('[INFO]: Freezing hidden layers...')
        # Toggle every backbone parameter at once and report the status once,
        # rather than once per parameter.
        self.backbone.requires_grad_(fine_tune)
        if fine_tune:
            print('[INFO]: Hidden layers Not Freezed')
        else:
            print('[INFO]: Hidden layers Freezed')

        # NOTE(review): the head outputs probabilities. If the training loss is
        # nn.CrossEntropyLoss (which expects raw logits), the Softmax should be
        # dropped - confirm against the training loop.
        self.classifier = nn.Sequential(
            nn.Linear(in_features = feature_dim, out_features = num_classes, bias=True),
            nn.Softmax(dim=1)  # Softmax activation for classification
        )

    def forward(self, x):
        '''
        Classify a batch of frame sequences.

        Args:
            x: 5-D tensor whose dim 0 is the batch and dim 1 the time steps;
                the remaining three dims are passed to the backbone as one
                image batch per time step (presumably channels, height,
                width with 3 channels - confirm against the dataloader).

        Returns:
            Tensor of shape (batch, num_classes) with class probabilities.

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        '''
        if x.dim() != 5:
            raise ValueError(
                f"expected a 5-D input (batch, timesteps, ...), got shape {tuple(x.shape)}"
            )
        timesteps = x.size(1)

        # Run every time step separately through the shared backbone and
        # reduce each feature map to one vector by global average pooling.
        # This replaces a call to ``self.conv3D``, which was never defined.
        step_features = []
        for step in range(timesteps):
            feature_map = self.backbone.features(x[:, step])
            step_features.append(feature_map.mean(dim=(2, 3)))

        # (batch, timesteps, feature_dim) -> average over the time steps.
        stacked_output = torch.stack(step_features, dim=1)
        summarized_tensor = stacked_output.mean(dim=1)

        return self.classifier(summarized_tensor)
