In [1]:
from torch.utils.data import Dataset,DataLoader
import librosa
import transformers
from tqdm import tqdm
from transformers import ASTForAudioClassification,ASTFeatureExtractor
import torch
import pandas as pd
from transformers import ASTConfig
import numpy as np
import os
import shutil
from multiprocessing import Pool
import random
from torch.optim import AdamW
from transformers import get_scheduler
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# --- Audio settings ---
SAMPLING_RATE = 22050                # passed as sr= to librosa.load and as sampling_rate to the extractor
MAX_LENGTH = 1024                    # assigned to max_length on the AST config and feature extractor
DURATION = 10                        # length of one training segment, in seconds
SEGMENT = SAMPLING_RATE * DURATION   # length of one training segment, in samples

# --- Data loading ---
# os.cpu_count() may return None when the CPU count cannot be determined;
# fall back to 1 so the value is always a valid DataLoader num_workers.
THREADS = os.cpu_count() or 1

# --- Optimisation ---
LEARNING_RATE = 1e-6
NUM_EPOCHS = 5
DEVICE = 'cuda'

In [3]:
# Start from the pretrained AST configuration and attach the label maps
# built from the `primary_label` column of taxonomy.csv.
configuration = ASTConfig.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
taxonomy = pd.read_csv('taxonomy.csv')
label_nums = taxonomy.shape[0]

# Row position -> label, and the inverse lookup label -> row position.
id2label = {row: taxonomy['primary_label'][row] for row in range(label_nums)}
label2id = {name: row for row, name in id2label.items()}

configuration.id2label = id2label
configuration.label2id = label2id
configuration.max_length = MAX_LENGTH

# The table is only needed to build the maps above.
del taxonomy

In [4]:
def split_num(file):
    """Return how many SEGMENT-sample chunks are needed to cover an audio file.

    Args:
        file: File name relative to the ``train_audio`` directory.

    Returns:
        int: ``ceil(n_samples / SEGMENT)``, but never less than 1, where
        ``n_samples`` is the clip length after loading at ``SAMPLING_RATE``.
    """
    y, _ = librosa.load(os.path.join('train_audio', file), sr=SAMPLING_RATE)
    n_samples = y.shape[0]
    # Ceiling division: a clip whose length is an exact multiple of SEGMENT must
    # not be credited with an extra, empty trailing chunk (`n // SEGMENT + 1`
    # did that). An empty clip still counts as one chunk, as before.
    return max(1, -(-n_samples // SEGMENT))

def random_audio_file(path='train_soundscapes'):
    """Pick the name of one ``.ogg`` file in ``path`` at random.

    Args:
        path: Directory to list.

    Returns:
        str: A bare file name (not joined with ``path``) chosen with
        ``random.choice`` from the entries ending in ``.ogg``.
    """
    candidates = []
    for entry in os.listdir(path):
        if entry.endswith('.ogg'):
            candidates.append(entry)
    return random.choice(candidates)

def tile_padding(arr):
    """Return exactly SEGMENT samples built by repeating ``arr`` end to end.

    Args:
        arr: 1-D array of samples.

    Returns:
        ``np.zeros(SEGMENT)`` for an empty input, ``arr`` itself when it is
        already SEGMENT long, otherwise the first SEGMENT samples of ``arr``
        tiled ``SEGMENT // len + 1`` times.
    """
    length = arr.shape[0]
    if length == 0:
        # Nothing to repeat: hand back silence.
        return np.zeros(SEGMENT)
    if length == SEGMENT:
        return arr
    repeats = SEGMENT // length + 1
    return np.tile(arr, repeats)[:SEGMENT]

def random_segment(arr):
    """Cut a SEGMENT-sample window from ``arr`` at a random offset.

    Inputs shorter than SEGMENT are first extended with ``tile_padding``.

    Args:
        arr: 1-D array of samples.

    Returns:
        A slice of SEGMENT consecutive samples; the start offset is drawn with
        ``np.random.randint`` from every position that leaves a full window.
    """
    samples = tile_padding(arr) if arr.shape[0] < SEGMENT else arr
    last_start = len(samples) - SEGMENT
    begin = np.random.randint(0, last_start + 1)
    return samples[begin:begin + SEGMENT]

In [5]:
# Load the training metadata and flatten the audio tree: every file listed as
# train_audio/<filename> is moved to train_audio/<last path component>.
df = pd.read_csv('train.csv')
df['file'] = df['filename'].str.rsplit('/', n=1).str[-1]

for rel_path, flat_name in zip(df['filename'], df['file']):
    source = os.path.join('train_audio', rel_path)
    target = os.path.join('train_audio', flat_name)
    # Move only when the source is still there and the flat name is free;
    # this makes re-running the cell a no-op.
    if os.path.exists(source) and not os.path.exists(target):
        shutil.move(source, target)

# Keep just the columns used downstream.
df = df[['primary_label', 'file']]
#file_to_labels = df.groupby('file')['primary_label'].apply(list)
#file_to_labels = file_to_labels.apply(lambda x: list(dict.fromkeys(x)))
#max_labels = file_to_labels.apply(len).max()
#df = pd.DataFrame(file_to_labels.tolist(), index=file_to_labels.index)
#df.columns = [f'label{i}' for i in range(df.shape[1])]
#df = df.reset_index()


#files = df['file'].tolist()

#with Pool(THREADS) as p:
#    splitnums = p.map(split_num, files)
#df['splitnum'] = splitnums
#df['accumulate']=df['splitnum'].cumsum()

In [6]:
# Load the pretrained AST checkpoint and its feature extractor, then replace the
# classification head with a fresh linear layer sized for the taxonomy labels.
# NOTE(review): the backbone is loaded in bfloat16 while the new head below is a
# default (float32) nn.Linear; with a very small learning rate, bfloat16
# parameter updates may be lost to rounding - confirm this is intended.
model = ASTForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
).to(DEVICE)  # use the configured device instead of a second hard-coded 'cuda'

extractor = ASTFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    sampling_rate=SAMPLING_RATE,
)
extractor.sampling_rate = SAMPLING_RATE
extractor.max_length = MAX_LENGTH

model.config = configuration

# Size the new head from the layer it replaces and from the taxonomy row count,
# so it cannot drift from the label maps stored on `configuration`.
head_in_features = model.classifier.dense.in_features
model.classifier.dense = nn.Linear(head_in_features, label_nums, bias=True).to(DEVICE)
model.num_labels = label_nums

In [7]:
class AstAudio(Dataset):
    """Training dataset: one random, noise-mixed segment per row of ``df``.

    Each item loads the row's clip from ``train_audio``, takes a random
    SEGMENT-long window, mixes it with a random window from a randomly chosen
    ``train_soundscapes`` file, and runs the mix through ``extractor``.
    """

    def __init__(self):
        # Module-level metadata frame with `primary_label` and `file` columns.
        self.data = df

    def __len__(self):
        """Number of rows (one item per row)."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the training example for row ``index``.

        Args:
            index: Positional row index into ``self.data``.

        Returns:
            tuple: ``(input_values, label)`` where ``input_values`` is the
            extractor's ``'input_values'`` tensor and ``label`` is a tensor
            holding ``label2id`` of the row's ``primary_label``.
        """
        row = self.data.iloc[index]

        audio_path = os.path.join('train_audio', row.file)
        wave, _ = librosa.load(audio_path, sr=SAMPLING_RATE)
        wave = random_segment(wave)

        # Background: a random window from a random soundscape recording.
        scape = os.path.join('train_soundscapes', random_audio_file())
        noise_wave, _ = librosa.load(scape, sr=SAMPLING_RATE)
        noise_wave = random_segment(noise_wave)

        wave = 0.7 * wave + 0.3 * noise_wave

        spectrogram = extractor(wave, return_tensors='pt', sampling_rate=SAMPLING_RATE)
        # The label must come from the row selected by `index`. The previous code
        # indexed with the builtin `id` and read a `label0` column that `df`
        # (columns: primary_label, file) does not have.
        return spectrogram['input_values'], torch.tensor(label2id[row.primary_label])
# Instantiate the dataset and wrap it in a shuffling, multi-worker loader.
b = AstAudio()
loader_options = dict(
    batch_size=32,
    shuffle=True,
    num_workers=THREADS,
    pin_memory=True,
)
dataloader = DataLoader(b, **loader_options)



In [8]:
# AdamW over all model parameters, driven by the "linear" scheduler with the
# first 10% of the total optimisation steps used as warmup.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

num_training_steps = NUM_EPOCHS * len(dataloader)
warmup_steps = int(0.1 * num_training_steps)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=num_training_steps,
)

In [11]:
# Fine-tune for NUM_EPOCHS epochs. Every step logs the batch loss to TensorBoard;
# every epoch saves the model and feature extractor to ./bird_ast<epoch>.
writer = SummaryWriter(log_dir="./logs")

# torch.autocast expects the bare device type ("cuda"), not e.g. "cuda:0".
device_type = torch.device(DEVICE).type
model_dtype = model.dtype
use_autocast = model_dtype in (torch.float16, torch.bfloat16)

# Loss scaling is only set up for float16 weights; bfloat16 runs without it.
# torch.amp.GradScaler / torch.autocast replace the deprecated torch.cuda.amp API.
scaler = torch.amp.GradScaler(device_type) if model_dtype == torch.float16 else None

try:
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")

        for batch_idx, (input_values, labels) in enumerate(progress_bar):
            input_values = input_values.to(DEVICE, non_blocking=True)
            labels = labels.to(DEVICE, non_blocking=True)
            if use_autocast:
                # Match the inputs to the model's reduced-precision dtype.
                input_values = input_values.to(model_dtype)

            optimizer.zero_grad()

            # One forward pass for all precisions; autocast is a no-op when
            # disabled (i.e. for a full-precision model).
            with torch.autocast(
                device_type=device_type,
                dtype=model_dtype if use_autocast else None,
                enabled=use_autocast,
            ):
                # The dataset yields one extra leading dim per item; drop it.
                outputs = model(input_values=input_values.squeeze(1), labels=labels)
                loss = outputs.loss

            if scaler is not None:
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss.backward()
                optimizer.step()

            lr_scheduler.step()

            # Read the loss back from the device once per step.
            batch_loss = loss.item()
            total_loss += batch_loss
            progress_bar.set_postfix({'loss': f'{batch_loss:.4f}', 'avg_loss': f'{total_loss / (batch_idx + 1):.4f}'})
            writer.add_scalar("Perbatch Loss", batch_loss, epoch * len(dataloader) + batch_idx)

        avg_train_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1} Average Training Loss: {avg_train_loss:.4f}")

        output_dir = f"./bird_ast{epoch}"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        extractor.save_pretrained(output_dir)
        print(f"Model and extractor saved to {output_dir}")
finally:
    # Flush pending events and release the log file even if training is interrupted.
    writer.close()






  with torch.cuda.amp.autocast(dtype=torch.bfloat16):
Epoch 1/5:  54%|█████▍    | 1944/3608 [1:02:49<1:53:07,  4.08s/it, loss=3.1989, avg_loss=2.2695]

In [10]:
# Reload the checkpoint written after the last training epoch. The training loop
# saves to ./bird_ast<epoch>, so the final one is bird_ast<NUM_EPOCHS - 1>;
# deriving the name keeps this cell correct if NUM_EPOCHS changes.
final_checkpoint = f"bird_ast{NUM_EPOCHS - 1}"

model = ASTForAudioClassification.from_pretrained(
    final_checkpoint,
    attn_implementation="sdpa",
    torch_dtype=torch.bfloat16,
).to(DEVICE)  # same device as the rest of the notebook, not a hard-coded 'cuda'

extractor = ASTFeatureExtractor.from_pretrained(
    final_checkpoint,
    sampling_rate=SAMPLING_RATE,
)
