In [None]:
# Install the third-party packages this notebook needs via shell `pip`.
# NOTE(review): only timm is version-pinned; consider pinning the other packages
# and using `%pip` so the install targets the running kernel's environment.
! pip install timm==0.4.5
! pip install wget
! pip install neptune-client
! pip install torchcontrib

In [23]:
import argparse
import os
import ast
import pickle
import sys
import time
import torch
import numpy as np
import pandas as pd
import json
import neptune.new as neptune
from typing import Dict, List, Union
import torchaudio
import random

from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from torch.utils.data import WeightedRandomSampler
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast
# from torch.utils.tensorboard import SummaryWriter
from torchcontrib.optim import SWA
basepath = os.path.dirname(os.path.dirname(sys.path[0]))
sys.path.append(basepath)

import dataloaderV1
from ast_models import ASTModel
from traintest import train, validate

In [14]:
# Start a Neptune run used to log parameters and per-epoch metrics.
# The API token is a secret, so it is read from the environment instead of being
# embedded in the notebook.
# NOTE(review): the token that was previously hardcoded in this cell should be
# treated as leaked and revoked in the Neptune account settings.
neptune_api_token = os.environ.get("NEPTUNE_API_TOKEN")
if not neptune_api_token:
    raise RuntimeError(
        "NEPTUNE_API_TOKEN is not set; export your Neptune API token before running this cell."
    )

tracker = neptune.init(
    project="nipdep/sp-cup",
    api_token=neptune_api_token,
)

https://app.neptune.ai/nipdep/sp-cup/e/SPCUP-57
Remember to stop your run once you’ve finished logging your metadata (https://docs.neptune.ai/api-reference/run#.stop). It will be stopped automatically only when the notebook kernel/interactive console is terminated.


In [15]:
import warnings
# Suppress every warning for the rest of the session: the filter is installed
# with the 'ignore' action and no message, category or module restriction.
warnings.filterwarnings('ignore')

File Operations

In [None]:
# Download the training archive and the "unseen" archive from Dropbox
# (-c resumes a partial download, -O sets the local file name).
! wget "https://www.dropbox.com/s/36yqmymkva2bwdi/spcup_2022_training_part1.zip?dl=1" -c -O 'spcup_2022_training_part1.zip'
! wget "https://www.dropbox.com/s/wsmlthhri29fb79/spcup_2022_unseen.zip?dl=1" -c -O 'spcup_2022_unseen.zip'

# Extract each archive into its own directory.
!unzip "./spcup_2022_training_part1.zip" -d "./spcup_2022_training/"
!unzip "./spcup_2022_unseen.zip" -d "./spcup_2022_unseen/"

# The archives are no longer needed once extracted.
!rm "./spcup_2022_training_part1.zip"
!rm "./spcup_2022_unseen.zip"

# Build one label table from both label files and shuffle its rows.
# NOTE(review): sample(frac=1) is called without random_state, so the row order
# differs on every run.
df1 = pd.read_csv('./spcup_2022_training/spcup_2022_training_part1/labels.csv')
df2 = pd.read_csv('./spcup_2022_unseen/spcup_2022_unseen/labels.csv')
df3 = pd.concat([df1, df2]).sample(frac=1)

# Persist the merged labels, then delete the unseen label file
# (presumably so the directory copy below does not overwrite the training
# labels.csv — confirm).
df3.to_csv('./final_labels.csv', index=False)
!rm './spcup_2022_unseen/spcup_2022_unseen/labels.csv'

# Copy the remaining contents of the unseen directory into the training directory.
!cp -a "./spcup_2022_unseen/spcup_2022_unseen/". "./spcup_2022_training/spcup_2022_training_part1/"

# Download and extract the evaluation archive, then remove the zip file.
! wget "https://www.dropbox.com/s/ftkyvwxgr9wl7jf/spcup_2022_eval_part1.zip?dl=1" -c -O "./spcup_2022_eval_part1.zip"
! unzip "./spcup_2022_eval_part1.zip" -d "./spcup_2022_eval_part1/"
!rm "./spcup_2022_eval_part1.zip"

JSON creation

In [8]:
# Locations used while building the JSON manifests.
DATA_PATH = "./spcup_2022_training/spcup_2022_training_part1/"
IMAGE_PATH = './images'
Labeled_dir = './final_labels.csv'
spect_type = 'mel'

# Load the merged label table and derive the path of each wav file from its
# track file name.
label_df = pd.read_csv(Labeled_dir)
label_df['wav_path'] = [DATA_PATH + '/' + track_name for track_name in label_df['track']]
def save_and_record(wav_path):
    """Return the PNG path under IMAGE_PATH that corresponds to ``wav_path``.

    The image name is the last '/'-separated component of ``wav_path`` cut at
    its first dot. Only the path string is built; nothing is written to disk.
    """
    file_name = wav_path.rsplit('/', 1)[-1]
    stem = file_name.split('.', 1)[0]
    return IMAGE_PATH + '/' + stem + '.png'

# Derive the image path for every row and strip the extension from the track name.
label_df['image_path'] = label_df['wav_path'].map(save_and_record)
label_df['track'] = label_df['track'].map(lambda x: x.split('.')[0])

# Rename the columns to the keys stored in the JSON manifests.
col_map = {
    'track': 'audio_id',
    'algorithm': 'labels',
    'wav_path': 'wav',
    'image_path': 'image'
}
label_df = label_df.rename(columns=col_map)

# A fixed seed keeps the train/validation split identical across re-runs of this
# cell; without it, every re-run would move tracks between the two manifests.
SPLIT_SEED = 42
train_df, test_df = train_test_split(label_df, test_size=0.15, random_state=SPLIT_SEED)

# Write one manifest per split in the form {"data": [record, ...]}.
train_data_dict = {'data': train_df.to_dict('records')}
with open("./final_train_data.json", "w") as outfile:
    json.dump(train_data_dict, outfile)

test_data_dict = {'data': test_df.to_dict('records')}
with open("./final_test_data.json", "w") as outfile:
    json.dump(test_data_dict, outfile)


In [3]:
# Read the evaluation track list and turn every file name into a path inside the
# evaluation directory.
# NOTE: this rebinds test_df, which the JSON-creation cell used for the validation split.
test_df = pd.read_csv('./spcup_2022_eval_part1/spcup_2022_eval_part1/labels_eval_part1.csv')
test_df['track'] = test_df['track'].map(
    lambda file_name: './spcup_2022_eval_part1/spcup_2022_eval_part1/' + file_name
)
# test_df.head()


Model Training

In [33]:
# Experiment configuration gathered in a single namespace.
args = argparse.Namespace(
    # Data sources: JSON manifests for training/validation, DataFrame for the test tracks.
    data_train='./final_train_data.json',
    data_val='./final_test_data.json',
    data_test=test_df,
    data_eval='.json',
    # Task and model selection; `dataset` is the key used to look up
    # normalization statistics and target length.
    n_class=6,
    model='ast',
    dataset='speechcommands',
    exp_dir='.',
    # General training settings.
    lr=0.001,
    optim='adam',
    batch_size=16,
    num_workers=32,
    n_epochs=100,
    lr_patience=2,
    n_print_steps=100,
    save_model=None,
    # Augmentation settings forwarded to the audio configuration.
    freqm=0,
    timem=0,
    mixup=0,
    bal=False,
    # ASTModel construction arguments.
    fstride=10,
    tstride=10,
    imagenet_pretrain=True,
    audioset_pretrain=False,
    noise_level=0.1,
    # Optimizer / scheduler settings consumed by create_optimizer.
    # "amsgrad" is a string because it is parsed with str_to_bool.
    optim_config={
        "optimizer": "adam",
        "amsgrad": "False",
        "base_lr": 0.0001,
        "lr_min": 0.000005,
        "betas": [0.9, 0.999],
        "weight_decay": 0.0001,
        "scheduler": "cosine",
    },
)

In [25]:
# Log the whole configuration namespace to the Neptune run under "parameters".
# NOTE(review): vars(args) also contains non-scalar values (the data_test DataFrame,
# the nested optim_config dict, a None) — confirm how this Neptune client stores them.
tracker["parameters"] = vars(args)

In [26]:
# dataset spectrogram mean and std, used to normalize the input
# Per-dataset spectrogram statistics as [mean, std], used to normalize the input.
norm_stats = {
    'audioset': [-4.2677393, 4.5689974],
    'esc50': [-6.6268077, 5.358466],
    'speechcommands': [-6.845978, 5.5654526],
}
# Per-dataset target length (also passed to the model as input_tdim).
target_length = {
    'audioset': 1024,
    'esc50': 512,
    'speechcommands': 128,
}
# Noise augmentation is only switched on for speech commands.
noise = {
    'audioset': False,
    'esc50': False,
    'speechcommands': True,
}

# Configuration for the training dataset: masking, mix-up and noise come from args.
audio_conf = {
    'num_mel_bins': 128,
    'target_length': target_length[args.dataset],
    'freqm': args.freqm,
    'timem': args.timem,
    'mixup': args.mixup,
    'dataset': args.dataset,
    'mode': 'train',
    'mean': norm_stats[args.dataset][0],
    'std': norm_stats[args.dataset][1],
    'noise': noise[args.dataset],
    'noise_level': args.noise_level,
}
# Configuration for validation/test datasets: every augmentation is disabled.
val_audio_conf = {
    'num_mel_bins': 128,
    'target_length': target_length[args.dataset],
    'freqm': 0,
    'timem': 0,
    'mixup': 0,
    'dataset': args.dataset,
    'mode': 'evaluation',
    'mean': norm_stats[args.dataset][0],
    'std': norm_stats[args.dataset][1],
    'noise': False,
}


In [27]:
# Training batches: built from the training manifest with the augmenting config.
train_loader = DataLoader(
    dataset=dataloaderV1.AudiosetDataset(args.data_train, audio_conf=audio_conf),
    batch_size=args.batch_size,
    shuffle=True,
    prefetch_factor=8,
    num_workers=1,
)

# Validation batches: twice the training batch size, no augmentation.
# NOTE(review): shuffle=True is kept from the original although order is
# irrelevant for the averaged validation metrics.
val_loader = DataLoader(
    dataset=dataloaderV1.AudiosetDataset(args.data_val, audio_conf=val_audio_conf),
    batch_size=args.batch_size * 2,
    shuffle=True,
    prefetch_factor=16,
    num_workers=1,
)

# Test batches: kept in file order so predictions line up with the track list.
test_loader = DataLoader(
    dataset=dataloaderV1.AudioTestDataset(args.data_test, audio_conf=val_audio_conf),
    batch_size=args.batch_size * 2,
    shuffle=False,
    num_workers=args.num_workers,
    pin_memory=False,
)

---------------the train dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process speechcommands
use dataset mean -6.846 and std 5.565 to normalize the input.
now use noise augmentation
number of classes is 6
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process speechcommands
use dataset mean -6.846 and std 5.565 to normalize the input.
number of classes is 6
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process speechcommands
use dataset mean -6.846 and std 5.565 to normalize the input.
number of classes is 6


In [28]:
# Instantiate the AST classifier from the experiment settings.
# NOTE(review): the training cell builds its own ASTModel instance; audio_model
# is not referenced by the cells visible in this notebook.
audio_model = ASTModel(
    label_dim=args.n_class,
    fstride=args.fstride,
    tstride=args.tstride,
    input_fdim=128,
    input_tdim=target_length[args.dataset],
    imagenet_pretrain=args.imagenet_pretrain,
    audioset_pretrain=args.audioset_pretrain,
    model_size='base384',
)


---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=144


In [29]:
def train_epoch(
    trn_loader: DataLoader,
    model,
    optim: Union[torch.optim.SGD, torch.optim.Adam],
    device: torch.device,
    scheduler: torch.optim.lr_scheduler,
    config: argparse.Namespace):
    """Train ``model`` for one pass over ``trn_loader`` using mixed precision.

    Args:
        trn_loader: iterable yielding ``(batch_x, batch_y)`` pairs; ``batch_y``
            is flattened and cast to integer class indices.
        model: network called as ``model(batch_x)``; its output is passed to
            ``nn.CrossEntropyLoss``.
        optim: optimizer that updates the model parameters.
        device: device the batches are moved to.
        scheduler: learning-rate scheduler or ``None``. It is stepped after
            every batch when ``config.optim_config["scheduler"]`` is
            ``"cosine"`` or ``"keras_decay"``.
        config: namespace exposing ``optim_config["scheduler"]``.

    Returns:
        ``(mean_loss, accuracy)``: the sample-weighted mean loss and the
        accuracy in percent over all samples seen.

    Raises:
        ValueError: if a scheduler is supplied although the configured
            scheduler type is not one that is stepped per batch (raised before
            any parameter update), or if ``trn_loader`` yields no samples.
    """
    # Validate the scheduler set-up once, before any parameter update is made.
    step_per_batch = config.optim_config["scheduler"] in ["cosine", "keras_decay"]
    if not step_per_batch and scheduler is not None:
        raise ValueError("scheduler error, got:{}".format(scheduler))

    model.train()
    # NOTE: a fresh GradScaler per call means the loss scale restarts every epoch.
    scaler = GradScaler()
    criterion = nn.CrossEntropyLoss()

    loss_sum = 0.0
    num_seen = 0
    num_correct = 0
    for batch_x, batch_y in trn_loader:
        batch_size = batch_x.size(0)
        batch_x = batch_x.to(device)
        batch_y = batch_y.view(-1).type(torch.LongTensor).to(device)

        # Forward pass and loss under autocast; backward goes through the scaler.
        with autocast():
            batch_out = model(batch_x)
            batch_loss = criterion(batch_out, batch_y)

        optim.zero_grad()
        scaler.scale(batch_loss).backward()
        scaler.step(optim)
        scaler.update()

        if step_per_batch:
            scheduler.step()

        # Weight the batch-mean loss by the batch size so the epoch mean is per sample.
        loss_sum += batch_loss.item() * batch_size
        num_seen += batch_size
        predicted = batch_out.detach().argmax(dim=1)
        num_correct += (batch_y == predicted).sum().item()

    if num_seen == 0:
        raise ValueError("trn_loader yielded no samples; cannot compute epoch metrics")

    return loss_sum / num_seen, (100 * num_correct) / num_seen

In [30]:
def eval_epoch(
    trn_loader: DataLoader,
    model,
    device: torch.device,
    config: argparse.Namespace):
    """Evaluate ``model`` on one pass over ``trn_loader`` without updating it.

    The model is switched to eval mode and the whole pass runs under
    ``torch.no_grad()`` so that no autograd graph is built.

    Args:
        trn_loader: iterable yielding ``(batch_x, batch_y)`` pairs; ``batch_y``
            is flattened and cast to integer class indices.
        model: network called as ``model(batch_x)``; its output is passed to
            ``nn.CrossEntropyLoss``.
        device: device the batches are moved to.
        config: accepted for signature parity with ``train_epoch``; not read.

    Returns:
        ``(mean_loss, accuracy)``: the sample-weighted mean loss and the
        accuracy in percent over all samples seen.

    Raises:
        ValueError: if ``trn_loader`` yields no samples.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()

    loss_sum = 0.0
    num_seen = 0
    num_correct = 0
    with torch.no_grad():
        for batch_x, batch_y in trn_loader:
            batch_size = batch_x.size(0)
            batch_x = batch_x.to(device)
            batch_y = batch_y.view(-1).type(torch.LongTensor).to(device)

            with autocast():
                batch_out = model(batch_x)
                batch_loss = criterion(batch_out, batch_y)

            # Weight the batch-mean loss by the batch size so the mean is per sample.
            loss_sum += batch_loss.item() * batch_size
            num_seen += batch_size
            predicted = batch_out.argmax(dim=1)
            num_correct += (batch_y == predicted).sum().item()

    if num_seen == 0:
        raise ValueError("trn_loader yielded no samples; cannot compute evaluation metrics")

    return loss_sum / num_seen, (100 * num_correct) / num_seen

In [31]:

class SGDRScheduler(torch.optim.lr_scheduler._LRScheduler):
    """Cosine learning-rate schedule with restarts.

    Within a cycle of length ``Ti`` the rate follows half a cosine from the
    base rate towards ``eta_min``. When a cycle is complete the schedule
    restarts at the base rate and the cycle length is multiplied by ``T_mul``.
    """

    def __init__(self, optimizer, T0, T_mul, eta_min, last_epoch=-1):
        # Set the schedule state before calling the base constructor, since
        # get_lr() reads these attributes.
        self.eta_min = eta_min
        self.T_mul = T_mul
        self.Ti = T0
        self.last_restart = 0

        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the rate for every base learning rate at the current step."""
        elapsed = self.last_epoch - self.last_restart
        if elapsed >= self.Ti:
            # Current cycle finished: restart here and lengthen the next cycle.
            self.last_restart = self.last_epoch
            self.Ti *= self.T_mul
            elapsed = 0

        cosine_term = 1 + np.cos(np.pi * elapsed / self.Ti)
        return [
            self.eta_min + (base_lr - self.eta_min) * cosine_term / 2
            for base_lr in self.base_lrs
        ]

def _get_optimizer(model_parameters, optim_config):
    """Defines optimizer according to the given config"""
    optimizer_name = optim_config['optimizer']

    if optimizer_name == 'sgd':
        optimizer = torch.optim.SGD(model_parameters,
                                    lr=optim_config['base_lr'],
                                    momentum=optim_config['momentum'],
                                    weight_decay=optim_config['weight_decay'],
                                    nesterov=optim_config['nesterov'])
    elif optimizer_name == 'adam':
        optimizer = torch.optim.Adam(model_parameters,
                                     lr=optim_config['base_lr'],
                                     betas=optim_config['betas'],
                                     weight_decay=optim_config['weight_decay'],
                                     amsgrad=str_to_bool(
                                         optim_config['amsgrad']))
    else:
        print('Un-known optimizer', optimizer_name)
        sys.exit()

    return optimizer

def str_to_bool(val):
    """Convert a string representation of truth to ``True`` or ``False``.

    Adapted from the python implementation distutils.util.strtobool.

    True values are 'y', 'yes', 't', 'true', 'on', and '1'; false values
    are 'n', 'no', 'f', 'false', 'off', and '0' (case-insensitive). A value
    that already is a ``bool`` is returned unchanged, so configs may hold
    either real booleans or their string form. Raises ValueError if 'val'
    is any other string.
    >>> str_to_bool('YES')
    True
    >>> str_to_bool('FALSE')
    False
    >>> str_to_bool(True)
    True
    """
    # Real booleans have no .lower(); pass them through instead of crashing.
    if isinstance(val, bool):
        return val
    val = val.lower()
    if val in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    if val in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise ValueError('invalid truth value {}'.format(val))

def cosine_annealing(step, total_steps, lr_max, lr_min):
    """Cosine-annealed value between ``lr_max`` (step 0) and ``lr_min`` (last step)."""
    progress = step / total_steps
    cosine = np.cos(progress * np.pi)
    return lr_min + (lr_max - lr_min) * 0.5 * (1 + cosine)


def keras_decay(step, decay=0.0001):
    """Keras-style inverse-time decay factor: 1 / (1 + decay * step)."""
    denominator = 1. + decay * step
    return 1. / denominator

def _get_scheduler(optimizer, optim_config):
    """
    Defines learning rate scheduler according to the given config
    """
    if optim_config['scheduler'] == 'multistep':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=optim_config['milestones'],
            gamma=optim_config['lr_decay'])

    elif optim_config['scheduler'] == 'sgdr':
        scheduler = SGDRScheduler(optimizer, optim_config['T0'],
                                  optim_config['Tmult'],
                                  optim_config['lr_min'])

    elif optim_config['scheduler'] == 'cosine':
        total_steps = optim_config['epochs'] * \
            optim_config['steps_per_epoch']

        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lambda step: cosine_annealing(
                step,
                total_steps,
                1,  # since lr_lambda computes multiplicative factor
                optim_config['lr_min'] / optim_config['base_lr']))

    elif optim_config['scheduler'] == 'keras_decay':
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda=lambda step: keras_decay(step))
    else:
        scheduler = None
    return scheduler

def create_optimizer(model_parameters, optim_config):
    """Return ``(optimizer, scheduler)`` built from ``optim_config``.

    The scheduler is created for the optimizer returned here and may be ``None``.
    """
    built_optimizer = _get_optimizer(model_parameters, optim_config)
    return built_optimizer, _get_scheduler(built_optimizer, optim_config)

In [None]:
# Select the compute device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device: {}".format(device))

# Build the AST classifier from the experiment settings and move it to the device.
model = ASTModel(label_dim=args.n_class, fstride=args.fstride, tstride=args.tstride, input_fdim=128,
                 input_tdim=target_length[args.dataset], imagenet_pretrain=args.imagenet_pretrain,
                 audioset_pretrain=args.audioset_pretrain, model_size='base384')
model = model.to(device)

# Optimizer and scheduler. The cosine schedule is defined over the total number
# of optimizer steps, so the epoch count and steps per epoch are added to the config.
optim_config = args.optim_config
optim_config["epochs"] = args.n_epochs
optim_config["steps_per_epoch"] = len(train_loader)
optimizer, scheduler = create_optimizer(model.parameters(), optim_config)
# NOTE(review): the SWA wrapper is created but not used by the loop below.
optimizer_swa = SWA(optimizer)

# Train, validate and log once per epoch; keep the checkpoint with the best
# validation accuracy seen so far.
best_acc = 0.0
for epoch in range(args.n_epochs):
    print("Start training epoch{:03d}".format(epoch))
    training_loss, training_acc = train_epoch(train_loader, model, optimizer, device, scheduler, args)
    eval_loss, eval_acc = eval_epoch(val_loader, model, device, args)
    tracker['train/loss'].log(training_loss)
    tracker['train/acc'].log(training_acc)
    tracker['eval/loss'].log(eval_loss)
    tracker['eval/acc'].log(eval_acc)
    print(f'[{epoch}] Training Loss : {training_loss} / Training Accuracy : {training_acc} | Eval Loss : {eval_loss} / Eval Accuracy : {eval_acc}')

    if eval_acc > best_acc:
        # Record the new best; otherwise every later epoch would overwrite the
        # checkpoint regardless of whether it improved.
        best_acc = eval_acc
        torch.save(model.state_dict(), "./best_audio_model.pth")

Device: cuda
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: False
frequncey stride=10, time stride=10
number of patches=144
Start training epoch000


Make test-set predictions using the trained model

In [None]:
# SETTING UP CODE TO RUN ON GPU
gpu_id = 0
device = torch.device(f'cuda:{gpu_id}' if torch.cuda.is_available() else 'cpu')

In [None]:
# Loader over the evaluation tracks, kept in file order (shuffle=False) so the
# predictions line up with the track list.
# NOTE: this repeats the test_loader definition from the data-loader cell.
test_loader = DataLoader(
    dataset=dataloaderV1.AudioTestDataset(args.data_test, audio_conf=val_audio_conf),
    batch_size=args.batch_size * 2,
    shuffle=False,
    num_workers=args.num_workers,
    pin_memory=False,
)

---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process speechcommands
use dataset mean -6.846 and std 5.565 to normalize the input.
number of classes is 6


In [None]:
# load best model
best_path = '/content/models/best_audio_model.pth'
pred_model = ASTModel(label_dim=args.n_class, fstride=args.fstride, tstride=args.tstride, input_fdim=128,
                                input_tdim=target_length[args.dataset], imagenet_pretrain=args.imagenet_pretrain,
                                audioset_pretrain=args.audioset_pretrain, model_size='base384')
pred_model.load_state_dict(torch.load(best_path), strict=False)
pred_model = pred_model.to(device)
pred_model.eval()

---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=144


ASTModel(
  (v): DistilledVisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (act): GELU()
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
      (1): Block(
        (norm1): LayerNorm((768

In [None]:
# Run the trained model over the test loader and collect the outputs,
# one entry per row of each batch output.
pred_arr = []
for data in test_loader:
    data = data.to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        bt_preds = pred_model(data)
    pred_arr.extend(bt_preds.cpu().numpy())

In [None]:
# Stack the collected outputs and take the index of the largest value in each row
# as the predicted label.
pred_np = np.array(pred_arr)
pred_labels = pred_np.argmax(axis=1)

# Re-read the evaluation label file to recover the plain track names, pair them
# with the predictions and write the table to disk.
_df = pd.read_csv('./spcup_2022_eval_part1/spcup_2022_eval_part1/labels_eval_part1.csv')
result_columns = {'track': _df['track'].values, 'label': pred_labels}
pred_df = pd.DataFrame(result_columns)
pred_df.to_csv('./result.csv')

pred_df.head()

Unnamed: 0,track,label
0,f14498230d583796987a2e9576695384.wav,1
1,2eb02f5f517ad346de780b22535824c0.wav,1
2,d14d2f68081e6969b65e30bdb9d67144.wav,1
3,40b86c3ca31ddb5ea16639e1229b8a7c.wav,0
4,193eb4a1f4028c84d26e36ef74593fa4.wav,0


In [None]:
# Stop the Neptune run now that logging is finished; in a notebook the run is
# otherwise only stopped when the kernel terminates.
tracker.stop()

Shutting down background jobs, please wait a moment...
Done!


Waiting for the remaining 1 operations to synchronize with Neptune. Do not kill this process.


All 1 operations synced, thanks for waiting!
