<a href="https://colab.research.google.com/github/mitsu-h/BirdCLEF/blob/spec_rgb_softmax/submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 環境変数の設定
- colaboratoryでの動作確認："dev"
- Kaggle Notebookでsubmission："sub"

In [None]:
# Execution environment: "dev" for Colaboratory, "sub" for a Kaggle Notebook submission.
ENV="dev"
# File name of the trained checkpoint that is loaded for inference.
model_name = "best_model_202204040626.pth.tar"

以下で変数の設定

In [None]:
import os

# Resolve the data directory and checkpoint path for the selected environment.
if ENV != "dev":
    # Any value other than "dev" uses the Kaggle input layout.
    root_dir = "../input/"
    data_dir = os.path.join(root_dir, "birdclef-2022/")
    model_path = os.path.join(root_dir, "pretrained-model/", model_name)
else:
    # Colaboratory: mount Google Drive and read everything from it.
    from google.colab import drive

    drive.mount('/content/drive')
    root_dir = "/content/drive/MyDrive/colab/BirdCLEF/"
    data_dir = os.path.join(root_dir, "inputs/")
    model_path = os.path.join(root_dir, "models/", model_name)


# Import Libraries

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import soundfile as sf

from sklearn.metrics import f1_score

import torchaudio
from torchaudio import transforms
import torchvision

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

### シード固定

In [None]:
def torch_fix_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices).
    """
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # PyTorch (CPU and every visible GPU)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels; the auto-tuner is disabled because it may
    # pick different kernels from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # `torch.use_deterministic_algorithms` is a function and has to be called;
    # assigning to it would replace the function with a bool and enable nothing.
    try:
        torch.use_deterministic_algorithms(True, warn_only=True)
    except TypeError:
        # PyTorch versions without `warn_only` only offer the strict mode, in
        # which operations lacking a deterministic implementation raise.
        # Keep the cuDNN settings above and leave the strict mode off.
        pass


torch_fix_seed()

# Load Data

In [None]:
# Load the training metadata CSV from the data directory.
train_df = pd.read_csv(os.path.join(data_dir, 'train_metadata.csv'))
# train_df.head()

In [None]:
# Keep only the rows whose `rating` is at least 3.0.
train_df = train_df[train_df.rating >= 3.0]

In [None]:
# Renumber the rows after filtering; CustomDataset looks rows up with `.loc[index, ...]`.
train_df = train_df.reset_index(drop=True)
train_df.head()

In [None]:
# Distinct primary labels; a label's position in this list is used as its class index.
# NOTE(review): the order presumably has to match the one used when the checkpoint was trained — confirm.
labels = train_df.primary_label.unique().tolist()

In [None]:
# Map each label name to its position in `labels`.
label2id = {name: class_id for class_id, name in enumerate(labels)}

Most recordings have two channels, so mono (1-channel) audio is converted to stereo (2-channel) audio by duplicating the channel.

In [None]:
def MonoToStereo(aud, num_channel=2):
    """Return the audio with exactly ``num_channel`` channels.

    Args:
        aud: ``(sig, sr)`` tuple; the first dimension of ``sig`` is the
            channel axis and ``sr`` is passed through untouched.
        num_channel: Desired number of channels.

    Returns:
        The input tuple itself when it already has ``num_channel`` channels.
        Otherwise ``(new_sig, sr)`` where the existing channels are repeated
        in order and cut to ``num_channel`` rows, so a mono signal is simply
        replicated and a signal with too many channels keeps its first ones.

    Raises:
        ValueError: If ``sig`` has no channels to replicate.
    """
    sig, sr = aud
    current = sig.shape[0]
    if current == num_channel:
        return aud
    if current == 0:
        raise ValueError("cannot change the channel count of a signal with no channels")

    # Ceiling division: how many copies are needed to reach num_channel rows.
    copies = -(-num_channel // current)
    resized = torch.cat([sig] * copies)[:num_channel]
    return (resized, sr)

All signals must have the same length, so each one is either zero-padded or truncated to a fixed duration.

In [None]:
# Every audio clip is forced to this fixed length (a hyperparameter to be tuned).
max_len_ms = 10000  # 10k milliseconds = 10 seconds


def pad_signal(aud, max_len_ms):
    """Truncate or zero-pad a signal to exactly ``max_len_ms`` milliseconds.

    Args:
        aud: ``(sig, sr)`` tuple where ``sig`` is a 2-D ``(channels, samples)``
            tensor and ``sr`` is the sample rate in Hz.
        max_len_ms: Target duration in milliseconds.

    Returns:
        ``(sig, sr)`` with ``sig`` holding ``sr * max_len_ms // 1000`` samples
        per channel. Longer signals keep their first samples; shorter ones
        are surrounded by zeros, with the amount of leading padding drawn
        from Python's ``random`` module.
    """
    sig, sr = aud
    _, sig_len = sig.shape
    # Multiply before dividing so sample rates that are not a multiple of
    # 1000 (e.g. 22050 or 44100 Hz) are not rounded down first.
    max_len = sr * max_len_ms // 1000

    if sig_len > max_len:
        sig = sig[:, :max_len]
    elif sig_len < max_len:
        # Split the missing samples randomly between both ends of the signal.
        pad_begin_len = random.randint(0, max_len - sig_len)
        pad_end_len = max_len - sig_len - pad_begin_len

        # Zero-pad the last (time) axis; keeps the dtype and device of `sig`.
        sig = torch.nn.functional.pad(sig, (pad_begin_len, pad_end_len))

    return (sig, sr)

In [None]:
# Data Augmentation on raw audio: Time shift
def time_shift(aud, shift_limit):
    """Circularly shift a signal in time by a random amount.

    Args:
        aud: ``(sig, sr)`` tuple where ``sig`` is a 2-D ``(channels, samples)``
            tensor.
        shift_limit: Upper bound of the shift, as a fraction of the signal
            length.

    Returns:
        ``(shifted_sig, sr)``; every channel is rolled by the same number of
        samples, drawn from Python's ``random`` module.
    """
    sig, sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    # Roll along the time axis only. Without `dims` the tensor is flattened
    # first, which lets samples wrap from one channel into the other.
    return (sig.roll(shift_amt, dims=1), sr)

In [None]:
# Convert augmented audio to Mel Spectrogram
def mel_spec(aud, n_mels=128, n_fft=None, hop_len=None):
    """Build a decibel-scaled mel spectrogram with one extra averaged channel.

    Args:
        aud: ``(sig, sr)`` tuple.
        n_mels: Number of mel bins.
        n_fft: FFT size; when falsy it defaults to ``sr // 10``.
        hop_len: Hop length handed to ``MelSpectrogram`` as ``hop_length``.

    Returns:
        The dB spectrogram of every input channel, followed by one more
        channel holding their mean along the channel axis.
    """
    waveform, sample_rate = aud
    if not n_fft:
        n_fft = sample_rate // 10

    to_mel = transforms.MelSpectrogram(
        sample_rate, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels
    )
    to_db = transforms.AmplitudeToDB(top_db=80)

    # shape: (channels, n_mels, time), converted to decibels
    db_spec = to_db(to_mel(waveform))

    # Append the channel average as an additional channel.
    channel_mean = db_spec.mean(dim=0, keepdim=True)
    return torch.cat([db_spec, channel_mean])

In [None]:
def mono_to_color(X, eps=1e-6, mean=None, std=None):
    """Standardize a tensor and rescale it to the 0-255 ``uint8`` range.

    Args:
        X: Input tensor.
        eps: Added to ``std`` before dividing; also the smallest value range
            that is still rescaled.
        mean: Value subtracted from ``X``; ``None`` means ``X.mean()``. A
            tensor that broadcasts against ``X`` is accepted.
        std: Value ``X`` is divided by; ``None`` means ``X.std()``. A tensor
            that broadcasts against ``X`` is accepted.

    Returns:
        A ``torch.uint8`` tensor shaped like ``X``. If the standardized
        values span no more than ``eps`` the result is all zeros.
    """
    # Compare against None explicitly: `mean or ...` would discard a
    # legitimate 0 and cannot evaluate a multi-element tensor.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    X = (X - mean) / (std + eps)

    _min, _max = X.min(), X.max()

    if (_max - _min) > eps:
        # Min-max scale to [0, 255]; the cast truncates toward zero.
        V = 255 * (X - _min) / (_max - _min)
        return V.to(torch.uint8)

    return torch.zeros_like(X, dtype=torch.uint8)

def normalize(image):
    """Cast ``image`` to float32 and divide every value by 255."""
    as_float = image.to(torch.float32)
    return as_float / 255.0


In [None]:
# Data augmentation on mel spectrogram: Time and Frequency Masking
def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    """Apply frequency masks, then time masks, to a spectrogram.

    Args:
        spec: 3-D spectrogram whose last two dimensions are read as
            ``(n_mels, n_steps)``.
        max_mask_pct: Fraction of the mel bins / time steps passed to the
            masking transforms as their mask parameter.
        n_freq_masks: Number of frequency masks to apply.
        n_time_masks: Number of time masks to apply.

    Returns:
        The masked spectrogram; masked cells are filled with the mean of the
        input spectrogram.
    """
    _, n_mels, n_steps = spec.shape
    fill = spec.mean()

    freq_masker = transforms.FrequencyMasking(max_mask_pct * n_mels)
    time_masker = transforms.TimeMasking(max_mask_pct * n_steps)

    masked = spec
    # Frequency masks come first, then time masks, each applied `count` times.
    for masker, count in ((freq_masker, n_freq_masks), (time_masker, n_time_masks)):
        for _ in range(count):
            masked = masker(masked, fill)

    return masked

Preprocessing on one signal sample

# Building Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset yielding ``(augmented spectrogram, class id)`` per metadata row.

    Each item is loaded from ``audio_dir`` using the row's ``filename``,
    converted to stereo, padded/truncated, time-shifted, turned into a
    rescaled mel spectrogram and finally masked. The random augmentations
    run on every access.
    """

    def __init__(self, audio_dir, df, max_len_ms=10000, shift_limit=0.4, input_size=224):
        """Store the audio directory, the metadata frame and the settings.

        Args:
            audio_dir: Directory that the ``filename`` column is relative to.
            df: DataFrame with ``filename`` and ``primary_label`` columns,
                addressed with ``.loc[index, ...]``.
            max_len_ms: Clip length handed to ``pad_signal``.
            shift_limit: Shift bound handed to ``time_shift``.
            input_size: Stored on the instance; not read by ``__getitem__``.
        """
        self.train_audio_dir = audio_dir
        self.train_df = df
        self.max_len_ms = max_len_ms
        self.shift_limit = shift_limit
        self.input_size = input_size

    def __len__(self):
        """Return the number of metadata rows."""
        return len(self.train_df)

    def __getitem__(self, index):
        """Return the augmented spectrogram and class-id tensor for one row."""
        row_file = self.train_df.loc[index, 'filename']
        row_label = self.train_df.loc[index, 'primary_label']

        audio_path = os.path.join(self.train_audio_dir, row_file)
        waveform, sample_rate = torchaudio.load(audio_path)

        # Raw-audio stage: mono -> stereo, pad or truncate, random time shift.
        audio = MonoToStereo((waveform, sample_rate))
        audio = pad_signal(audio, self.max_len_ms)
        audio = time_shift(audio, shift_limit=self.shift_limit)

        # Spectrogram stage: mel spectrogram -> uint8 image -> normalized floats.
        image = normalize(mono_to_color(mel_spec(audio)))

        # Mask the spectrogram last.
        target = torch.tensor(label2id[row_label])
        return spectro_augment(image), target

In [None]:
# Build the dataset over the training audio directory and the filtered metadata.
train_audio_dir = os.path.join(data_dir, "train_audio/")
max_len_ms = 10000
shift_limit = 0.4
input_size = 224
dataset = CustomDataset(train_audio_dir, train_df, max_len_ms, shift_limit, input_size)

In [None]:
# Random 80/20 split into training and validation subsets.
# NOTE(review): the split is drawn in this notebook, so it may differ from the one used while training the checkpoint — confirm.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [None]:
# Batch both subsets; only the training loader shuffles.
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Pull a single batch and display the tensor shapes as a sanity check.
inputs, targets = next(iter(train_loader))
inputs.shape, targets.shape

Cool!!

Let's build model architecture.

# Model Building

In [None]:
def create_model(labels, device, pretrained=True):
    """Build a ResNet-50 classifier with one softmax output per label.

    Args:
        labels: Sequence of class labels; its length sets the output size.
        device: Device the model is moved to.
        pretrained: Forwarded to ``torchvision.models.resnet50``. Pass
            ``False`` when the weights are loaded from a checkpoint anyway,
            so that no pretrained weights have to be fetched.

    Returns:
        The model, moved to ``device``.
    """
    model = torchvision.models.resnet50(pretrained=pretrained)
    model.fc = torch.nn.Sequential(
        torch.nn.Linear(
            in_features=model.fc.in_features,
            out_features=len(labels),
        ),
        # The head outputs (batch, classes); naming the class axis explicitly
        # avoids the deprecated implicit-dim behaviour of Softmax.
        torch.nn.Softmax(dim=1),
    )
    return model.to(device)

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): create_model requests pretrained ResNet-50 weights that the checkpoint load below
# overwrites; fetching them presumably needs internet access — confirm this works in the "sub" environment.
model = create_model(labels, DEVICE)
model

# Model Testing

In [None]:
# Load the trained weights, remapping the stored tensors onto the device in
# use. Passing map_location in every case also covers a checkpoint that was
# saved on a GPU index which does not exist on this machine.
state_dict = torch.load(model_path, map_location=torch.device(DEVICE))
model.load_state_dict(state_dict)

### validationのスコアチェック
選択してるモデルのミス防止のため、実行結果を確認する

想定よりも実行時間がかかるので、commit & run all時には動かないようにする

In [None]:
# Validation is slow, so by default it only runs in the interactive "dev"
# environment and is skipped during a Kaggle "commit & run all".
# Set the flag to True / False by hand to override.
RUN_VALIDATION = ENV == "dev"

if RUN_VALIDATION:
    device = DEVICE
    model.eval()

    all_targets = []
    all_preds = []

    with torch.no_grad():
        for x, y in tqdm(val_loader):
            output = model(x.to(device))
            all_preds.append(output.argmax(dim=1).cpu())
            all_targets.append(y.cpu())

    # Macro F1 is computed once over the whole validation set. Averaging
    # per-batch scores is not the same metric (a batch only contains a few of
    # the classes), and dividing their sum by the number of samples instead
    # of the number of batches shrinks the value further.
    macro_f1 = f1_score(
        torch.cat(all_targets).numpy(),
        torch.cat(all_preds).numpy(),
        average="macro",
    )
    print(f"macro f1: {macro_f1}")

In [None]:
# Load the test metadata and the submission template from the data directory.
test_df = pd.read_csv(os.path.join(data_dir,'test.csv'))
sample_sub_df = pd.read_csv(os.path.join(data_dir,'sample_submission.csv'))

In [None]:
# Preview the first rows of the test metadata.
test_df.head()

In [None]:
# Preview the first rows of the submission template.
sample_sub_df.head()

In [None]:
test_audio_dir = os.path.join(data_dir, 'test_soundscapes')
SEGMENT_SEC = 5  # a row is judged on the 5-second window that ends at `end_time`

model.eval()  # inference mode (fixed batch-norm statistics)

loaded_id, loaded_sig, loaded_sr = None, None, None  # most recently decoded soundscape
segment_preds = {}  # (file_id, end_time) -> predicted label, shared by all rows of a window

# NOTE(review): rows of sample_sub_df are addressed with test_df's index, which
# assumes both files list the rows in the same order — confirm.
for row in tqdm(test_df.itertuples(), total=len(test_df)):
    idx = row.Index
    audio_id = row.file_id
    true_label = row.bird
    end_time = int(row.end_time)

    # The soundscape is stored as "<file_id>.ogg"; passing '.ogg' as a separate
    # path component would point at "<file_id>/.ogg", which never exists.
    path = os.path.join(test_audio_dir, f"{audio_id}.ogg")

    if not os.path.isfile(path):
        # Audio not available: fall back to a coin flip so the row still gets a value.
        sample_sub_df.loc[idx, 'target'] = bool(random.randint(0, 1))
        continue

    key = (audio_id, end_time)
    if key not in segment_preds:
        if audio_id != loaded_id:
            loaded_sig, loaded_sr = torchaudio.load(path)
            loaded_id = audio_id

        # Cut the window [end_time - 5 s, end_time) using the file's own sample rate.
        stop = end_time * loaded_sr
        start = max(stop - SEGMENT_SEC * loaded_sr, 0)
        segment = loaded_sig[:, start:stop]

        # Same preprocessing as CustomDataset, minus the random time-shift and
        # masking augmentations, which would only add noise to the prediction.
        audio = MonoToStereo((segment, loaded_sr))
        audio = pad_signal(audio, max_len_ms)
        spec = normalize(mono_to_color(mel_spec(audio)))

        with torch.no_grad():
            output = model(spec.unsqueeze(0).to(DEVICE))
        segment_preds[key] = labels[output.argmax(dim=1).item()]

    sample_sub_df.loc[idx, 'target'] = bool(segment_preds[key] == true_label)

# Make submission

In [None]:
# Preview the filled-in submission table.
sample_sub_df.head()

In [None]:
# Write the submission file only in the Kaggle submission environment.
if ENV=="sub":
  sample_sub_df.to_csv('submission.csv', index=False)

In [None]:
# Marker that the notebook ran to the end.
print('Done!')