<a href="https://colab.research.google.com/github/mitsu-h/BirdCLEF/blob/data-augmentation/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install audiomentations

In [None]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import soundfile as sf
from datetime import datetime

import torchaudio
from torchaudio import transforms
import torchvision
from audiomentations import Compose, AddGaussianNoise, PitchShift, TimeStretch, Shift

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
from collections import defaultdict

import warnings
warnings.filterwarnings('ignore')

In [None]:
def torch_fix_seed(seed=42):
    """Seed every random number generator used here and request deterministic kernels.

    Args:
        seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
            (CPU and all CUDA devices).
    """
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch (CPU and every visible GPU)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may select different kernels between runs, so switch it off.
    torch.backends.cudnn.benchmark = False
    # This is a function and must be *called*; assigning to it would replace the
    # function with a bool and silently do nothing.
    try:
        # warn_only keeps training running when an op has no deterministic variant.
        torch.use_deterministic_algorithms(True, warn_only=True)
    except TypeError:
        # This torch build does not accept `warn_only`; skip the strict mode rather
        # than risk hard failures on ops without a deterministic implementation.
        pass


torch_fix_seed()

# Load Data

In [None]:
# Mount Google Drive (this import only exists on Colab) and derive the project paths.
from google.colab import drive
drive.mount('/content/drive')
# Project root on the mounted drive; inputs and saved models live in sub-folders of it.
root_dir="/content/drive/MyDrive/colab/BirdCLEF/"
data_dir= os.path.join(root_dir, "inputs/")
model_dir = os.path.join(root_dir, "models/")

In [None]:
train_df = pd.read_csv(os.path.join(data_dir, 'train_metadata.csv'))
train_df.head()

# Quick EDA and Data Pre-processing

In [None]:
train_df.shape

In [None]:
train_df.describe()

In [None]:
train_df.info()

There appear to be no null values.

In [None]:
# For this baseline notebook, we would consider following columns 
columns_of_interest = ['primary_label', 'rating', 'filename']
train_df = train_df[columns_of_interest]
train_df.head()

In [None]:
f, ax = plt.subplots(figsize=(20,8))
sns.countplot(x='rating', data=train_df, ax=ax)

We will keep only the audio files whose rating is >= 3.0.

In [None]:
train_df = train_df[train_df.rating >= 3.0]

In [None]:
train_df = train_df.reset_index(drop=True)
train_df.head()

In [None]:
labels = train_df.primary_label.unique().tolist()

In [None]:
# Map each label string to its position in `labels` (used as the class id).
label2id = {name: position for position, name in enumerate(labels)}

In [None]:
# Sanity check: the first label must map to id 0. Looking the label up by position
# avoids a KeyError when a hard-coded species code is absent after filtering.
labels[0], label2id[labels[0]]

In [None]:
train_audio_dir = os.path.join(data_dir, 'train_audio')

In [None]:
# for i in tqdm(range(len(train_df))):
#     filename = train_df.loc[i,'filename']
#     sig, sr = torchaudio.load(os.path.join(train_audio_dir, filename))
#     train_df.loc[i, 'num_channels'] = sig.shape[0]  # number of audio channels (mono/stereo)
#     train_df.loc[i, 'signal_len'] = sig.shape[1]  # signal length
#     train_df.loc[i, 'sampling_rate'] = sr

In [None]:
# train_df['num_channels'] = train_df['num_channels'].astype('int64')
# train_df['signal_len'] = train_df['signal_len'].astype('int64')
# train_df['sampling_rate'] = train_df['sampling_rate'].astype('int64')
# train_df.head()

In [None]:
# train_df.sampling_rate.unique()

All files share a single sampling rate: 32 kHz.

For a 1-second audio clip, the array will therefore hold 32,000 samples.

In [None]:
# sns.countplot(train_df.num_channels)

Most recordings have two channels, so we will convert mono (1-channel) audio to stereo (2-channel) audio by duplicating the channel.

In [None]:
def MonoToStereo(aud, num_channel=2):
    """Return the audio with exactly ``num_channel`` channels.

    Args:
        aud: Tuple ``(sig, sr)`` where ``sig`` is indexed as (channels, samples).
        num_channel: Number of channels wanted in the output.

    Returns:
        ``aud`` itself when it already has ``num_channel`` channels. Otherwise a
        new ``(sig, sr)`` tuple in which a single channel is replicated
        ``num_channel`` times; a multi-channel input is first averaged down to
        one channel.
    """
    sig, sr = aud
    if sig.shape[0] == num_channel:
        return aud

    if sig.shape[0] > 1:
        # Channel-count mismatch on a non-mono signal: average to mono first so
        # the output never ends up with more channels than requested.
        sig = sig.mean(dim=0, keepdim=True)

    # For mono input and num_channel=2 this equals torch.cat([sig, sig]).
    return (sig.repeat(num_channel, 1), sr)

In [None]:
# train_df.signal_len.min(), train_df.signal_len.max()

In [None]:
# f, ax = plt.subplots(figsize=(15,8))
# sns.distplot(train_df.signal_len, ax=ax)

All signals need to have the same length, so we will either pad or truncate each one.

In [None]:
# Fixed clip length for every recording (a hyperparameter to be tuned)
max_len_ms = 10000 # 10k milliseconds ~ 10 seconds

def pad_signal(aud, max_len_ms):
    """Truncate or zero-pad a signal so it lasts exactly ``max_len_ms`` milliseconds.

    Args:
        aud: Tuple ``(sig, sr)``; ``sig`` is a 2-D tensor (channels, samples) and
            ``sr`` the sample rate in Hz.
        max_len_ms: Target duration in milliseconds.

    Returns:
        ``(sig, sr)`` where ``sig`` has ``sr * max_len_ms // 1000`` samples.
        Longer signals keep their leading samples; shorter ones get zeros on
        both sides, with the split between front and back chosen at random.
    """
    sig, sr = aud
    num_channels, sig_len = sig.shape
    # Multiply before dividing: `sr // 1000` would drop the remainder for rates
    # such as 44100 Hz and make the clip shorter than requested.
    max_len = sr * max_len_ms // 1000

    if sig_len > max_len:
        sig = sig[:, :max_len]
    elif sig_len < max_len:
        # padding both sides of the signal
        pad_begin_len = random.randint(0, max_len - sig_len)
        pad_end_len = max_len - sig_len - pad_begin_len

        # pad with zeros that share the signal's dtype and device
        pad_begin = sig.new_zeros((num_channels, pad_begin_len))
        pad_end = sig.new_zeros((num_channels, pad_end_len))

        sig = torch.cat((pad_begin, sig, pad_end), 1)

    return (sig, sr)

# Data Augmentation on raw audio
下記リンクから実装内容を選択<br>
https://github.com/iver56/audiomentations/tree/master/audiomentations/augmentations

In [None]:
# Raw-waveform augmentation pipeline built with audiomentations.
# Only Gaussian noise is active (applied with probability 0.5); the other
# transforms are kept commented out so they can be switched on for experiments.
augment = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    #TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    #PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    #Shift(min_fraction=-0.5, max_fraction=0.5, p=0.5), 
])

In [None]:
# Convert augmented audio to Mel Spectrogram
def mel_spec(aug_data, n_mels=64, n_fft=1024, hop_len=None, sr=32000):
    """Turn a waveform into a decibel-scaled mel spectrogram with one extra channel.

    Args:
        aug_data: Waveform as a tensor or NumPy array, indexed (channels, samples).
        n_mels: Number of mel bins.
        n_fft: FFT size.
        hop_len: Hop length passed to ``MelSpectrogram`` (``None`` = library default).
        sr: Sample rate in Hz. Defaults to 32000, the single rate this notebook
            reports for the training audio. It is an explicit argument so the
            function no longer depends on a global ``audio`` variable.

    Returns:
        Tensor holding the per-channel spectrograms followed by one additional
        channel that is their mean.
    """
    # Accept NumPy output from the waveform augmenter as well as tensors.
    sig = torch.as_tensor(aug_data, dtype=torch.float32)
    top_db = 80

    spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)
    # shape of spec: (channels, n_mels, time)

    # Convert to decibels
    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)

    # Add channel: mean over the existing channels, appended along dim 0
    spec = torch.cat([spec, spec.mean(dim=0, keepdim=True)])
    return spec

In [None]:
# Data augmentation on mel spectrogram: Time and Frequency Masking
def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    """Apply frequency masks and then time masks to a spectrogram.

    Args:
        spec: 3-D spectrogram tensor; the last two dims are read as mel bins and
            time steps.
        max_mask_pct: Maximum mask width as a fraction of the masked dimension.
        n_freq_masks: How many frequency masks to apply.
        n_time_masks: How many time masks to apply.

    Returns:
        The masked spectrogram; masked cells hold the mean of the input. With
        both counts at zero the input tensor itself is returned.
    """
    _, mel_bins, time_steps = spec.shape
    fill_value = spec.mean()
    masked = spec

    # Frequency masks first, each at most max_mask_pct of the mel axis wide.
    max_freq_width = max_mask_pct * mel_bins
    for _ in range(n_freq_masks):
        freq_masker = transforms.FrequencyMasking(max_freq_width)
        masked = freq_masker(masked, fill_value)

    # Then time masks, each at most max_mask_pct of the time axis wide.
    max_time_width = max_mask_pct * time_steps
    for _ in range(n_time_masks):
        time_masker = transforms.TimeMasking(max_time_width)
        masked = time_masker(masked, fill_value)

    return masked

Preprocessing on one signal sample

In [None]:
# Pick one random row. randrange excludes the upper bound, whereas
# randint(0, len(train_df)) could return an index one past the last row.
idx = random.randrange(len(train_df))
filename = train_df.loc[idx,'filename']
sig, sr = torchaudio.load(os.path.join(train_audio_dir, filename))
audio_sample = (sig, sr)

# mono to stereo (if mono)
audio = MonoToStereo(audio_sample)

# Pad or Truncate
audio = pad_signal(audio, max_len_ms)

# Augment on raw audio. audiomentations works on NumPy arrays, so convert the
# waveform for the call and turn the result back into a float32 tensor.
aug_data = augment(samples=audio[0].numpy(), sample_rate=audio[1])
aug_data = torch.as_tensor(aug_data, dtype=torch.float32)

# Convert to Mel Spectrogram
spec = mel_spec(aug_data)

# resize Mel Spec
spec = torchvision.transforms.Resize((224, 224))(spec)

# Augment on mel spec
aug_spec = spectro_augment(spec)
aug_spec.shape

In [None]:
# Show every channel of the augmented spectrogram, one figure per channel.
# (Looping avoids plotting channel 1 twice and skipping the last channel.)
aug_spec_np = aug_spec.permute(1,2,0).numpy()
for channel in range(aug_spec_np.shape[2]):
    f, ax = plt.subplots(figsize=(15,8))
    ax.imshow(aug_spec_np[:, :, channel])
    ax.set_title(f'Augmented mel spectrogram - channel {channel}')
    plt.show()

# Building Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset yielding an augmented mel-spectrogram and a class id per recording.

    Args:
        audio_dir: Directory that the ``filename`` column is relative to.
        df: DataFrame with ``filename`` and ``primary_label`` columns.
        max_len_ms: Clip length in milliseconds that each signal is padded or cut to.
        shift_limit: Stored on the instance; not used by ``__getitem__``.
        input_size: Side length the spectrogram is resized to (square).
    """

    def __init__(self, audio_dir, df, max_len_ms=10000, shift_limit=0.4, input_size=224):
        self.train_audio_dir = audio_dir
        self.train_df = df
        self.max_len_ms = max_len_ms
        self.shift_limit = shift_limit
        self.input_size = input_size

    def __len__(self):
        """Number of rows in the metadata frame."""
        return len(self.train_df)

    def __getitem__(self, index):
        """Load, preprocess and augment the recording at position ``index``.

        Returns:
            Tuple ``(aug_spec, label_id)``: the augmented spectrogram tensor and
            a scalar tensor with the id from the module-level ``label2id``.
        """
        # Positional lookup agrees with __len__ even if the frame's index
        # labels are not 0..n-1.
        row = self.train_df.iloc[index]
        filename = row['filename']
        label = row['primary_label']

        sig, sr = torchaudio.load(os.path.join(self.train_audio_dir, filename))
        audio = (sig, sr)

        ##################
        # process signal #
        ##################

        # mono to stereo (if mono)
        audio = MonoToStereo(audio)

        # Pad or Truncate
        audio = pad_signal(audio, self.max_len_ms)

        # Augment raw audio. audiomentations works on NumPy arrays, so convert
        # for the call and turn the result back into a float32 tensor.
        aug_data = augment(samples=audio[0].numpy(), sample_rate=audio[1])
        aug_data = torch.as_tensor(aug_data, dtype=torch.float32)

        # Convert to Mel Spectrogram
        # NOTE(review): mel_spec is called without a sample-rate argument here.
        spec = mel_spec(aug_data)

        # resize Mel Spec
        spec = torchvision.transforms.Resize((self.input_size, self.input_size))(spec)

        # Augment mel spec
        aug_spec = spectro_augment(spec)

        return aug_spec, torch.tensor(label2id[label])

In [None]:
# Dataset hyperparameters. max_len_ms is re-assigned here with the same value
# used earlier; shift_limit is stored on the dataset but not read by __getitem__.
max_len_ms = 10000
shift_limit = 0.4
input_size = 224
dataset = CustomDataset(train_audio_dir, train_df, max_len_ms, shift_limit, input_size)

In [None]:
# 80/20 random split into training and validation subsets.
# NOTE(review): both subsets wrap the same CustomDataset, whose __getitem__
# applies the random augmentations, so validation items are augmented as well.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

In [None]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
inputs, targets = next(iter(train_loader))
inputs.shape, targets.shape

Cool!!

Let's build model architecture.

# Model Building

### model作成の関数
基本的に、モデルを変更する場合はここの修正を行う！

In [None]:
def create_model(labels, device):
    """Build a ResNet-50 classifier with one output per label.

    The head returns raw, unnormalised scores (logits). ``nn.CrossEntropyLoss``
    applies log-softmax itself, so putting a Sigmoid in front of it would squash
    the scores into (0, 1) and distort the loss. Apply softmax/sigmoid at
    inference time if probabilities are needed.

    Args:
        labels: Sequence of class labels; only its length is used.
        device: Device the model is moved to.

    Returns:
        The model on ``device``.
    """
    model = torchvision.models.resnet50(num_classes=len(labels))
    # Keep the Sequential wrapper so parameter names stay `fc.0.weight` /
    # `fc.0.bias`, matching checkpoints saved with the earlier head.
    model.fc = torch.nn.Sequential(
        torch.nn.Linear(
            in_features=model.fc.in_features,
            out_features=model.fc.out_features
        ),
    )
    return model.to(device)

### modelのチェック

In [None]:
model = create_model(labels, 'cpu')
model

In [None]:
model.eval()
output = model(inputs)
output.shape

In [None]:
LEARNING_RATE = 1e-4
# Backward-compatible alias for the original, misspelled constant name.
LERANING_RATE = LEARNING_RATE

# Defining loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Model Training

In [None]:
def train_epoch(model, data_loader, device, criterion, optimizer):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Network to train; switched to train mode.
        data_loader: Iterable of ``(x, y)`` batches.
        device: Device each batch is moved to.
        criterion: Loss called as ``criterion(output, y)``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Tuple ``(accuracy, mean_loss)`` over the whole pass.
    """
    model.train()

    batch_losses = []
    n_correct = 0
    n_seen = 0

    for batch_x, batch_y in tqdm(data_loader):
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        scores = model(batch_x)
        batch_loss = criterion(scores, batch_y)
        batch_losses.append(batch_loss.item())

        # Standard update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Predicted class = index of the highest score in each row.
        predicted = scores.argmax(dim=1)
        n_correct += (predicted.cpu() == batch_y.cpu()).sum().item()
        n_seen += predicted.size(0)

    accuracy = (n_correct * 1.0) / n_seen
    return accuracy, np.mean(batch_losses)

In [None]:
def val_epoch(model, data_loader, device, criterion):
    """Evaluate ``model`` on ``data_loader`` without updating any weights.

    Args:
        model: Network to evaluate; switched to eval mode.
        data_loader: Iterable of ``(x, y)`` batches.
        device: Device each batch is moved to.
        criterion: Loss called as ``criterion(output, y)``.

    Returns:
        Tuple ``(accuracy, mean_loss)`` over the whole pass.
    """
    model.eval()

    batch_losses = []
    n_correct = 0
    n_seen = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch_x, batch_y in tqdm(data_loader):
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)

            scores = model(batch_x)
            batch_losses.append(criterion(scores, batch_y).item())

            # Predicted class = index of the highest score in each row.
            predicted = scores.argmax(dim=1)
            n_correct += (predicted.cpu() == batch_y.cpu()).sum().item()
            n_seen += predicted.size(0)

    accuracy = (n_correct * 1.0) / n_seen
    return accuracy, np.mean(batch_losses)

In [None]:
def train(model, epochs, device, train_loader, val_loader, criterion, optimizer, model_name):
    """Train for ``epochs`` epochs and checkpoint the best validation accuracy.

    Args:
        model: Network to train.
        epochs: Number of epochs to run.
        device: Device batches are moved to.
        train_loader: Loader passed to ``train_epoch``.
        val_loader: Loader passed to ``val_epoch``.
        criterion: Loss function.
        optimizer: Optimizer.
        model_name: File path the best ``state_dict`` is written to.

    Returns:
        ``defaultdict(list)`` with per-epoch ``train_loss``, ``train_acc``,
        ``val_loss`` and ``val_acc``.
    """
    history = defaultdict(list)

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint. Starting at 0 with a strict `>` would save nothing if the
    # validation accuracy stayed at 0, and loading the checkpoint would fail.
    best_val_acc = float('-inf')

    for epoch in range(epochs):
        print(f'Epoch: {epoch + 1}/{epochs}')
        print('-' * 10)
        print('Training')
        train_acc, train_loss = train_epoch(model, train_loader, device, criterion, optimizer)
        print('\nValidating')
        val_acc, val_loss = val_epoch(model, val_loader, device, criterion)

        print(f'\nTrain Loss: {train_loss}\tTrain Acc: {train_acc}')
        print(f'Val Loss: {val_loss}\tVal Acc: {val_acc}')

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), model_name)

    return history

In [None]:
EPOCHS = 10
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# To avoid overwriting earlier checkpoints, tag the file with a chosen name or the current timestamp
suffix = None # Set this to any string to use a custom name instead of the timestamp
suffix = suffix or datetime.now().strftime("%Y%m%d%H%M")
model_name = os.path.join(model_dir, f"best_model_{suffix}.pth.tar")

In [None]:
model = model.to(DEVICE)
history = train(model, EPOCHS, DEVICE, train_loader, val_loader, criterion, optimizer, model_name)

# Model Testing

In [None]:
# map_location lets a checkpoint saved on GPU load on a CPU-only runtime as well.
model.load_state_dict(torch.load(model_name, map_location=DEVICE))