In [41]:
import warnings
warnings.filterwarnings(action='ignore') 

import os
import pickle
import random
import time
from collections import Counter, defaultdict
from functools import partial
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader

In [2]:
from google.colab import drive
# Mount Google Drive, then work from inside the dataset directory so the
# following cells can use bare file names.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Tacademy/Speech-2/Dataset')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# List the contents of the current (dataset) directory.
!ls

mels_train_curated.pkl	sample_submission.csv  train_curated.csv


In [5]:
# Collect the entries of the current directory.
# NOTE(review): os.listdir returns entries in arbitrary order, so positional
# indexing into this list is not stable across machines or filesystems.
filename = os.listdir('./')
filename

['mels_train_curated.pkl', 'train_curated.csv', 'sample_submission.csv']

In [7]:
# Read the sample submission by its explicit name rather than by its position
# in an os.listdir() result, whose ordering is not guaranteed.
test_df = pd.read_csv('sample_submission.csv')
test_df.head()

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,4260ebea.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,426eb1e0.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,428d70bb.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4292b1c9.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,429c5071.wav,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Every column after the first one is treated as a class name.
labels = list(test_df.columns[1:])
num_classes = len(labels)
num_classes

80

In [9]:
# Display the class names; a name's position in this list is its class index.
labels

['Accelerating_and_revving_and_vroom',
 'Accordion',
 'Acoustic_guitar',
 'Applause',
 'Bark',
 'Bass_drum',
 'Bass_guitar',
 'Bathtub_(filling_or_washing)',
 'Bicycle_bell',
 'Burping_and_eructation',
 'Bus',
 'Buzz',
 'Car_passing_by',
 'Cheering',
 'Chewing_and_mastication',
 'Child_speech_and_kid_speaking',
 'Chink_and_clink',
 'Chirp_and_tweet',
 'Church_bell',
 'Clapping',
 'Computer_keyboard',
 'Crackle',
 'Cricket',
 'Crowd',
 'Cupboard_open_or_close',
 'Cutlery_and_silverware',
 'Dishes_and_pots_and_pans',
 'Drawer_open_or_close',
 'Drip',
 'Electric_guitar',
 'Fart',
 'Female_singing',
 'Female_speech_and_woman_speaking',
 'Fill_(with_liquid)',
 'Finger_snapping',
 'Frying_(food)',
 'Gasp',
 'Glockenspiel',
 'Gong',
 'Gurgling',
 'Harmonica',
 'Hi-hat',
 'Hiss',
 'Keys_jangling',
 'Knock',
 'Male_singing',
 'Male_speech_and_man_speaking',
 'Marimba_and_xylophone',
 'Mechanical_fan',
 'Meow',
 'Microwave_oven',
 'Motorcycle',
 'Printer',
 'Purr',
 'Race_car_and_auto_racing',
 

In [10]:
# Read the curated training annotations by explicit file name rather than by
# position in an os.listdir() result, whose ordering is not guaranteed.
train_curated = pd.read_csv('train_curated.csv')
train_curated.head()

Unnamed: 0,fname,labels
0,0006ae4e.wav,Bark
1,0019ef41.wav,Raindrop
2,001ec0ad.wav,Finger_snapping
3,0026c7cb.wav,Run
4,0026f116.wav,Finger_snapping


In [11]:
# Load the pickled mel data by explicit file name rather than by position in
# an os.listdir() result, whose ordering is not guaranteed.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('mels_train_curated.pkl', 'rb') as curated:
    x_train = pickle.load(curated)

In [16]:
x_train[5].shape  # mel data with 128 mel dimensions

(128, 785, 3)

In [13]:
# Build the multi-hot target matrix: one row per training clip, one column per
# class, with a 1 wherever the clip carries that label.
# The name -> column lookup is hoisted into a dict so each label is resolved in
# constant time instead of a linear `list.index` scan.
label_to_idx = {name: idx for idx, name in enumerate(labels)}

y_train = np.zeros((len(train_curated), num_classes), dtype=int)
# The `labels` column holds comma-separated class names.
for i, row in enumerate(train_curated['labels'].str.split(',')):
    for label in row:
        y_train[i, label_to_idx[label]] = 1

y_train.shape

(4970, 80)

# 평가지표 정리

In [39]:
def _one_sample_positive_class_precisions(scores, truth):
    """Compute the precision at the rank of each true class for a single sample.

    Args:
        scores : np.array of (num_classes, ) giving the individual classifier scores.
        truth  : np.array of (num_classes, ) bools indicating which classes are true.
    Return:
        pos_class_indices    : np.array of indices of the true classes for this sample.
        pos_class_precisions : np.array of precisions corresponding to each of those classes.
    """
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)

    # A sample without any true class contributes no precision values.
    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)

    # Class indices ordered from the highest score to the lowest.
    retrieved_classes = np.argsort(scores)[::-1]
    # class_rankings[c] is the 0-based rank of class c in that ordering.
    # Builtin int/bool/float are used because the np.int / np.bool / np.float
    # aliases no longer exist in current NumPy releases.
    class_rankings = np.zeros(num_classes, dtype=int)
    class_rankings[retrieved_classes] = np.arange(num_classes)

    # Mark the rank positions that are occupied by a true class.
    retrieved_class_true = np.zeros(num_classes, dtype=bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True
    # Number of true classes found at or above each rank position.
    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)

    # Precision at the rank of each true class: hits so far / (rank + 1).
    precision_at_hits = (
        retrieved_cumulative_hits[class_rankings[pos_class_indices]] /
        (1 + class_rankings[pos_class_indices].astype(float))
    )

    return pos_class_indices, precision_at_hits


def calculate_per_class_lwlrap(truth, scores):
    """Calculate label-weighted label-ranking average precision.

    Arguments:
        truth  : np.array of (num_samples, num_classes) giving boolean ground-truth
                of presence of that class in that sample.
        scores : np.array of (num_samples, num_classes) giving the classifier-under-
                test's real-valued score for each class for each sample.

    Returns:
        per_class_lwlrap : np.array of (num_classes, ) giving the lwlrap for each
                          class.
        weight_per_class : np.array of (num_classes, ) giving the prior of each
                          class within the truth labels. Then the overall unbalanced lwlrap is
                          simply np.sum(per_class_lwlrap * weight_per_class).
                          All zeros when `truth` contains no positive label at all.

    """
    assert truth.shape == scores.shape

    num_samples, num_classes = scores.shape
    # Entry [s, c] holds the precision at the rank of class c for sample s when
    # c is a true class of s, and stays 0 otherwise.
    precisions_for_samples_by_classes = np.zeros((num_samples, num_classes))

    for sample_num in range(num_samples):
        pos_class_indices, precision_at_hits = (
            _one_sample_positive_class_precisions(scores[sample_num, :],
                                                  truth[sample_num, :]))
        precisions_for_samples_by_classes[sample_num, pos_class_indices] = (
            precision_at_hits)

    labels_per_class = np.sum(truth > 0, axis=0)
    total_labels = np.sum(labels_per_class)
    if total_labels > 0:
        weight_per_class = labels_per_class / float(total_labels)
    else:
        # No positive label anywhere: avoid 0/0, which would yield NaN weights.
        weight_per_class = np.zeros(num_classes)

    # np.maximum(1, ...) keeps classes that never occur from dividing by zero.
    per_class_lwlrap = (np.sum(precisions_for_samples_by_classes, axis=0) /
                        np.maximum(1, labels_per_class))

    return per_class_lwlrap, weight_per_class

In [42]:
# Sanity check on one sample: classes 0 and 2 are true. Class 0 has the top
# score and class 2 the lowest of three, giving precisions 1 and 2/3.
_one_sample_positive_class_precisions(np.array([0.7, 0.3, 0.1]), np.array([1, 0, 1]))

(array([0, 2]), array([1.        , 0.66666667]))

In [43]:
# Worked example: two samples that share the same scores but have different
# true classes.
y_true = np.array([[1, 0, 1], [0, 1, 1]])
y_score = np.array([[0.1, 0.7, 0.2], [0.1, 0.7, 0.2]])

# Per-sample precisions at the rank of each true class.
_, precision_at_hits1 = _one_sample_positive_class_precisions(y_score[0], y_true[0])
print("sample 1 Score", precision_at_hits1)

_, precision_at_hits2 = _one_sample_positive_class_precisions(y_score[1], y_true[1])
print("sample 2 Score", precision_at_hits2)

# Per-class lwlrap together with each class's share of all true labels.
score, weight = calculate_per_class_lwlrap(y_true, y_score)
print("Each class score", score)
print("Weight of each class", weight)

# The overall metric is the weight-averaged per-class score.
LwLRAP = (score * weight).sum()
print("LwLRAP", LwLRAP)

sample 1 Score [0.66666667 0.5       ]
sample 2 Score [1. 1.]
Each class score [0.66666667 1.         0.75      ]
Weight of each class [0.25 0.25 0.5 ]
LwLRAP 0.7916666666666666


# Model 구성

In [50]:
class ConvBlock(nn.Module):
    """Four-stage convolutional feature extractor.

    Stages 1 and 2 are Conv -> BatchNorm -> ELU -> MaxPool(4); stage 3 is
    Conv -> ELU; stage 4 is a bare Conv. All convolutions are 3x3, stride 1,
    without padding. A final 2x2 average pooling is applied in `forward`.

    Args:
        in_channels: number of channels of the input.
        out_channels: number of channels produced by the last convolution.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, 8, kernel_size=3, stride=1, padding=0),
            nn.BatchNorm2d(8),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=4)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(8, 16, kernel_size=3, stride=1, padding=0),
            nn.BatchNorm2d(16),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=4)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(16, 64, kernel_size=3, stride=1, padding=0),
            nn.ELU(),
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(64, out_channels, kernel_size=3, stride=1, padding=0),
        )

        self._init_weights()

    def _init_weights(self):
        """Kaiming-normal conv weights with zero biases; BatchNorm scale 1, shift 0."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        """Run the block on a channels-last batch.

        Args:
            x: 4-D tensor whose last axis is the channel axis; it is permuted
               to channels-first before the convolutions.

        Returns:
            The channels-first feature map after the four stages and a 2x2
            average pooling.
        """
        # Move the channel axis from last to second position for Conv2d.
        x = x.permute(0, 3, 1, 2)
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = F.avg_pool2d(x, 2)
        return x

In [51]:
class Classifier(nn.Module):
    """Convolutional feature extractor followed by a fully connected head.

    Args:
        num_classes: size of the output layer (one logit per class).
    """

    def __init__(self, num_classes):
        super().__init__()

        # Feature extractor: a single ConvBlock producing 64 channels, kept
        # inside a Sequential container.
        self.conv = nn.Sequential(ConvBlock(in_channels=3, out_channels=64))

        # Classification head: 64 -> 128 -> num_classes.
        head_layers = [
            nn.Dropout(0.2),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.1),
            nn.Linear(128, num_classes),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return one raw (pre-sigmoid) score per class for every sample in `x`."""
        features = self.conv(x)
        # Collapse everything except the batch axis before the linear layers.
        flattened = features.view(features.size(0), -1)
        return self.fc(flattened)

In [52]:
class FATTrainDataset(Dataset):
    """Dataset pairing each mel array with its label vector.

    Args:
        mels: indexable collection of arrays; each item is cropped along its
              second axis when fetched.
        labels: indexable collection of NumPy label vectors, aligned with `mels`.
    """

    def __init__(self, mels, labels):
        super().__init__()
        self.mels, self.labels = mels, labels

    def __len__(self):
        """Number of samples, taken from the mel collection."""
        return len(self.mels)

    def __getitem__(self, idx):
        """Return `(clip, target)` for sample `idx` as float tensors."""
        # Keep only the first 128 positions along the second axis.
        clip = torch.Tensor(self.mels[idx][:, :128, :])
        target = torch.from_numpy(self.labels[idx]).float()
        return clip, target

In [53]:
# Take the class count from the width of the label matrix, then hold out 20%
# of the samples for validation using a fixed seed.
_, num_classes = y_train.shape
x_trn, x_val, y_trn, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=970711,
)

In [54]:
# Wrap each split in a dataset; only the training loader shuffles its samples.
train_dataset = FATTrainDataset(mels=x_trn, labels=y_trn)
valid_dataset = FATTrainDataset(mels=x_val, labels=y_val)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=False)

In [55]:
# Bookkeeping for the best validation score seen so far.
best_epoch = -1
best_lwlrap = 0.
# Training hyperparameters.
num_epochs = 80
# NOTE(review): the DataLoaders are built with a literal batch size of 64 and
# `batch_size` is not referenced in the visible cells — confirm whether 8 is intended.
batch_size = 8
test_batch_size = 64
lr = 3e-3

# The model and the loss are moved to the GPU, so a CUDA device is required.
model = Classifier(num_classes=num_classes).cuda()
# BCEWithLogitsLoss takes raw logits; the model's head ends in a Linear layer.
criterion = nn.BCEWithLogitsLoss().cuda()
optimizer = Adam(params=model.parameters(), lr=lr, amsgrad=False)

In [56]:
from fastprogress import master_bar, progress_bar

# Train for `num_epochs` epochs. After every epoch the model is scored on the
# validation split, and the weights with the best lwlrap so far are saved.
mb = master_bar(range(num_epochs))
for epoch in mb:
    start_time = time.time()

    # ---- training pass ----
    model.train()
    avg_loss = 0.

    for x_batch, y_batch in progress_bar(train_loader, parent=mb):
        preds = model(x_batch.cuda())
        loss = criterion(preds, y_batch.cuda())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running mean of the per-batch losses.
        avg_loss += loss.item() / len(train_loader)

    # ---- validation pass ----
    model.eval()
    valid_preds = np.zeros((len(x_val), num_classes))
    avg_val_loss = 0.
    # Write position in `valid_preds`; advancing by the actual batch length
    # keeps the rows aligned whatever batch size the loader uses.
    offset = 0

    # No gradients are needed for evaluation, so skip building the autograd graph.
    with torch.no_grad():
        for x_batch, y_batch in valid_loader:
            preds = model(x_batch.cuda())
            loss = criterion(preds, y_batch.cuda())

            # The loss takes logits; the metric is computed on sigmoid outputs.
            probs = torch.sigmoid(preds).cpu().numpy()
            valid_preds[offset:offset + len(probs)] = probs
            offset += len(probs)

            avg_val_loss += loss.item() / len(valid_loader)

    score, weight = calculate_per_class_lwlrap(y_val, valid_preds)
    lwlrap = (score * weight).sum()

    # Report every fifth epoch only.
    if (epoch + 1) % 5 == 0:
        elapsed = time.time() - start_time
        mb.write(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} val_lwlrap: {lwlrap:.6f} time {elapsed:.0f}s')

    # Checkpoint whenever the validation metric improves.
    if lwlrap > best_lwlrap:
        best_epoch = epoch + 1
        best_lwlrap = lwlrap
        torch.save(model.state_dict(), 'weight_best.pt')

# Summary of the best epoch found during training.
temp = {
    'best_epoch': best_epoch,
    'best_lwlrap': best_lwlrap,
}