In [1]:
import torch
from pathlib import Path
import pandas as pd
from freesound.preprocessor import Preprocessor
import os
from functools import partial
from psutil import cpu_count
from fastprogress import progress_bar
from freesound.utils.general import load_yaml
from torchvision import transforms
import numpy as np
from torch.utils.data import Dataset, DataLoader

# psutil.cpu_count() returns None when the CPU count cannot be determined;
# fall back to 1 so str(N_JOBS) and N_JOBS * 8 below stay valid.
N_JOBS = cpu_count() or 1
# NOTE(review): torch and numpy are imported before these variables are set;
# MKL/OpenMP typically read them when the library is loaded, so they may have
# no effect on this process (worker processes inherit them) — confirm.
os.environ['MKL_NUM_THREADS'] = str(N_JOBS)
os.environ['OMP_NUM_THREADS'] = str(N_JOBS)
# Shadow DataLoader so every loader built in this notebook uses N_JOBS workers.
DataLoader = partial(DataLoader, num_workers=N_JOBS)

In [2]:
# Window length along the time axis; used as the default for `inp_size` in
# get_fnames_and_starts and for `desired_length` in DS.
INP_SIZE = 128
# Batch size passed to the DataLoader; scales with the number of workers.
BATCH_SIZE = N_JOBS * 8
# Base name of the YAML file under config/preprocessing/ to load.
PREPROCESSOR_CONFIG_NAME = 'tf_2048'

In [None]:
# NOTE(review): scratch cell that was never executed (`In [None]`). `path` is not
# defined anywhere in the visible notebook and `path_to_im_fn_tf` is only defined
# in a later cell, so this raises NameError on a fresh Restart & Run All.
cfg = {'sr': 44100, 'fft_length': 2048}
path_to_im_fn_tf(path, cfg)

In [3]:
import os
from pathlib import Path
from scipy.io import wavfile
from fastprogress import progress_bar
import tensorflow as tf
import numpy as np

# TF1-style configuration: let TensorFlow grow its GPU allocation on demand.
# Presumably so the PyTorch model used later can share the same GPU — confirm.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Eager execution is needed because path_to_logmelspect calls `.numpy()` on
# the tensor returned by SpectGetter.signal_to_melspect.
tf.enable_eager_execution(config=config)


class SpectGetter:
    """Computes mel-scaled magnitude spectrograms from audio with TensorFlow.

    Parameters
    ----------
    frame_length : int
        STFT window length, passed to ``tf.contrib.signal.stft``.
    frame_step : int
        STFT hop, passed to ``tf.contrib.signal.stft``.
    fft_length : int
        FFT size, passed to ``tf.contrib.signal.stft``.
    lower_edge_hertz, upper_edge_hertz : number
        Frequency range covered by the mel filterbank.
    num_mel_bins : int
        Number of mel bands in the output.
    sr : int, keyword-only
        Sample rate handed to the mel filterbank. Defaults to 44100, the value
        that used to be hard-coded, so existing callers see no change; a
        ``sr`` entry in a config dict is now honoured instead of being dropped.
    *args, **kwargs
        Accepted and ignored so a config dict with extra keys can be splatted in.
    """

    def __init__(self,
                 frame_length=343 * 2,
                 frame_step=343,
                 fft_length=1024,
                 lower_edge_hertz=50,
                 upper_edge_hertz=9000,
                 num_mel_bins=128,
                 *args, sr=44100, **kwargs):
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.lower_edge_hertz = lower_edge_hertz
        self.upper_edge_hertz = upper_edge_hertz
        self.num_mel_bins = num_mel_bins
        self.sr = sr

    def signal_to_melspect(self, signals, *args, **kwargs):
        """Return the mel spectrogram of ``signals``.

        ``signals`` is fed straight to ``tf.contrib.signal.stft``; the caller
        in this notebook passes a float32 tensor reshaped to ``[1, -1]``.
        Extra positional/keyword arguments are ignored.

        As a side effect, ``self.num_spectrogram_bins`` and
        ``self.linear_to_mel_weight_matrix`` are (re)assigned on every call.
        """
        stfts = tf.contrib.signal.stft(
            signals,
            frame_length=self.frame_length,
            frame_step=self.frame_step,
            fft_length=self.fft_length
        )

        magnitude_spectrograms = tf.abs(stfts)
        self.num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
        # Warp the linear-scale magnitude spectrograms into the mel scale,
        # using the configured sample rate rather than a fixed 44100.
        self.linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
            self.num_mel_bins, self.num_spectrogram_bins, self.sr,
            self.lower_edge_hertz, self.upper_edge_hertz)

        mel_spectrograms = tf.tensordot(
            magnitude_spectrograms, self.linear_to_mel_weight_matrix, 1)

        return mel_spectrograms


def path_to_logmelspect(audio_file, config):
    """Read a WAV file and return its log-mel spectrogram as a uint8 image.

    Parameters
    ----------
    audio_file : path-like
        WAV file handed to ``scipy.io.wavfile.read``.
    config : dict
        Keyword arguments for ``SpectGetter``.

    Returns
    -------
    numpy.ndarray
        uint8 array of shape ``[1, freqbins, time]`` with values in 0..255.
        When more than 128 frames are non-silent, leading and trailing silent
        frames are trimmed.
    """
    sg = SpectGetter(**config)
    # The sample rate stored in the file is not used below.
    _file_sr, samples = wavfile.read(audio_file)
    # NOTE(review): dividing by 32768 assumes 16-bit PCM samples — confirm
    # before feeding other sample formats.
    sig = tf.reshape((samples / 32768).astype(np.float32), [1, -1])
    S = sg.signal_to_melspect(sig).numpy()
    # The former `S + 1e-80` underflows to `S + 0` in float32, so silent bins
    # reached log10(0) and raised a divide-by-zero RuntimeWarning. Flooring at
    # the dtype's smallest normal value gives the same clipped result (-80).
    floor = np.finfo(S.dtype).tiny
    S = np.maximum(-80., 10 * np.log10(np.maximum(S, floor)) - 10 * 3.4)
    S = (S + 80.) / 80.
    S = np.transpose(S, [0, 2, 1])  # [1, freqbins, time]
    S = (S * 255).astype(np.uint8)
    # Frames whose mean over the frequency axis is non-zero count as non-silent.
    whr = np.where(S[0].mean(0) > 0)[0]
    if len(whr) > 128:
        # Slice end is exclusive: +1 keeps the last non-silent frame, which
        # the previous `whr[0]:whr[-1]` dropped.
        S = S[:, :, whr[0]:whr[-1] + 1]
    return S


def path_to_im_fn_tf(wavpath, config):
    """Alias for ``path_to_logmelspect``.

    Forwards ``wavpath`` and ``config`` unchanged and returns whatever
    ``path_to_logmelspect`` returns.
    """
    spectrogram = path_to_logmelspect(wavpath, config)
    return spectrogram


In [4]:
def get_fnames_and_starts(preproc, hop_size=64, inp_size=INP_SIZE):
    """List every (file, window start) pair to run inference on.

    Parameters
    ----------
    preproc
        Object exposing a ``cache`` mapping of file keys and ``preproc[key]``
        returning an array whose axis 2 is treated as time.
    hop_size : int
        Controls the window count: ``time_dim // hop_size`` windows per file.
    inp_size : int
        Window length. Files no longer than this get one window at start 0.

    Returns
    -------
    list of two numpy arrays
        ``[fnames, starts]``, aligned element-wise. Files are visited in
        reverse-sorted key order.
    """
    starts, fnames = [], []
    for key in sorted(preproc.cache.keys(), reverse=True):
        # Look the spectrogram up once instead of twice per file.
        time_dim = preproc[key].shape[2]
        if time_dim > inp_size:
            # Always emit at least one window: with hop_size > time_dim the
            # integer division is 0 and the file used to vanish from the
            # output, which breaks the per-file lookup when averaging.
            num_windows = max(1, time_dim // hop_size)
            # Evenly spaced starts from 0 to the last position that still fits.
            starts_this = np.linspace(0, time_dim - inp_size, num_windows).round().astype(int)
            fnames += [key] * len(starts_this)
            starts += starts_this.tolist()
        else:
            fnames.append(key)
            starts.append(0)
    return [np.array(fnames), np.array(starts)]

class DS(Dataset):
    """Dataset yielding one fixed-width spectrogram window per index.

    ``fnames[i]`` selects the spectrogram from ``preproc`` and ``starts[i]``
    is the offset along the time axis at which the window begins.

    NOTE(review): torchvision's ``ToTensor`` already rescales uint8 arrays to
    [0, 1]; if ``preproc`` returns uint8 data, the extra ``div_(255)`` in
    ``prep_img`` scales a second time — confirm this matches training.
    """

    def __init__(self, preproc, fnames, starts, desired_length=INP_SIZE):
        super().__init__()
        self.preproc, self.fnames, self.starts = preproc, fnames, starts
        self.desired_length = desired_length
        self.transforms = transforms.ToTensor()

    def __len__(self):
        return len(self.fnames)

    def prep_img(self, image, crop):
        """Make ``image`` 3-channel, fix its width, and convert it to a tensor.

        ``image`` is channel-first. Wider inputs are cut to ``desired_length``
        columns starting at ``crop``; narrower inputs are centred in a
        zero-filled canvas; exact-width inputs pass through untouched.
        """
        # Replicate a single channel three times, then go channel-last.
        if image.shape[0] == 1:
            image = np.tile(image, [3, 1, 1])
        image = np.transpose(image, [1, 2, 0])

        target = self.desired_length
        width = image.shape[1]
        if width > target:
            image = image[:, crop:crop + target]
        elif width < target:
            canvas = np.zeros([image.shape[0], target, *image.shape[2:]],
                              dtype=image.dtype)
            offset = (target - width) // 2
            canvas[:, offset:offset + width] = image
            image = canvas

        return self.transforms(image).div_(255)

    def __getitem__(self, idx):
        fname, start = self.fnames[idx], self.starts[idx]
        return self.prep_img(self.preproc[fname], start)

In [5]:
# Sample submission: first column is `fname`, the remaining columns are labels.
sample_csv_path = Path('/media/hd1/Liam/fs/input/freesound-audio-tagging-2019/sample_submission.csv')
df_sample = pd.read_csv(sample_csv_path)
all_wavnames = df_sample['fname'].values
labels = list(df_sample.columns[1:])

In [7]:
# Display the preprocessing config selected by PREPROCESSOR_CONFIG_NAME.
load_yaml('config/preprocessing/{}.yaml'.format(PREPROCESSOR_CONFIG_NAME))

{'path_to_im_fn_name': 'tf', 'params': {'sr': 44100, 'fft_length': 2048}}

In [5]:
# Build the preprocessor from the selected YAML config, then fill its cache
# for every file listed in the sample submission.
preproc_cfg_path = 'config/preprocessing/{}.yaml'.format(PREPROCESSOR_CONFIG_NAME)
preproc_cfg = load_yaml(preproc_cfg_path)
preproc = Preprocessor(dont_load=True, config=preproc_cfg)
preproc.fill_cache(all_wavnames)

In [6]:
# Load the serialized TorchScript model used for inference.
model = torch.jit.load('ckpts/trace.pt')

In [7]:
# One dataset item per (file, window start) pair.
fnames, starts = get_fnames_and_starts(preproc)
ds = DS(preproc, fnames, starts)
# shuffle=False keeps batch order aligned with `fnames`, which is later used
# as the row index of the stacked predictions.
loader = DataLoader(ds, shuffle=False, batch_size=BATCH_SIZE)

In [8]:
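# Alternative, currently disabled: rebuild the eager Classifier and load a
# state-dict checkpoint instead of the TorchScript trace loaded above.
# from freesound.archis.large import Classifier
# model = Classifier(80)
# model.load_state_dict(torch.load('ckpts/student/e14eb8b559752c4e/weight_epoch413.pt'))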

In [9]:
# Run the model over every window and stack the sigmoid outputs row-wise,
# in loader order.
model = model.cuda().eval()
pred_chunks = []
with torch.no_grad():
    for x in progress_bar(loader):
        pred_chunks.append(torch.sigmoid(model(x.cuda())).cpu().numpy())
# Concatenate once at the end; concatenating inside the loop re-copied every
# earlier batch on each step. An empty loader still yields None, as before.
preds_all = np.concatenate(pred_chunks, 0) if pred_chunks else None

In [10]:
def average_preds_and_add_to_sample(preds_all, df_sample, window_fnames=None):
    """Average window-level predictions per file and write them into ``df_sample``.

    Parameters
    ----------
    preds_all : array-like, shape (n_windows, n_labels)
        One row of predictions per window.
    df_sample : pandas.DataFrame
        Frame with a ``fname`` column followed by the label columns. It is
        modified in place and also returned.
    window_fnames : sequence, optional
        File name of each row of ``preds_all``. When omitted, the
        notebook-level ``fnames`` variable is used, which was the previous
        (implicit) behaviour.

    Returns
    -------
    pandas.DataFrame
        ``df_sample`` with every label column replaced by the per-file mean.

    Raises
    ------
    KeyError
        If a file in ``df_sample['fname']`` has no predictions.
    """
    if window_fnames is None:
        # Backward-compatible fallback to the notebook global.
        window_fnames = fnames
    df = pd.DataFrame(preds_all, index=window_fnames)
    df.index.name = 'fname'
    df = df.groupby('fname').mean()
    labels = df_sample.columns[1:].tolist()
    # Reorder the per-file means to match the row order of df_sample.
    df_sample[labels] = df.loc[df_sample['fname']].values
    return df_sample

In [11]:
# Fill the label columns of the sample submission with per-file mean
# predictions (df_sample is modified in place), then write the CSV without
# the row index.
df_sample = average_preds_and_add_to_sample(preds_all, df_sample)
df_sample.to_csv('submission400to450.csv', index=False)

In [12]:
# Display the filled-in submission table.
df_sample

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,...,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,000ccb97.wav,0.000373,8.978759e-06,1.639552e-04,0.000597,0.000482,1.275773e-02,1.289494e-04,0.005711,3.475341e-04,...,0.000633,0.000082,5.110742e-04,0.002271,1.474230e-03,0.000293,1.347977e-03,3.852473e-03,0.002625,0.002529
1,0012633b.wav,0.084659,4.910018e-03,7.780921e-03,0.010794,0.003816,3.043076e-02,4.130356e-02,0.009194,6.060930e-04,...,0.014949,0.014903,2.154123e-03,0.017923,1.380114e-02,0.010445,4.894793e-03,1.106718e-02,0.002072,0.051454
2,001ed5f1.wav,0.002526,9.001810e-05,1.297364e-03,0.001153,0.001311,8.982476e-02,9.375236e-04,0.001691,3.558788e-03,...,0.001222,0.000918,1.813571e-03,0.029775,1.276248e-03,0.001507,3.705875e-04,1.954583e-03,0.000908,0.005144
3,00294be0.wav,0.000082,7.651154e-07,8.673888e-05,0.000017,0.000406,3.197947e-04,1.635475e-05,0.000084,7.721506e-04,...,0.000260,0.000088,1.690436e-04,0.001187,2.088904e-04,0.000391,1.014708e-04,2.852833e-03,0.000170,0.007631
4,003fde7a.wav,0.000121,3.447088e-06,1.363830e-04,0.000170,0.000305,2.201384e-04,4.520089e-05,0.000328,8.663869e-01,...,0.001509,0.000249,2.289581e-04,0.000235,7.493175e-04,0.000025,1.163918e-04,1.579839e-04,0.000404,0.000155
5,0040ccc9.wav,0.002052,1.139872e-02,4.459304e-03,0.000135,0.000288,7.490613e-02,1.934681e-02,0.000057,6.317490e-06,...,0.000234,0.002259,5.345864e-05,0.001557,5.278379e-05,0.000059,1.097072e-03,4.590161e-04,0.000491,0.000753
6,0046b732.wav,0.001014,1.820211e-05,7.969465e-04,0.000217,0.003746,6.121861e-03,7.717916e-05,0.001484,6.280867e-03,...,0.002637,0.012376,4.596799e-04,0.002260,8.753532e-04,0.000578,5.901331e-04,2.711879e-03,0.004010,0.004385
7,004f3bbc.wav,0.003564,5.000666e-05,4.197586e-02,0.000150,0.000283,1.691683e-01,4.833388e-01,0.000067,1.705477e-03,...,0.000261,0.000311,1.602941e-04,0.000478,2.354035e-04,0.000128,2.177169e-04,6.886101e-04,0.000171,0.000574
8,00526050.wav,0.000244,1.689644e-07,1.874828e-06,0.000013,0.000003,9.746875e-01,8.663461e-05,0.000001,5.294952e-06,...,0.000004,0.000107,7.084560e-08,0.000168,2.128381e-07,0.000006,2.300973e-05,5.471462e-05,0.000022,0.000012
9,00559da4.wav,0.001523,9.960811e-06,2.588708e-05,0.000692,0.002024,9.128149e-04,6.141451e-05,0.002172,4.027354e-04,...,0.000535,0.000431,2.022945e-03,0.087469,8.010365e-04,0.008459,5.284685e-03,1.030699e-01,0.001512,0.255662
