In [96]:
import os

import librosa
from sklearn import svm
import sklearn.model_selection as model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from statistics import mode

In [81]:
# --- Configuration -----------------------------------------------------------
# Root folder of the dataset. It defaults to the original local location but can
# be redirected without editing the notebook by setting the
# ACOUSTIC_EVENTS_DATA_DIR environment variable before starting the kernel.
DATA_DIR = os.environ.get(
    'ACOUSTIC_EVENTS_DATA_DIR',
    '/Users/maksimkoltugin/Desktop/код/ВШЭ ЦОС/Акустические события',
)
TRAIN_CSV_PATH = os.path.join(DATA_DIR, 'train.csv')  # table with `fname` / `label` columns
TRAIN_WAV_FOLDER_PATH = os.path.join(DATA_DIR, 'train')  # folder holding the audio clips

# Length of one analysis window in samples. The clips are loaded with
# librosa.load() without an explicit `sr`, i.e. at librosa's default rate of
# 22050 Hz, so one window covers one second of audio.
SAMPLES_PER_WINDOW = 22050
# Step between consecutive windows, in samples: half a window (50 % overlap).
# Derived from SAMPLES_PER_WINDOW so the two values cannot drift apart.
HOP_SLICING = SAMPLES_PER_WINDOW // 2
HOP_SPEC = 1024  # number of samples per time-step in spectrogram
N_MELS = 100  # number of bins in spectrogram

In [82]:
# Label table, ordered by file name.
train_csv = pd.read_csv(TRAIN_CSV_PATH).sort_values(by=['fname'])

# The working-directory change is kept from the original cell in case other
# cells rely on it; the cells shown here always build absolute paths.
os.chdir(TRAIN_WAV_FOLDER_PATH)

# Keep only the audio clips. Filtering on the extension (instead of dropping the
# first directory entry) stays correct whether or not a ".DS_Store" or any other
# stray file is present in the folder.
sorted_wav_names = sorted(
    entry for entry in os.listdir() if entry.lower().endswith('.wav')
)

# Pair every CSV row with its audio file by name rather than by position, so a
# label can never be attached to the wrong clip. A file is found either by its
# full name or by its name without the extension.
wav_lookup = {}
for wav_name in sorted_wav_names:
    wav_lookup[wav_name] = wav_name
    wav_lookup.setdefault(os.path.splitext(wav_name)[0], wav_name)

csv_names = [os.path.basename(str(fname)) for fname in train_csv['fname']]
missing_names = [csv_name for csv_name in csv_names if csv_name not in wav_lookup]
if missing_names:
    raise ValueError(
        f'{len(missing_names)} file(s) listed in {TRAIN_CSV_PATH} were not found in '
        f'{TRAIN_WAV_FOLDER_PATH}, e.g. {missing_names[:5]}'
    )
labelled_wav_names = [wav_lookup[csv_name] for csv_name in csv_names]

# 70 / 30 split of (file name, label) pairs; the fixed seed keeps it reproducible.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    labelled_wav_names,
    list(train_csv['label']),
    train_size=0.70,
    test_size=0.30,
    random_state=101,
)

N = len(X_train)  # number of training clips, used for progress reporting

In [119]:
def _log_mel_windows(signal, rate):
    """Yield one flattened log-mel spectrogram per sliding window of `signal`.

    The signal is zero-padded at the end up to a whole number of
    SAMPLES_PER_WINDOW, then cut into windows of SAMPLES_PER_WINDOW samples
    whose starts are HOP_SLICING samples apart.
    """
    remainder = len(signal) % SAMPLES_PER_WINDOW
    if remainder:
        signal = np.pad(signal, (0, SAMPLES_PER_WINDOW - remainder))

    last_start = len(signal) - SAMPLES_PER_WINDOW
    for start in range(0, last_start + HOP_SLICING, HOP_SLICING):
        frame = signal[start:start + SAMPLES_PER_WINDOW]
        spectrogram = librosa.feature.melspectrogram(
            y=frame,
            sr=rate,
            n_mels=N_MELS,
            n_fft=HOP_SPEC * 2,
            hop_length=HOP_SPEC,
        )
        # The small offset keeps the logarithm finite for silent bins.
        yield np.log(spectrogram + 1e-9).flatten()


X_long = []  # one feature vector per window, over all training clips
Y_long = []  # class of the clip each window was cut from

for file_index, (wav_name, label) in enumerate(zip(X_train, Y_train)):
    audio, rate = librosa.load(os.path.join(TRAIN_WAV_FOLDER_PATH, wav_name))

    clip_features = list(_log_mel_windows(audio, rate))
    X_long.extend(clip_features)
    # Every window inherits the label of its clip.
    Y_long.extend([label] * len(clip_features))

    if not file_index % 300:
        print(file_index, 'from', N)

0 from 3978
300 from 3978
600 from 3978
900 from 3978
1200 from 3978
1500 from 3978
1800 from 3978
2100 from 3978
2400 from 3978
2700 from 3978
3000 from 3978
3300 from 3978
3600 from 3978
3900 from 3978


In [122]:
# Support-vector classifier with a cubic polynomial kernel, trained on the
# per-window feature vectors and their clip labels.
# NOTE(review): the features are passed in unscaled; consider standardising them
# before the SVM -- TODO confirm whether that improves the scores.
model = svm.SVC(kernel='poly', degree=3)
model.fit(X_long, Y_long)

In [123]:
# For clip number j, its windows occupy X_long_test[Windows_start[j]:Windows_end[j]].
Windows_start = []
Windows_end = []
X_long_test = []  # one feature vector per window, over all held-out clips

k = 0  # running count of windows collected so far
for file_index, wav_name in enumerate(X_test):
    audio, rate = librosa.load(os.path.join(TRAIN_WAV_FOLDER_PATH, wav_name))

    # Zero-pad the tail so the clip length is a whole number of windows.
    remainder = len(audio) % SAMPLES_PER_WINDOW
    if remainder:
        audio = np.pad(audio, (0, SAMPLES_PER_WINDOW - remainder))

    Windows_start.append(k)
    window_starts = range(0, len(audio) - SAMPLES_PER_WINDOW + HOP_SLICING, HOP_SLICING)
    for start in window_starts:
        frame = audio[start:start + SAMPLES_PER_WINDOW]
        spectrogram = librosa.feature.melspectrogram(
            y=frame,
            sr=rate,
            n_mels=N_MELS,
            n_fft=HOP_SPEC * 2,
            hop_length=HOP_SPEC,
        )
        # The small offset keeps the logarithm finite for silent bins.
        X_long_test.append(np.log(spectrogram + 1e-9).flatten())
    k += len(window_starts)
    Windows_end.append(k)

    if file_index % 100 == 0:
        print(file_index, 'from', len(X_test))

# Classify every window, then label each clip with the most frequent class
# among its own windows.
raw = model.predict(X_long_test)
predict = [mode(raw[first:last]) for first, last in zip(Windows_start, Windows_end)]

0 from 1705
100 from 1705
200 from 1705
300 from 1705
400 from 1705
500 from 1705
600 from 1705
700 from 1705
800 from 1705
900 from 1705
1000 from 1705
1100 from 1705
1200 from 1705
1300 from 1705
1400 from 1705
1500 from 1705
1600 from 1705
1700 from 1705


In [124]:
# Clip-level quality on the held-out split: plain accuracy and the
# support-weighted average of the per-class F1 scores.
accuracy = accuracy_score(Y_test, predict)
f1 = f1_score(Y_test, predict, average='weighted')
for report_line in (f'Accuracy: {accuracy}', f'F1: {f1}'):
    print(report_line)

Accuracy: 0.509090909090909
F1: 0.5021378948697343
