In [1]:
import math
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
from os import listdir
import os.path
from os.path import isdir, isfile
from random import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
import soundfile
from xml.etree import ElementTree
from warnings import warn

In [2]:
def read_X(path_to_wav, frame_rate_hz=100, subsampling_step=4):
    """Cut a mono audio file into fixed-length, subsampled frames.

    The file is read with ``soundfile`` because ``scipy.io.wavfile`` cannot
    read 24-bit data.

    Parameters:
        path_to_wav: path of the audio file to read.
        frame_rate_hz: number of frames per second; the file's sample rate
            must be an exact multiple of it.
        subsampling_step: only every ``subsampling_step``-th sample of each
            frame is kept.

    Returns:
        ``(X, length_seconds)`` where ``X`` has one row per complete frame and
        ``length_seconds`` is the duration covered by those frames (trailing
        samples that do not fill a frame are dropped). For a multi-channel
        file a warning is issued and ``(None, -1)`` is returned.

    Raises:
        ValueError: if the sample rate is not divisible by ``frame_rate_hz``.
    """
    samples, sample_rate = soundfile.read(path_to_wav)

    # More than one dimension means more than one channel.
    if samples.ndim > 1:
        warn('Cannot handle stereo signal (' + path_to_wav + '), skipping file.')
        return None, -1

    if sample_rate % frame_rate_hz:
        raise ValueError('Sample rate ' + str(sample_rate) + ' % frame rate ' + str(frame_rate_hz) + ' != 0')

    samples_per_frame = int(sample_rate / frame_rate_hz)
    # Only complete frames are used; the incomplete tail is cut off.
    n_frames = len(samples) // samples_per_frame
    frames = [
        samples[k * samples_per_frame:(k + 1) * samples_per_frame:subsampling_step]
        for k in range(n_frames)
    ]

    X = np.array(frames)
    return X, (n_frames * samples_per_frame) / sample_rate


def read_y_xml(path_to_xml, length_seconds, dataset, frame_rate_hz=100):
    """Build the per-frame onset labels from an XML annotation file.

    Only the first ``transcription`` child of the XML root is evaluated. Each
    of its children must be an ``event``; every ``onsetSec`` child of an event
    is converted to a frame index and labelled via ``_set_onset_label``.

    Parameters:
        path_to_xml: path of the annotation file.
        length_seconds: duration of the matching audio, used to size the
            label vector.
        dataset: dataset label forwarded to ``_set_onset_label``.
        frame_rate_hz: number of frames per second.

    Returns:
        Label array reshaped to a single column, shape ``(n_frames, 1)``.

    Raises:
        ValueError: if the transcription contains a non-``event`` element.
    """
    root = ElementTree.parse(path_to_xml).getroot()
    labels = _init_y(length_seconds, frame_rate_hz)

    # Compare against None explicitly: an Element without children is falsy.
    transcription = root.find('transcription')
    if transcription is not None:
        for event in transcription:
            if event.tag != 'event':
                raise ValueError('Unexpected XML element, expected event, got ' + event.tag)
            for onset_element in event.findall('onsetSec'):
                frame_index = _onset_index(float(onset_element.text), frame_rate_hz)
                _set_onset_label(labels, frame_index, dataset)

    return np.reshape(labels, (-1, 1))

def read_y_csv(path_to_csv, length_seconds, dataset, frame_rate_hz=100):
    """Build the per-frame onset labels from a CSV annotation file.

    The first comma-separated field of every non-blank line is parsed as the
    onset time in seconds, converted to a frame index and labelled via
    ``_set_onset_label``.

    Parameters:
        path_to_csv: path of the annotation file.
        length_seconds: duration of the matching audio, used to size the
            label vector.
        dataset: dataset label forwarded to ``_set_onset_label``.
        frame_rate_hz: number of frames per second.

    Returns:
        Label array reshaped to a single column, shape ``(n_frames, 1)``.

    Raises:
        ValueError: if the first field of a non-blank line is not a number.
    """
    y = _init_y(length_seconds, frame_rate_hz)
    with open(path_to_csv) as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no onset; float('') would raise on them.
            if not line:
                continue
            onset_time = float(line.split(',')[0])
            index = _onset_index(onset_time, frame_rate_hz)
            _set_onset_label(y, index, dataset)

    y = np.reshape(y, (-1, 1))
    return y
            
def _init_y(length_seconds, frame_rate_hz):
    return np.zeros(round(frame_rate_hz * length_seconds), dtype=np.int8)

def _onset_index(onset_time, frame_rate_hz):
    return int(onset_time * frame_rate_hz)

def _set_onset_label(y, index, dataset):
    start = index
    end = index
    if dataset == 'ds1':
        # Python-style indices: start included, end not included
        start += -2
        end += 1
    elif dataset == 'ds2':
        start += 2
        end += 5
    elif dataset == 'ds3':
        start += 0
        end += 3
    elif dataset == 'ds4':
        start += 0
        end += 2
    else:
        raise ValueError('Invalid dataset label')
    
    start = max(0, start)
    end = min(len(y), end)
    if end - start > 0:
        y[start:end] = 1
    
    # y[index] = 1


def read_X_y(path_to_wav, path_to_truth, dataset, truth_format):
    """Read the frames of one audio file together with their onset labels.

    Parameters:
        path_to_wav: path of the audio file.
        path_to_truth: path of the annotation file.
        dataset: dataset label, forwarded to the label reader.
        truth_format: ``'xml'`` or ``'csv'``.

    Returns:
        ``(X_part, y_part)`` with one row per frame each, or ``(None, None)``
        if ``read_X`` could not use the audio file.

    Raises:
        ValueError: if ``truth_format`` is unknown or the number of frames
            and the number of labels differ.
    """
    X_part, length_seconds = read_X(path_to_wav)
    if X_part is None:
        return None, None

    label_readers = {'xml': read_y_xml, 'csv': read_y_csv}
    if truth_format not in label_readers:
        raise ValueError('Unknown truth format')
    y_part = label_readers[truth_format](path_to_truth, length_seconds, dataset)

    n_frames = X_part.shape[0]
    n_labels = y_part.shape[0]
    if n_frames != n_labels:
        raise ValueError('X_part vs. y_part shape mismatch: ' + str(n_frames) + ' != ' + str(n_labels))
    return X_part, y_part

In [3]:
# TODO: check the dataset into version control (separate, private repo) so
# that changes to it are made in one central place.
active_datasets = {'ds1', 'ds2', 'ds3', 'ds4'}
print('Active datasets: ' + str(active_datasets))

# Paths are assembled with os.path.join instead of hard-coded backslashes so
# that the notebook also runs on systems whose path separator is not '\'.
path_to_idmt = os.path.join('data', 'IDMT-SMT-GUITAR_V2')

# (audio directory, annotation directory, dataset label) for every dataset
# whose ground truth is stored as XML.
dir_tuples = []
if 'ds1' in active_datasets:
    path_to_ds_1 = os.path.join(path_to_idmt, 'dataset1')
    # sorted(): listdir returns entries in arbitrary order; a fixed order
    # keeps the assembled data set (and the seeded split) reproducible.
    for guitar_desc in sorted(listdir(path_to_ds_1)):
        # Skip stray files; only directories can hold audio/annotation folders.
        if not isdir(os.path.join(path_to_ds_1, guitar_desc)):
            continue
        dir_tuples.append((
            os.path.join(path_to_ds_1, guitar_desc, 'audio'),
            os.path.join(path_to_ds_1, guitar_desc, 'annotation'),
            'ds1',
        ))

if 'ds2' in active_datasets:
    dir_tuples.append((
        os.path.join(path_to_idmt, 'dataset2', 'audio'),
        os.path.join(path_to_idmt, 'dataset2', 'annotation'),
        'ds2',
    ))
if 'ds3' in active_datasets:
    dir_tuples.append((
        os.path.join(path_to_idmt, 'dataset3', 'audio'),
        os.path.join(path_to_idmt, 'dataset3', 'annotation'),
        'ds3',
    ))

# (wav path, truth path, dataset label, truth format) for every usable file.
file_tuples = []
for audio_dir, annotation_dir, ds in dir_tuples:
    # sorted(): listdir order is arbitrary; a fixed order keeps runs comparable.
    for wav_file in sorted(listdir(audio_dir)):
        path_to_wav = os.path.join(audio_dir, wav_file)
        if not wav_file.endswith('.wav'):
            warn('Skipping non-wav file ' + path_to_wav)
            continue

        # Swap only the extension; str.replace would also rewrite a '.wav'
        # that occurs inside the base name.
        xml_file = os.path.splitext(wav_file)[0] + '.xml'
        path_to_xml = os.path.join(annotation_dir, xml_file)
        if isfile(path_to_xml):
            file_tuples.append((path_to_wav, path_to_xml, ds, 'xml'))
        else:
            warn('No truth found for ' + wav_file + ', skipping file.')

# Dataset 4 is laid out as <guitar>/<tempo>/<genre>/{audio, annotation/onsets}
# and stores its ground truth as CSV.
if 'ds4' in active_datasets:
    # os.path.join instead of hard-coded backslashes keeps the paths portable.
    path_to_ds_4 = os.path.join('data', 'IDMT-SMT-GUITAR_V2', 'dataset4')
    for guitar in ['Career SG', 'Ibanez 2820']:
        path_to_ds = os.path.join(path_to_ds_4, guitar)
        # sorted(): listdir order is arbitrary; a fixed order keeps runs comparable.
        for tempo in sorted(listdir(path_to_ds)):
            path_to_tempo = os.path.join(path_to_ds, tempo)
            if not isdir(path_to_tempo):
                # Stray file next to the tempo folders; listdir on it would fail.
                continue
            for genre in sorted(listdir(path_to_tempo)):
                path_to_genre = os.path.join(path_to_tempo, genre)
                path_to_audio = os.path.join(path_to_genre, 'audio')
                if not isdir(path_to_audio):
                    warn('Skipping ' + path_to_genre + ': no audio folder')
                    continue
                path_to_onsets = os.path.join(path_to_genre, 'annotation', 'onsets')
                for wav_file in sorted(listdir(path_to_audio)):
                    path_to_wav = os.path.join(path_to_audio, wav_file)
                    if not wav_file.endswith('.wav'):
                        warn('Skipping non-wav file ' + path_to_wav)
                        continue
                    if not isdir(path_to_onsets):
                        warn('Skipping ' + path_to_wav + ': no onset folder')
                        continue
                    # Swap only the extension (str.replace would touch every '.wav').
                    csv_file = os.path.splitext(wav_file)[0] + '.csv'
                    path_to_csv = os.path.join(path_to_onsets, csv_file)
                    if isfile(path_to_csv):
                        file_tuples.append((path_to_wav, path_to_csv, 'ds4', 'csv'))
                    else:
                        # TODO fallback to other formats
                        warn('Skipping ' + path_to_wav + ': no truth csv')

# Read every (audio, truth) pair and stack the per-file parts into one
# feature matrix X and one label vector y.
X_parts = []
y_parts = []
for path_to_wav, path_to_truth, dataset, truth_format in file_tuples:
    X_part, y_part = read_X_y(path_to_wav, path_to_truth, dataset, truth_format)
    if X_part is not None and y_part is not None:
        X_parts.append(X_part)
        y_parts.append(y_part)

# np.concatenate fails with an unhelpful message on an empty list.
if not X_parts:
    raise ValueError('No usable audio/truth pairs found, check the dataset paths.')

X = np.concatenate(X_parts)
y = np.concatenate(y_parts).ravel()
print(X.shape)
print(y.shape)
# ndarray.sum accumulates small integer types in the platform integer. The
# builtin sum() adds int8 scalars one by one in Python, which is slow and can
# overflow int8 depending on NumPy's scalar promotion rules.
print(int(y.sum()))

Active datasets: {'ds1', 'ds3', 'ds4', 'ds2'}




(1171926, 111)
(1171926,)
51536


ds1: (97328, 111)  
ds2: (345916, 111)  
ds3: (5538, 111)  
ds4: (723144, 111)  

In [4]:
# Random 80/20 split of the frames with a fixed seed.
# Alternative: an unshuffled positional split at int(len(X) * 0.8),
# i.e. X[:bound] / X[bound:] and y[:bound] / y[bound:].
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(937540, 111)
(937540,)
(234386, 111)
(234386,)


In [5]:
# Standardize the features; the scaler is fitted on the training split only
# and then applied to both splits.
ss = StandardScaler().fit(X_train)
X_train, X_test = ss.transform(X_train), ss.transform(X_test)

<br>
<br>
<br>
<br>

In [None]:
# Indices of all frames labelled as onset (y == 1), as a plain Python list.
onsets = np.flatnonzero(y == 1).tolist()
print(len(onsets))

In [None]:
def plot_frame(frame, frame_size=None):
    """Plot a 1-D signal and mark frame boundaries with red vertical lines.

    Parameters:
        frame: 1-D sequence of samples; may consist of several concatenated
            frames.
        frame_size: number of samples per frame. A vertical line is drawn at
            every multiple of ``frame_size`` that lies inside the signal.
            ``None`` (default) draws no boundary lines, which suits a signal
            consisting of a single frame.

    Raises:
        ValueError: if ``frame_size`` is given but not positive (the boundary
            loop would otherwise never terminate).
    """
    if frame_size is not None and frame_size <= 0:
        raise ValueError('frame_size must be positive, got ' + str(frame_size))

    fig, ax = plt.subplots(figsize=(4, 4))
    ax.plot(range(len(frame)), frame)

    if frame_size is None:
        return

    vertical_line_x = frame_size
    while vertical_line_x < len(frame):
        ax.axvline(x=vertical_line_x, color='red')
        vertical_line_x += frame_size

In [None]:
# Draw a reproducible random sample of up to 20 onset frames to inspect.
# A seeded permutation of list positions replaces the unseeded in-place
# shuffle(): re-running the cell gives the same sample and `onsets` keeps
# its original order.
sample_positions = np.random.RandomState(42).permutation(len(onsets))[:20]
onsets_part = [onsets[k] for k in sample_positions]
print(onsets_part)

In [None]:
# Plot every sampled onset together with its neighbours (2 frames before,
# 4 frames after); red lines mark the frame boundaries.
for i in onsets_part:
    # Clamp the window start: for i < 2 a negative start index would be
    # interpreted relative to the end of X and produce an empty window.
    window = X[max(0, i - 2):i + 5, :]
    plot_frame(np.ravel(window), X.shape[1])

Analyse der verschiedenen Datasets: Onset-Frames bei 441 Samples pro Frame (= 100 Hz Frame-Rate)

Format:
effektiver Onset / erster Ausschlag
Wenn nur ein Wert angegeben ist: erster Ausschlag

ds1:
[87238, 54493, 58755, 23774, 55508, 48497, 33519, 57751, 90489, 50501]
-2 / 0
-2 / 0
? / -2
? / -2
? / ?
? / -2
-2 / 0
? / ?
-2 / 0
? / -2

uneinheitlich annotiert, z.t. deutlich zu spät

--> -2 bis 0

ds2:
[298003, 321707, 21606, 311640, 123847, 149672, 4183, 42441, 81900, 164420]
3/3
3/4
0/2
0/3
1/4
1/3
0/4
2/3
0/4
1/3

--> 2 bis 4

ds3:
[1605, 1950, 2537, 2611, 1777, 4485, 437, 2983, 2686, 1263]
-4/0
-1/0
0/2
1/2
?/0
?/1
0/1
0/0
1/2
?/0

--> 0 bis 2

ds4:
[219738, 149302, 95957, 657513, 667847, 653544, 654901, 698624, 244043, 340672, 486978, 14403, 553750, 638052, 454921, 627638, 152653, 404285, 485007, 21380]
?/?
?/0
0
?
1
?
?
?
1
1
0
0
0
-2
?
?

--> 0 bis 1

In [None]:
# Subsampling: compare one frame at full resolution with every 4th sample.
# `i` is the last onset index left over from the plotting loop over onsets_part.
# plot_frame needs the frame size as second argument; passing the length of
# the plotted signal means no boundary line is drawn inside a single frame.
frame = X[i + 6]
frame_subsampled = frame[::4]
plot_frame(frame, len(frame))
plot_frame(frame_subsampled, len(frame_subsampled))

Conclusions (dataset 2, 100 Hz frames):<br>
frühester Onset: ab Frame 2<br>
spätester Onset: bis und mit Frame 5<br>
4x Subsampling scheint noch zu passen<br>

min: ab frame 2 (882)
max: bis und mit frame 5 (2646)
4x subsampling scheint noch zu passen

aktuell: 0-440
neu: 441-2204 (4x)
auch möglich: 50 Hz, 882-1763 oder 882-2646 oder schon ab 0
oder: überschneidend
<br>
<br>
<br>
<br>

In [8]:
# Random forest with 30 trees, trained on all cores. The fixed random_state
# makes the forest (and therefore the reports below) reproducible, matching
# the seeded train/test split.
clf = RandomForestClassifier(n_jobs=-1, n_estimators=30, random_state=42)
clf.fit(X_train, y_train)
y_train_predicted = clf.predict(X_train)
y_test_predicted = clf.predict(X_test)

In [None]:
# Datasets: ds2
# y: frame 0 only
train_report = classification_report(y_train, y_train_predicted)
print(train_report)
test_report = classification_report(y_test, y_test_predicted)
print(test_report)

In [None]:
# Datasets: ds2
# y: frames 1, 2, 3, 4
train_report = classification_report(y_train, y_train_predicted)
print(train_report)
test_report = classification_report(y_test, y_test_predicted)
print(test_report)

In [None]:
# Datasets: ds2
# y: frames 1, 2, 3, 4
# Subsampling: 4x
train_report = classification_report(y_train, y_train_predicted)
print(train_report)
test_report = classification_report(y_test, y_test_predicted)
print(test_report)

In [None]:
# Datasets: ds1 + ds2
# y: frames 1, 2, 3, 4
# Subsampling: 4x
train_report = classification_report(y_train, y_train_predicted)
print(train_report)
test_report = classification_report(y_test, y_test_predicted)
print(test_report)

In [None]:
# Datasets: ds1 + ds2
# y: depends on the dataset
# Subsampling: 4x
train_report = classification_report(y_train, y_train_predicted)
print(train_report)
test_report = classification_report(y_test, y_test_predicted)
print(test_report)

In [None]:
# Datasets: ds1 + ds2 + ds3
# y: frames 1, 2, 3, 4
# Subsampling: 4x
train_report = classification_report(y_train, y_train_predicted)
print(train_report)
test_report = classification_report(y_test, y_test_predicted)
print(test_report)

In [7]:
# Datasets: ds1 + ds2 + ds3 + ds4
# y: depends on the dataset
# Subsampling: 4x
train_report = classification_report(y_train, y_train_predicted)
print(train_report)
test_report = classification_report(y_test, y_test_predicted)
print(test_report)

             precision    recall  f1-score   support

          0       0.99      1.00      1.00    896320
          1       1.00      0.85      0.92     41220

avg / total       0.99      0.99      0.99    937540

             precision    recall  f1-score   support

          0       0.96      1.00      0.98    224070
          1       0.75      0.10      0.18     10316

avg / total       0.95      0.96      0.94    234386



In [9]:
# Datasets: ds1 + ds2 + ds3 + ds4
# y: depends on the dataset
# Subsampling: 4x
# n_estimators=30
train_report = classification_report(y_train, y_train_predicted)
print(train_report)
test_report = classification_report(y_test, y_test_predicted)
print(test_report)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00    896320
          1       1.00      0.97      0.99     41220

avg / total       1.00      1.00      1.00    937540

             precision    recall  f1-score   support

          0       0.96      1.00      0.98    224070
          1       0.86      0.11      0.19     10316

avg / total       0.96      0.96      0.94    234386

