In [None]:
import librosa
import numpy as np
import matplotlib.pylab as plt
import librosa.display
from scipy.signal import find_peaks
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import IPython.display as ipd
from sklearn.metrics import classification_report
import os
from numpy import linalg as LA
import warnings
import pickle
warnings.simplefilter("ignore")
%matplotlib tk

## Basic Configuration

In [None]:
# Global analysis parameters shared by the cells below.
sr = 44100  # sample rate (Hz) used to convert label times in seconds to sample indices
n_fft = int(sr * (20/1000)) # frequency resolution for basic features (20 ms worth of samples)
# n_fft = 441000
window_length = n_fft #frame size for basic features
hop_length = window_length // 2 #time resolution for basic features (half a window, i.e. 50% overlap)
segment_size = sr * 1 # 1 second will be used for segmenting and statistics on short time features

## Loading Audio Samples Files

In [None]:
# os.listdir returns entries in arbitrary (filesystem-dependent) order. Sort them so that
# every later step that iterates over the audio files (in particular the greedy train/test
# split) gives the same result on every machine and every run.
files_names = sorted(os.listdir('./samples'))
# Note: despite its name this list holds bare file names, not full paths.
audio_files_paths = [s for s in files_names if s.endswith('.mp3')]

In [None]:
def padding(audio_sample_labeling, frame_length, hop_length):
    """Zero-pad the end of a signal so that sliding frames tile it exactly.

    After padding, ``(size - frame_length)`` is a multiple of ``hop_length``, so a
    window of ``frame_length`` samples advanced by ``hop_length`` ends exactly on the
    last sample and no trailing samples are dropped.

    Parameters
    ----------
    audio_sample_labeling : np.ndarray
        Signal (or per-sample array) to pad; assumed to be 1-D.
    frame_length : int
        Number of samples in one frame.
    hop_length : int
        Number of samples between the starts of consecutive frames.

    Returns
    -------
    np.ndarray
        The input followed by the minimal number of zeros required.
    """
    n_samples = audio_sample_labeling.size
    if n_samples < frame_length:
        # Shorter than a single frame: extend to exactly one full frame. The modulo
        # formula below would otherwise return a signal that still holds no frame.
        pad_size = frame_length - n_samples
    else:
        # Distance from the current length up to the next frame boundary
        # (0 when the signal is already aligned).
        pad_size = -(n_samples - frame_length) % hop_length
    return np.pad(audio_sample_labeling, (0, pad_size))

In [None]:
# Load every audio file together with its label CSV (same base name, '.csv' extension).
audios = {}      # file name -> padded mono signal
srs = {}         # file name -> sample rate of the loaded signal
labels_csv = {}  # file name -> DataFrame read from the label CSV (no header row)
for audio_file_name in audio_files_paths:
    print('loading audio file ' + audio_file_name + '...')
    audio_path = os.path.join('./samples', audio_file_name)
    # Load at the configured rate `sr` (librosa resamples if needed). The label times are
    # converted to sample indices with `sr`, so keeping a different native rate would
    # misalign the labels with the audio.
    audios[audio_file_name], srs[audio_file_name] = librosa.load(audio_path, sr=sr)
    # Pad so that 1-second segments with a quarter-segment hop tile the signal exactly.
    audios[audio_file_name] = padding(audios[audio_file_name], segment_size, segment_size//4)
    print('loading the labels from csv')
    # splitext instead of [:-4] so the extension length is not hard-coded.
    labels_path = os.path.splitext(audio_path)[0] + '.csv'
    labels_csv[audio_file_name] = pd.read_csv(labels_path, header=None)

## Define training and testing sets

In [None]:
# Split at file level: walk the files in dictionary order and greedily add each one to the
# training set as long as the accumulated number of samples stays within 70% of the total.
# Files that would exceed the budget are skipped and end up in the testing set.
total_size = sum(len(audio) for audio in audios.values())
train_size = total_size * 0.7

size = 0
train_audios_names = []
for key, value in audios.items():
    if size + len(value) > train_size:
        continue
    size += len(value)
    train_audios_names.append(key)

print(f'Training data percentage is: {size * 100 /total_size}%')
print(f'Audio files used for training', train_audios_names)
# Everything that was not selected for training is used for testing.
test_audios_names = [key for key in audios if key not in train_audios_names]
print(f'Audio files used for testing', test_audios_names)


## Audio Sample Labeling

In [None]:
# We put the labels for each sample of all audios (based on labels csv) Labels: silence=0, music=1, speech=2, music&speech=3
def sample_labeling(labels_csv, audio, sample_rate=None):
    """Build a per-sample label array for one audio signal.

    Each row of ``labels_csv`` is ``(start, duration, label)`` with times in seconds.
    Rows labeled ``'m'`` add 1 and rows labeled ``'s'`` add 2 to the samples they
    cover, so overlapping music and speech sum to 3; uncovered samples stay 0.
    Rows with any other label are ignored.

    Parameters
    ----------
    labels_csv : pandas.DataFrame
        Three columns: start time, duration, label.
    audio : np.ndarray
        The audio signal; only its ``size`` is used.
    sample_rate : int, optional
        Samples per second used to convert times to indices. Defaults to the
        notebook-level ``sr``.

    Returns
    -------
    np.ndarray
        int32 array with ``audio.size`` entries.
    """
    if sample_rate is None:
        sample_rate = sr  # fall back to the globally configured sample rate
    labels = np.zeros(audio.size, dtype=np.int32)
    increments = {'m': 1, 's': 2}
    for start, duration, label in labels_csv.values:
        increment = increments.get(label)
        if increment is None:
            continue
        # Python ints (not int32) so very long recordings cannot overflow the index.
        start_ = int(np.floor(start * sample_rate))
        stop_ = start_ + int(np.floor(duration * sample_rate))
        labels[start_:stop_] += increment
    return labels

# create sample labeling for all audios

In [None]:
# Per-sample labels for every audio file, stored in the dictionary of the set
# (training or testing) the file belongs to.
training_sample_labeling = {}
testing_sample_labeling = {}
for key, audio in audios.items():
    if key in train_audios_names:
        destination = training_sample_labeling
    elif key in test_audios_names:
        destination = testing_sample_labeling
    else:
        continue
    destination[key] = sample_labeling(labels_csv[key], audio)



## Frame Labeling, normalized based on majority of samples in a frame

In [None]:
def frame_labeling(audio_sample_labeling, frame_length, hop_length, multi_label=False):
    """Reduce per-sample labels to one label per sliding frame.

    Frames of ``frame_length`` samples are taken every ``hop_length`` samples; only
    frames that fit entirely inside the array are labeled.

    Parameters
    ----------
    audio_sample_labeling : np.ndarray
        Per-sample labels (0=silence, 1=music, 2=speech, 3=music&speech).
    frame_length : int
        Number of samples per frame.
    hop_length : int
        Number of samples between consecutive frame starts; must be positive.
    multi_label : bool, default False
        If True, each frame gets its most frequent sample label (the smallest label
        wins ties). If False, each frame gets 1 when samples containing speech
        (label >= 2) are a strict majority of the frame, else 0.

    Returns
    -------
    list
        One label per frame, in time order.

    Raises
    ------
    ValueError
        If ``frame_length`` or ``hop_length`` is not positive.
    """
    if frame_length <= 0 or hop_length <= 0:
        # A non-positive hop would never advance the window.
        raise ValueError('frame_length and hop_length must be positive')

    frame_labels = []
    last_start = audio_sample_labeling.size - frame_length
    for start in range(0, last_start + 1, hop_length):
        frame = audio_sample_labeling[start:start + frame_length]
        if multi_label:
            labels, counts = np.unique(frame, return_counts=True)
            frame_labels.append(labels[np.argmax(counts)])
        else:
            speech_count = np.count_nonzero(frame >= 2)
            # Everything that contains no speech (silence or music only). The previous
            # condition `(frame < 2) & (frame == 3)` could never be true, so a single
            # speech sample was enough to label the whole frame as speech.
            other_count = np.count_nonzero(frame < 2)
            frame_labels.append(1 if speech_count > other_count else 0)
    return frame_labels

In [None]:
# Frame-level labels (1-second frames, quarter-second hop) for every audio file, derived
# from the per-sample labels of the set the file belongs to.
training_frame_labeling = {}
testing_frame_labeling = {}
for key, audio in audios.items():
    if key in train_audios_names:
        source, destination = training_sample_labeling, training_frame_labeling
    elif key in test_audios_names:
        source, destination = testing_sample_labeling, testing_frame_labeling
    else:
        continue
    destination[key] = frame_labeling(source[key], segment_size, segment_size//4)


## Save basic variables as pickle files on disk

In [None]:
# Name of the labeling scheme; used below as the sub-directory of `pickles/` in which the
# label pickles are stored. 'bi_label' goes with the binary (0/1) frame labels produced by
# frame_labeling when it is called with its default multi_label=False.
labling_type = 'bi_label'

In [None]:
# we are doing this in order to load the audios and other calculated variables quickly later
# Create the output directories first so the cell also works on a fresh checkout
# (makedirs creates 'pickles' as well as the labeling sub-directory).
os.makedirs(f'pickles/{labling_type}', exist_ok=True)

# Target file -> object to serialize. The label pickles live in a sub-directory named after
# the labeling scheme so that different schemes do not overwrite each other.
pickles_to_write = {
    'pickles/audios.pickle': audios,
    'pickles/srs.pickle': srs,
    'pickles/train_audios_names.pickle': train_audios_names,
    'pickles/test_audios_names.pickle': test_audios_names,
    f'pickles/{labling_type}/training_sample_labeling.pickle': training_sample_labeling,
    f'pickles/{labling_type}/testing_sample_labeling.pickle': testing_sample_labeling,
    f'pickles/{labling_type}/training_frame_labeling.pickle': training_frame_labeling,
    f'pickles/{labling_type}/testing_frame_labeling.pickle': testing_frame_labeling,
}
for pickle_path, payload in pickles_to_write.items():
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

