# <span style="font-family:Times New Roman">Code 0. Получение данных</span> 

In [1]:
import numpy as np
import pandas as pd
import os
import glob
import pickle
import multiprocessing
from functools import partial

## <span style="font-family:Times New Roman">Получаем адреса файлов и дополнительные данные</span> 

Указываем путь к папке с файлами из набора `GTZAN`

In [2]:
# Root folder of the GTZAN dataset (absolute, machine-specific location).
# The trailing separator is required: the glob pattern is built by plain string
# concatenation as path + "*/*.wav".
path = 'C:\\Users\\micha\\OneDrive\\Рабочий стол\\Курсовая\\genres_original\\'

Получаем адреса всех файлов из набора `GTZAN`

In [3]:
# glob returns matches in arbitrary, OS-dependent order. Sorting makes the row order
# of every table derived from `files` reproducible across machines and runs.
files = sorted(glob.glob(path + "*/*.wav"))

Получаем список названий десяти жанров на английском языке. \
Сохраняем список в текстовый файл для дальнейшей загрузки:

In [4]:
# Genre names are the names of the dataset's sub-folders. os.listdir returns entries
# in arbitrary order and also lists stray files, so keep directories only and sort
# them: the position of a genre in this list is used as its numeric class label.
genre_names = sorted(
    entry for entry in os.listdir(path)
    if os.path.isdir(os.path.join(path, entry))
)
with open('genre_names.txt', 'w') as f:
    f.write(';'.join(genre_names))

Сохраняем список жанров на русском языке в текстовый файл для дальнейшей загрузки:

In [5]:
# Russian display names of the genres, stored as one ';'-separated line.
# NOTE(review): presumably listed in the same order as genre_names — verify.
# No encoding is passed to open(), so the platform default is used; whoever reads
# this file back must open it with the same encoding.
with open('genre_names_rus.txt', 'w') as f:
    f.write(';'.join(['Блюз', 'Классика', 'Кантри', 'Диско', 'Хип-Хоп', 'Джаз', 'Метал', 'Поп', 'Регги', 'Рок']))

Получаем словарь названий жанров и их порядковых номеров.\
Сохраняем словарь в файл формата `pickle` для дальнейшей загрузки:

In [6]:
# Genre numbering for the classification task: each genre name maps to its position
# in genre_names. enumerate covers however many genres there are (no hard-coded 10,
# no silent truncation by zip) and yields plain Python ints, so the pickle can be
# loaded without NumPy.
genre_codes = {name: code for code, name in enumerate(genre_names)}
with open('genre_codes.pkl', 'wb') as f:
    pickle.dump(genre_codes, f)

## <span style="font-family:Times New Roman">Создаём файл $\text{get_features.py}$</span>


Структура файла:

__1\.__ Функция для получения названия композиции
```python
def get_song_name(path, n_slice=None, file_format=True)
```
__2\.__ Функция для получения жанра композиции
```python
def get_song_genre(path)
```
__3\.__ Функция для получения набора характеристик `№1`
```python
def get_song_features_set_1(path)
```
– 22 параметра без деления, shape=(1000, 22)\
– Возвращает словарь рассчитанных характеристик

__4\.__ Функция для получения набора характеристик `№2`
```python
def get_song_features_set_2(path, n_slices)
```
– 22 параметра по 3 / 5 делений, shape=(3000 / 5000, 22)\
– Возвращает словарь рассчитанных характеристик

__5\.__ Функция для получения набора характеристик `№3`
```python
def get_song_features_set_3(source_path, json_path)
```
– 10 делений, 125 списков по 5 параметров в каждом, shape=(10 000, 125, 5)\
– Формируется файл формата .json

In [1]:
%%file get_features.py
import librosa
import re
import numpy as np
import os
import json

def get_song_name(path, n_slice=None, file_format=True):
    """Return the track's file name, optionally tagged with a slice number.

    Args:
        path: Path to the audio file; both '/' and '\\' are accepted as separators.
        n_slice: Zero-based slice index. When given, '_part_<n_slice + 1>' is
            appended to the name; when None, no suffix is added.
        file_format: If true, the '.wav' extension is appended to the result.

    Returns:
        str: For example 'blues.00000.wav', 'blues.00000_part_1.wav'
        or 'blues.00000'.
    """
    file_name = path.replace('/', '\\').split('\\')[-1]
    # Everything before the first '.wav' is treated as the stem.
    stem = file_name.split('.wav')[0]
    if n_slice is not None:  # identity test: 0 is a valid slice index
        stem += '_part_' + str(n_slice + 1)
    return stem + ('.wav' if file_format else '')

    
def get_song_genre(path):
    """Extract the genre from a file name of the form '<genre>.<number>.wav'.

    Args:
        path: Path to the audio file; '/' and '\\' are both accepted as
            separators. The directory part is optional.

    Returns:
        str: The part of the file name that precedes '.<number>.wav',
        e.g. 'blues' for 'genres/blues/blues.00000.wav'.

    Raises:
        ValueError: If the file name does not match '<genre>.<number>.wav'.
    """
    path = path.replace('/', '\\')
    # Raw string: '\.' inside a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions. The directory prefix is
    # optional so that a bare file name works as well.
    match = re.search(r'(?:.*\\)?(.*)\..*\.wav', path)
    if match is None:
        raise ValueError('cannot extract genre from path: ' + repr(path))
    return match.group(1)


def _summarize_signal(y, sr):
    """Compute the 22 summary features shared by feature sets 1 and 2.

    Args:
        y: Audio samples as returned by librosa.load.
        sr: Sampling rate of y.

    Returns:
        dict: mean/std of spectral centroid, rolloff, bandwidth, onset strength
        (stored under the 'spectral_flux_*' keys) and zero-crossing rate,
        plus 'tempo', 'low_energy' and mean/std of the first five MFCCs.
    """
    features = dict()

    # Each frame-wise descriptor is computed once and then reduced to mean and std.
    framewise = (
        ('spectral_centroid', librosa.feature.spectral_centroid(y=y, sr=sr)),
        ('spectral_rolloff', librosa.feature.spectral_rolloff(y=y, sr=sr)),
        ('spectral_bandwidth', librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        ('spectral_flux', librosa.onset.onset_strength(y=y, sr=sr)),
        ('zero_crossings', librosa.feature.zero_crossing_rate(y=y)),
    )
    for name, values in framewise:
        features[name + '_mean'] = values.mean()
        features[name + '_std'] = values.std()

    # Pass sr explicitly so the estimate uses the rate the audio was loaded with.
    features['tempo'] = librosa.feature.tempo(y=y, sr=sr)[0]

    # Share of frames whose RMS energy is below the mean RMS of the signal.
    rms = librosa.feature.rms(y=y)
    features['low_energy'] = np.mean(rms[0] < np.mean(rms))

    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    for i in range(5):
        features['mfcc_' + str(i+1) + '_mean'] = mfcc[i].mean()
        features['mfcc_' + str(i+1) + '_std'] = mfcc[i].std()

    return features


def get_song_features_set_1(path):
    """Compute feature set 1: 22 features over the whole track.

    Args:
        path: Path to the audio file.

    Returns:
        dict: Feature name -> value for the track.
    """
    y, sr = librosa.load(path)
    return _summarize_signal(y, sr)


def get_song_features_set_2(path, n_slices):
    """Compute feature set 2: 22 features for each of n_slices equal slices.

    Only the first 29 seconds of the track are loaded and split.
    NOTE(review): a track shorter than 29 s yields shortened or empty trailing
    slices — confirm that all input files are long enough.

    Args:
        path: Path to the audio file.
        n_slices: Number of consecutive slices to cut the loaded audio into.

    Returns:
        dict: Slice name (see get_song_name) -> dict with the 22 features of
        that slice plus the track's 'genre'.
    """
    duration = 29
    y, sr = librosa.load(path, duration=duration)
    samples_per_slice = int(duration * sr / n_slices)
    genre = get_song_genre(path)  # identical for every slice of the track

    track_features = dict()
    for slice_ in range(n_slices):
        start_sample = samples_per_slice * slice_
        end_sample = start_sample + samples_per_slice

        features = _summarize_signal(y[start_sample:end_sample], sr)
        features['genre'] = genre
        track_features[get_song_name(path, slice_)] = features
    return track_features


def get_song_features_set_3(source_path, json_path, duration=29, n_slices=10, n_mfcc=5):
    """Compute feature set 3 (per-slice MFCC sequences) and save it as JSON.

    Walks source_path, loads the first `duration` seconds of every file found,
    cuts them into n_slices equal slices and stores the transposed MFCC matrix
    (frames x n_mfcc) of each slice.

    The label of a slice is the position of its folder in the walk minus one:
    source_path itself is visited first, so files placed directly in it get
    label -1, and the folders visited after it get 0, 1, 2, ... Folders are
    visited in sorted name order.

    Args:
        source_path: Root folder of the dataset.
        json_path: Path of the JSON file to write.
        duration: Seconds of audio to load from each file.
        n_slices: Number of slices per file.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        None. Writes {"labels": [...], "features": [...]} to json_path.
    """
    dataset = {"labels": [], "features": []}

    for i, (dirpath, dirnames, filenames) in enumerate(os.walk(source_path)):
        # os.walk lists entries in arbitrary order; sorting dirnames in place fixes
        # the traversal order and therefore the folder -> label assignment.
        dirnames.sort()
        for file in sorted(filenames):
            song, sr = librosa.load(os.path.join(dirpath, file), duration=duration)
            samples_per_slice = int(duration * sr / n_slices)
            for s in range(n_slices):
                start_sample = samples_per_slice * s
                end_sample = start_sample + samples_per_slice
                mfcc = librosa.feature.mfcc(
                    y=song[start_sample:end_sample], sr=sr, n_mfcc=n_mfcc
                )
                dataset["labels"].append(i - 1)
                dataset["features"].append(mfcc.T.tolist())

    # The with-block closes the file; no explicit close() is needed.
    with open(json_path, 'w') as f:
        json.dump(dataset, f)

Overwriting get_features.py


Импортируем функции из файла `get_features`:

In [7]:
from get_features import *

##  <span style="font-family:Times New Roman">Получаем набор данных №1</span>

Используем `multiprocessing.Pool()`, чтобы увеличить производительность

In [10]:
%%time
# Extract feature set 1 for every track in parallel (default worker count).
# pool.map preserves input order, so list_of_features_dicts[i] belongs to files[i].
with multiprocessing.Pool() as pool:
    list_of_features_dicts = pool.map(get_song_features_set_1, files)

Wall time: 3min 46s


Получаем список названий файлов с помощью функции `get_song_name`:

In [11]:
%%time
# One file name per entry of `files`, in the same order.
file_names = [get_song_name(file_path) for file_path in files]

Wall time: 992 µs


Получаем список жанров файлов с помощью функции `get_song_genre`:

In [19]:
%%time
# One genre name per entry of `files`, in the same order.
file_genres = [get_song_genre(file_path) for file_path in files]

Wall time: 1.49 ms


Создаём DataFrame из полученных наборов данных:

In [21]:
# One row per track, in the order of `files`; the file name is the row index.
features_set_1 = pd.DataFrame(list_of_features_dicts, index=file_names)
# Encode genres with the saved genre_codes mapping instead of pd.factorize:
# factorize numbers genres by order of first appearance in `files`, which agrees
# with genre_codes.pkl only if both listings happen to come back in the same order.
# An unknown genre raises KeyError instead of silently getting a new code.
features_set_1['genre'] = [genre_codes[genre] for genre in file_genres]
features_set_1.head()

Unnamed: 0,spectral_centroid_mean,spectral_centroid_std,spectral_rolloff_mean,spectral_rolloff_std,spectral_bandwidth_mean,spectral_bandwidth_std,spectral_flux_mean,spectral_flux_std,zero_crossings_mean,zero_crossings_std,...,mfcc_1_std,mfcc_2_mean,mfcc_2_std,mfcc_3_mean,mfcc_3_std,mfcc_4_mean,mfcc_4_std,mfcc_5_mean,mfcc_5_std,genre
blues.00000.wav,1784.122641,360.202005,3805.72303,949.343413,2002.412407,292.975102,1.391777,1.481487,0.083045,0.027694,...,50.688946,121.570671,17.200205,-19.162262,15.348761,42.363937,12.289782,-6.362266,12.961206,0
blues.00001.wav,1530.261767,613.11949,3550.713616,1725.778347,2038.987608,462.49876,1.445264,2.002827,0.05604,0.038046,...,88.142525,123.985138,23.662489,8.947019,23.923552,35.867149,16.270117,2.909594,16.732485,0
blues.00002.wav,1552.832481,395.564168,3042.410115,885.511646,1747.754087,276.216244,1.638602,1.94984,0.076291,0.031731,...,57.601101,140.440872,22.55784,-29.084547,20.29937,31.686693,11.998093,-13.976547,12.476432,0
blues.00003.wav,1070.153418,429.378632,2184.879029,1221.915647,1596.422565,408.107638,1.248168,1.749711,0.033309,0.020561,...,74.217697,150.086105,21.361393,5.663404,16.034643,26.855282,12.584162,1.770071,16.369904,0
blues.00004.wav,1835.128513,585.874983,3579.957471,1253.928347,1748.410759,297.285561,1.645293,1.963796,0.101461,0.044205,...,72.104813,126.20948,29.210808,-35.581394,18.276552,22.139256,13.919527,-32.473549,18.341904,0


Сохраняем DataFrame в csv файл:

In [24]:
# Written to the current working directory; the index (file names) is saved as the
# first CSV column.
features_set_1.to_csv('features_set_1.csv')

##  <span style="font-family:Times New Roman">Получаем набор данных №2.1.</span>

Используем `multiprocessing.Pool()`, чтобы увеличить производительность.\
Используем `functools.partial()`, чтобы передать параметр `n_slices` в функцию:

In [13]:
%%time
# Extract feature set 2 with 3 slices per track in parallel.
# pool.map keeps the order of `files`; partial() pins n_slices=3 for every call.
# NOTE(review): the worker count is fixed at 12 here, while the other Pool() calls
# in this notebook use the default — confirm this is intentional.
with multiprocessing.Pool(processes=12) as pool:
    list_of_features_dicts = pool.map(partial(get_song_features_set_2, n_slices=3), files)

Wall time: 5min 3s


Создаём словарь `features_dict`, который в цикле обновляется строками из ранее полученного набора данных:

In [16]:
%%time
# Flatten the per-track dictionaries into one mapping: slice name -> feature dict.
features_dict = {
    slice_name: slice_features
    for track_dict in list_of_features_dicts
    for slice_name, slice_features in track_dict.items()
}

Wall time: 0 ns


Создаём DataFrame из полученного набора данных:

In [17]:
# orient='index' turns every slice into a row directly. Building the frame
# column-wise and transposing it would put the string 'genre' next to the numeric
# features inside each column, leaving all feature columns with dtype object.
features_set_2_1 = pd.DataFrame.from_dict(features_dict, orient='index')
# Encode genres with the saved genre_codes mapping so the labels agree with
# genre_codes.pkl regardless of the order in which the files were listed.
# An unknown genre raises KeyError instead of silently getting a new code.
features_set_2_1['genre'] = [genre_codes[genre] for genre in features_set_2_1['genre']]
features_set_2_1.head()

Unnamed: 0,spectral_centroid_mean,spectral_centroid_std,spectral_rolloff_mean,spectral_rolloff_std,spectral_bandwidth_mean,spectral_bandwidth_std,spectral_flux_mean,spectral_flux_std,zero_crossings_mean,zero_crossings_std,...,mfcc_1_std,mfcc_2_mean,mfcc_2_std,mfcc_3_mean,mfcc_3_std,mfcc_4_mean,mfcc_4_std,mfcc_5_mean,mfcc_5_std,genre
blues.00000_part_1.wav,1790.813935,342.704141,3859.19409,921.202597,2025.304397,294.144252,1.403349,1.503985,0.08039,0.02688,...,52.330929,119.797943,15.516554,-18.328676,16.17243,44.792126,13.987721,-5.382723,12.255576,0
blues.00000_part_2.wav,1766.792144,397.957503,3826.842647,983.443633,2021.694769,283.675946,1.360021,1.475419,0.078798,0.026847,...,49.033913,123.604179,19.870399,-16.065786,12.937943,41.457764,11.908422,-10.554242,11.151757,0
blues.00000_part_3.wav,1810.547646,347.091544,3802.908259,942.628937,1991.564066,293.89906,1.382695,1.450729,0.087823,0.028691,...,47.88345,123.225273,15.683563,-24.075006,15.49273,42.877445,11.143267,-7.042732,14.27177,0
blues.00001_part_1.wav,1373.218531,550.467877,3001.53229,1581.349384,1846.732466,475.537656,1.511111,2.243689,0.055316,0.039842,...,97.679276,130.28154,24.444275,0.623084,33.270702,39.474735,15.110327,7.623341,15.924262,0
blues.00001_part_2.wav,1562.01714,675.268785,3702.394159,1806.416041,2100.823489,463.457559,1.454818,2.140995,0.053028,0.037683,...,85.392136,125.441933,24.618567,10.299148,18.295586,37.888237,18.626686,0.460019,15.620192,0


In [18]:
# Expected (3000, 23): 1000 tracks x 3 slices as rows, 22 features + genre as columns.
features_set_2_1.shape

(3000, 23)

Сохраняем DataFrame в csv файл:

In [19]:
# Written to the current working directory; the index (slice names) is saved as the
# first CSV column.
features_set_2_1.to_csv('features_set_2_1.csv')

##  <span style="font-family:Times New Roman">Получаем набор данных №2.2.</span>

Используем `multiprocessing.Pool()`, чтобы увеличить производительность.\
Используем `functools.partial()`, чтобы передать параметр `n_slices` в функцию:

In [11]:
%%time
# Extract feature set 2 with 5 slices per track in parallel (default worker count).
# pool.map keeps the order of `files`; partial() pins n_slices=5 for every call.
with multiprocessing.Pool() as pool:
    list_of_features_dicts = pool.map(partial(get_song_features_set_2, n_slices=5), files)

Wall time: 3.05 s


In [None]:
# Flatten the per-track dictionaries into one mapping: slice name -> feature dict.
features_dict = {
    slice_name: slice_features
    for track_dict in list_of_features_dicts
    for slice_name, slice_features in track_dict.items()
}

In [None]:
# orient='index' turns every slice into a row directly. Building the frame
# column-wise and transposing it would put the string 'genre' next to the numeric
# features inside each column, leaving all feature columns with dtype object.
features_set_2_2 = pd.DataFrame.from_dict(features_dict, orient='index')
# Encode genres with the saved genre_codes mapping so the labels agree with
# genre_codes.pkl regardless of the order in which the files were listed.
# An unknown genre raises KeyError instead of silently getting a new code.
features_set_2_2['genre'] = [genre_codes[genre] for genre in features_set_2_2['genre']]
features_set_2_2.head()

In [None]:
# Written to the current working directory; the index (slice names) is saved as the
# first CSV column.
features_set_2_2.to_csv('features_set_2_2.csv')

##  <span style="font-family:Times New Roman">Получаем набор данных №3</span>

In [10]:
%%time
# Output location of the JSON feature file (absolute, machine-specific path).
json_path = 'C:\\Users\\micha\\OneDrive\\Рабочий стол\\Курсовая\\features_set_3.json'

# Walks the dataset folder `path` and writes per-slice MFCCs and labels to json_path.
get_song_features_set_3(path, json_path)

Wall time: 1min 17s


___ 