In [1]:
import pandas as pd
import numpy as np
import boto3
import io


In [2]:
!conda install -c conda-forge librosa -y

Collecting package metadata (current_repodata.json): done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:
- 
  - conda-forge/noarch::seaborn-base==0.11.1=pyhd8ed1ab_1
  - conda-forge/noarch::nbclassic==0.2.6=pyhd8ed1ab_0
  - conda-forge/noarch::typing-extensions==3.7.4.3=0
  - conda-forge/linux-64::pluggy==0.13.1=py36h5fab9bb_4
  - conda-forge/linux-64::blaze==0.11.3=py36_0
  - conda-forge/linux-64::matplotlib==3.3.4=py36h5fab9bb_0
  - defaults/linux-64::_anaconda_depends==5.1.0=py36_2
  - conda-forge/noarch::python-language-server==0.36.2=pyhd8ed1ab_0
  - conda-forge/noarch::jupyterlab_server==2.3.0=pyhd8ed1ab_0
  - conda-forge/noarch::pyls-black==0.4.6=pyh9f0ad1d_0
  - conda-forge/linux-64::scikit-image==0.16.2=py36hb3f55d8_0
  - conda-forge/noarch::path.py==12.5.0=0
  - conda-forge/noarch::qdarkstyle==2.8.1=pyhd8ed1ab_2
  - conda-forge/noarch::ipywidgets==7.6.3=pyhd3deb0d_0
  - con

In [3]:
import librosa


In [4]:
# Root URI of the S3 data lake bucket.
data_dir = 's3://musicgenredatalake/'

# Genre names: used both as the S3 sub-folder under raw/songs/ and as the
# label stored with every extracted feature row.
genres = [
    'blues',
    'classical',
    'country',
    'disco',
    'hiphop',
    'jazz',
    'metal',
    'pop',
    'reggae',
    'rock',
]

# Window length in seconds; multiplied by the sample rate to get the number
# of samples per window.
window_size = 3

In [5]:
# Column names for the feature table, in the exact order in which each row is
# assembled: file name, window length, mean/variance pairs for the summary
# features, tempo, mean/variance pairs for MFCC 1..20, and the genre label.
columns = (
    ['filename', 'length']
    + [
        f'{feature}_{stat}'
        for feature in (
            'chroma_stft',
            'rms',
            'spectral_centroid',
            'spectral_bandwidth',
            'rolloff',
            'zero_crossing_rate',
            'harmony',
            'perceptr',
        )
        for stat in ('mean', 'var')
    ]
    + ['tempo']
    + [
        f'mfcc{index}_{stat}'
        for index in range(1, 21)
        for stat in ('mean', 'var')
    ]
    + ['label']
)


In [6]:
def extract_features(window, sample_rate=22050):
    '''Summarise one audio window as a flat list of numeric features.

    Parameters
    ----------
    window : np.ndarray
        Audio samples, forwarded as ``y`` to every librosa call below.
        Presumably a mono 1-D float array as produced by ``librosa.load``
        (TODO confirm against callers).
    sample_rate : int, optional
        Sampling rate forwarded as ``sr`` to the librosa functions that accept
        it. Defaults to 22050, librosa's own default, so
        ``extract_features(window)`` keeps its previous behaviour.

    Returns
    -------
    list
        In order: mean and variance of the chromagram, RMS, spectral centroid,
        spectral bandwidth, spectral roll-off, zero-crossing rate, and the two
        components returned by ``librosa.effects.hpss``; then the tempo as a
        Python float; then a (mean, variance) pair for each MFCC row returned
        by ``librosa.feature.mfcc``.
    '''
    # Audio is passed by keyword: recent librosa releases make these
    # arguments keyword-only, and the keyword form works on older ones too.
    chromogram = librosa.feature.chroma_stft(y=window, sr=sample_rate)
    rms = librosa.feature.rms(y=window)
    centroid = librosa.feature.spectral_centroid(y=window, sr=sample_rate)
    bandwidth = librosa.feature.spectral_bandwidth(y=window, sr=sample_rate)
    roll_off = librosa.feature.spectral_rolloff(y=window, sr=sample_rate)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=window)
    harmony, percussive = librosa.effects.hpss(y=window)
    tempo, _ = librosa.beat.beat_track(y=window, sr=sample_rate)
    mfcc = librosa.feature.mfcc(y=window, sr=sample_rate)

    # Depending on the librosa version, tempo is a scalar or a one-element
    # array; normalise to a plain float so the row holds only scalars.
    tempo = float(np.atleast_1d(tempo)[0])

    summary = []
    for series in (chromogram, rms, centroid, bandwidth, roll_off,
                   zero_crossing_rate, harmony, percussive):
        summary.extend((series.mean(), np.var(series)))

    # Interleave per-coefficient statistics: mean_1, var_1, mean_2, var_2, ...
    mfcc_stats = np.column_stack((mfcc.mean(axis=1), np.var(mfcc, axis=1))).ravel()

    return [*summary, tempo, *mfcc_stats]


In [7]:
# S3 client shared by the object listing and the downloads further down.
client = boto3.client('s3')
bucket = 'musicgenredatalake'
# Paginator for list_objects_v2, iterated page by page when listing each genre prefix.
paginator = client.get_paginator('list_objects_v2')

In [8]:

# Build one feature row per fixed-length window of every song in every genre.
features = []
for genre in genres:
    page_iterator = paginator.paginate(Bucket=bucket, Prefix=f'raw/songs/{genre}/')
    for page in page_iterator:
        # A listing page has no 'Contents' key when nothing matches the prefix.
        for file_content in page.get('Contents', []):
            key = file_content['Key']
            if key.endswith('/'):
                # "Folder" placeholder object, not an audio file.
                continue

            obj = client.get_object(Bucket=bucket, Key=key)
            data = io.BytesIO(obj['Body'].read())
            try:
                signal, sample_rate = librosa.load(data)

                # Get rid of silence at the beginning and end
                signal, _ = librosa.effects.trim(signal)
                n_points = window_size * sample_rate

                # Floor division keeps only complete windows, so every slice
                # below holds exactly n_points samples.
                for i in range(len(signal) // n_points):
                    print(genre, key, i, end='\r')
                    window = signal[i * n_points:(i + 1) * n_points]
                    values = extract_features(window)
                    features.append(
                        [key.replace('.wav', f'.{i}.wav'), len(window), *values, genre]
                    )
            except Exception as ex:
                # Deliberately best effort: an unreadable file is reported and
                # skipped so that it does not abort the whole extraction run.
                print(ex)
                print(key)


    

Error opening <_io.BytesIO object at 0x7f19f987f990>: Format not recognised.
raw/songs/jazz/jazz.00054.wav
3000 secondsgs/rock/rock.00099.wav 9.wav 9


In [9]:
# Fail early with a clear message instead of an opaque column-length mismatch
# when the extraction loop produced no rows.
if not features:
    raise ValueError('No feature rows were extracted; check the S3 prefixes and the errors printed above.')

# Passing the column names to the constructor validates the row width in one step.
df = pd.DataFrame(features, columns=columns)

In [28]:
# Write the feature table to the data lake as CSV, without the index column.
df.to_csv(
    path_or_buf='s3://musicgenredatalake/standardized/dataset/features_3_sec.csv',
    index=False,
)
