In [1]:
import librosa
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

In [2]:
# Genre names. The position of each name is what the voting cell looks up with
# the winning vote index, so the order presumably has to match the integer
# labels the model predicts -- confirm against the training code.
classes = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]

# Restore the saved classifier into a fresh CatBoostClassifier.
# NOTE(review): despite the ".pickle" name the file is read with CatBoost's own
# load_model (no format argument given), not with the pickle module.
model = CatBoostClassifier()
model.load_model('model.pickle')

In [3]:
def extract_features(audio_file, sr):
    """Summarise an audio signal as a one-row table of librosa descriptors.

    Parameters
    ----------
    audio_file
        The signal handed to every librosa call as ``y`` (presumably a 1-D
        waveform array -- confirm against the caller).
    sr
        Sampling rate, forwarded to the librosa calls that accept one.

    Returns
    -------
    pandas.DataFrame
        A single row. Columns, in this order: ``<name>_mean`` and
        ``<name>_var`` for chroma_stft, rms, spectral_centroid,
        spectral_bandwidth, rolloff, zero_crossing_rate, harmony and
        perceptr; then ``tempo``; then ``mfcc<k>_mean`` / ``mfcc<k>_var`` for
        each row returned by ``librosa.feature.mfcc``, with k counted from 1.
    """
    # Harmonic / percussive split of the signal; both parts are summarised below.
    harmonic_part, percussive_part = librosa.effects.hpss(audio_file)

    # (column prefix, values) pairs, listed in the order the columns must appear.
    summarised = (
        ('chroma_stft', librosa.feature.chroma_stft(y=audio_file, sr=sr)),
        ('rms', librosa.feature.rms(y=audio_file)),
        ('spectral_centroid', librosa.feature.spectral_centroid(y=audio_file, sr=sr)),
        ('spectral_bandwidth', librosa.feature.spectral_bandwidth(y=audio_file, sr=sr)),
        ('rolloff', librosa.feature.spectral_rolloff(y=audio_file, sr=sr)),
        ('zero_crossing_rate', librosa.feature.zero_crossing_rate(y=audio_file)),
        ('harmony', harmonic_part),
        ('perceptr', percussive_part),
    )

    columns = {}
    for prefix, values in summarised:
        # Each statistic is wrapped in a one-element list so the frame gets one row.
        columns[f'{prefix}_mean'] = [np.mean(values)]
        columns[f'{prefix}_var'] = [np.var(values)]

    # First element of the tempo estimate, stored as a bare scalar; pandas
    # broadcasts it to the single row defined by the list-valued columns.
    columns['tempo'] = librosa.feature.tempo(y=audio_file, sr=sr)[0]

    # One mean/var pair per MFCC row, numbered from 1.
    coefficient_rows = librosa.feature.mfcc(y=audio_file, sr=sr)
    for number, coefficient in enumerate(coefficient_rows, start=1):
        columns[f'mfcc{number}_mean'] = [np.mean(coefficient)]
        columns[f'mfcc{number}_var'] = [np.var(coefficient)]

    return pd.DataFrame.from_dict(columns)

In [4]:
def predict(model, audio_file, sr):
    """Classify one audio signal with ``model``.

    The signal is converted to a feature table by ``extract_features`` and
    passed to ``model.predict``; the first element of the prediction is
    returned.

    Parameters
    ----------
    model
        Any object exposing ``predict(data)``.
    audio_file
        The audio signal, forwarded unchanged to ``extract_features``.
    sr
        Sampling rate, forwarded unchanged to ``extract_features``.
    """
    return model.predict(extract_features(audio_file, sr))[0]

In [5]:
# Decode one sample track with librosa.load: `y` receives the audio signal and
# `sr` the sampling rate that goes with it.
y, sr = librosa.load('data/genres_original/classical/classical.00001.wav')

In [6]:
# Trim the signal with librosa.effects.trim (leading/trailing silence, per the
# librosa documentation); the second return value is not needed here.
audio_file, _ = librosa.effects.trim(y)

In [7]:
# Majority vote over consecutive 3-second windows of the trimmed clip.
# One counter per genre, sized from `classes` so the tally and the name lookup
# at the end of this cell cannot drift apart.
votes = np.zeros(len(classes))

chunk_len = sr * 3  # number of samples in a 3-second window
chunk_count = len(audio_file) // chunk_len

# Every full window except the last one gets its own vote.
# `label` is used directly as an index into `votes`, so the model is assumed to
# predict integer class ids -- TODO confirm against the training code.
for chunk_idx in range(chunk_count - 1):
    audio_chunk = audio_file[chunk_idx * chunk_len:(chunk_idx + 1) * chunk_len]
    label = predict(model, audio_chunk, sr)
    votes[label] += 1

# The final window also absorbs the leftover samples, so it can be longer than
# chunk_len. Its start is clamped at 0 so that a clip shorter than one window
# is classified whole, explicitly rather than through a negative slice index.
last_start = max(chunk_count - 1, 0) * chunk_len
last_chunk = audio_file[last_start:]
label = predict(model, last_chunk, sr)
votes[label] += 1

# NOTE(review): the output saved with this cell reads "rock" although the input
# file sits under classical/ -- worth confirming that these features match the
# preprocessing the model was trained with.
print(classes[np.argmax(votes)])

rock


In [8]:
# Per-genre vote tally from the voting cell; positions line up with `classes`.
votes

array([1., 0., 0., 0., 0., 0., 1., 0., 1., 7.])

In [10]:
# Feature row for the whole trimmed clip (not a single 3-second window), shown for inspection.
extract_features(audio_file, sr)

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.214977,0.085011,0.03061,0.000161,1361.006486,61987.100557,1441.739951,32687.848035,2389.011463,371106.560548,...,-3.125152,158.625076,-4.289578,75.69735,0.594025,134.63237,2.877037,153.62085,0.374558,123.758858
