In [None]:
import csv, librosa, os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Project root. NOTE(review): absolute, machine-specific path; adjust when
# running the notebook on another machine.
path = "/home/bhm-ai/music_classification"
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'.split()
# Single track used later in the notebook for the prediction demo.
songname = f'{path}/Data/genres_original/jazz/jazz.00029.wav'

# Parameters passed to librosa.feature.mfcc in extractfeature().
num_mfcc = 20
n_fft = 2048
hop_length = 512

# CSV column names, in the same order extractfeature() emits its fields:
# filename, mean/var pairs of the spectral features, tempo, one mean/var pair
# per MFCC coefficient, and finally the label.
header = [
    'filename',
    'chroma_stft_mean', 'chroma_stft_var',
    'rms_mean', 'rms_var',
    'spectral_centroid_mean', 'spectral_centroid_var',
    'spectral_bandwidth_mean', 'spectral_bandwidth_var',
    'rolloff_mean', 'rolloff_var',
    'zero_crossing_rate_mean', 'zero_crossing_rate_var',
    'harmony_mean', 'harmony_var',
    'perceptr_mean', 'perceptr_var',
    'tempo',
]
# Follow num_mfcc instead of a hard-coded 20 so the header cannot drift from
# the number of coefficients requested from librosa.
for i in range(1, num_mfcc + 1):
    header += [f'mfcc{i}_mean', f'mfcc{i}_var']
header.append('label')

# Run switches for the notebook.
demo = False         # True: short run (5 epochs; predat() reads only 5 files per genre)
pre_dataset = False  # True: predat() re-extracts the features from the audio files first
# Number of training epochs: 5 in demo mode, 600 otherwise
# (an earlier setting of 900 is noted here for reference).
epochs = 5 if demo else 600

# Shared label encoder: fitted on the label column in the preprocessing cell
# and reused at the end of the notebook to decode predicted class indices.
converter = LabelEncoder()

In [None]:
# Training the model using the following parameters
# metrics = accuracy
# epochs = 600
# loss = sparse_categorical_crossentropy
# batch_size = 256
# optimizer = adam

def train_model(model,epochs,optimizer):
    """Compile ``model`` and fit it on the notebook-level train/test split.

    Reads the globals ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The model is (re)compiled here with sparse categorical cross-entropy and
    the accuracy metric, so any earlier ``compile`` call on ``model`` is
    replaced by this one.

    Parameters
    ----------
    model : Keras model to train.
    epochs : number of training epochs.
    optimizer : optimizer name or instance forwarded to ``model.compile``.

    Returns
    -------
    The object returned by ``model.fit`` (the training history).
    """
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=256,
    )
    return history

def Validation_plot(history):
    """Print the highest validation accuracy and plot all recorded metrics.

    Parameters
    ----------
    history : object with a ``history`` mapping that contains a
        ``"val_accuracy"`` entry (as returned by ``train_model``).
    """
    metrics = history.history
    best_val_accuracy = max(metrics["val_accuracy"])
    print("Validation Accuracy", best_val_accuracy)

    # One line per recorded metric, indexed by epoch.
    metrics_frame = pd.DataFrame(metrics)
    metrics_frame.plot(figsize=(12,6))
    plt.show()

def readdata(df, scaler=None, fit=True):
    """Drop the ``filename`` column and standardise the feature columns.

    Every column except the last one (the label) is converted to float and
    scaled.

    Scaling a frame with a scaler fitted on that same frame is only meaningful
    for the training table: a single-row frame standardised against itself
    collapses to zeros. Pass the training scaler with ``fit=False`` to scale
    new samples with the training statistics instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``filename`` column and the label in the last column.
    scaler : object, optional
        Scaler exposing ``fit_transform`` and ``transform``. When omitted, a
        new ``StandardScaler`` is created (the original behaviour).
    fit : bool, default True
        If True the scaler is fitted on ``df`` before transforming; if False
        the already-fitted ``scaler`` is applied unchanged.

    Returns
    -------
    tuple
        ``(X, df)``: the scaled feature matrix and the frame without the
        ``filename`` column.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``scaler`` is supplied.
    """
    df = df.drop(labels="filename", axis=1)
    features = np.array(df.iloc[:, :-1], dtype=float)

    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted scaler")
        scaler = StandardScaler()

    X = scaler.fit_transform(features) if fit else scaler.transform(features)
    return X, df

# def extractfeature(y, sr, filename='_', g='_'):  # 60 columns; missing rms, harmony, perceptr, tempo

#     chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
#     spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
#     spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
#     rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
#     zcr = librosa.feature.zero_crossing_rate(y)
#     mfcc = librosa.feature.mfcc(y=y, sr=sr)
#     to_append = f'{filename} {np.mean(chroma_stft)} {np.var(chroma_stft)} {np.mean(spec_cent)} {np.var(spec_cent)} {np.mean(spec_bw)} {np.var(spec_bw)} {np.mean(rolloff)} {np.var(rolloff)} {np.mean(zcr)} {np.var(zcr)}'
#     for e in mfcc:
#         to_append += f' {np.mean(e)} {np.var(e)}'
#     to_append += f' {g}'
#     return to_append


def extractfeature(y, sr, filename='_', g='_'):
    """Summarise an audio signal as one space-separated feature row.

    The fields follow the order of the module-level ``header``: filename,
    mean/variance of chroma, RMS, spectral centroid, spectral bandwidth,
    roll-off, zero-crossing rate, harmonic and percussive components, the
    tempo, mean/variance of every MFCC coefficient, and the label.

    Parameters
    ----------
    y : audio time series as returned by ``librosa.load``.
    sr : sampling rate of ``y``.
    filename : value written to the first field.
    g : label written to the last field.

    Returns
    -------
    str
        The fields joined by single spaces. Callers split this string on
        whitespace, so a ``filename`` or ``g`` containing spaces would shift
        the columns.
    """
    chroma_hop_length = 512 #5000?
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=chroma_hop_length)
    rms = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    harmony, perceptr = librosa.effects.hpss(y=y)
    mfcc = librosa.feature.mfcc(y=y,sr=sr, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
    tempo, _ = librosa.beat.beat_track(y=y, sr = sr)
    # beat_track returns the tempo as a scalar or as a one-element array
    # depending on the librosa version; normalise so both cases work.
    tempo = float(np.atleast_1d(tempo)[0])

    fields = [filename]
    for feature in (chroma_stft, rms, spec_cent, spec_bw, rolloff, zcr, harmony, perceptr):
        fields += [feature.mean(), feature.var()]
    fields.append(tempo)
    # One (mean, var) pair per coefficient row actually computed, rather than
    # a hard-coded 20.
    for coefficient in mfcc:
        fields += [coefficient.mean(), coefficient.var()]
    fields.append(g)
    return ' '.join(f'{field}' for field in fields)


def extractfeature_(y, sr, filename='_', g='_'):  # 27 columns
    """Build the reduced feature row: means only, no variances.

    Fields: filename, mean chroma, mean spectral centroid, mean spectral
    bandwidth, mean roll-off, mean zero-crossing rate, the mean of each MFCC
    coefficient (librosa's default coefficient count), and the label.

    Returns
    -------
    str
        The fields joined by single spaces.
    """
    spectral_features = (
        librosa.feature.chroma_stft(y=y, sr=sr),
        librosa.feature.spectral_centroid(y=y, sr=sr),
        librosa.feature.spectral_bandwidth(y=y, sr=sr),
        librosa.feature.spectral_rolloff(y=y, sr=sr),
        librosa.feature.zero_crossing_rate(y),
    )
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    values = [np.mean(feature) for feature in spectral_features]
    values.extend(np.mean(coefficient) for coefficient in mfcc)
    return ' '.join(f'{item}' for item in [filename, *values, g])


def big_cnn_model(train=True):
    """Train (or load) the larger dense classifier.

    Despite the name, the network is a stack of fully connected layers:
    Flatten, then Dense 512-256-128-64-32 with ReLU, each followed by
    Dropout(0.2), and a 10-way softmax output.

    Parameters
    ----------
    train : bool, default True
        If False, load and return the model stored in ``model.keras``.
        If True, build the network, train it with ``train_model``, evaluate it
        on the test split, plot the history and save it to ``model.keras``
        (the same file ``cnn_model`` writes to).

    Returns
    -------
    The trained or loaded Keras model.
    """
    # https://www.tensorflow.org/api_docs/python/tf/keras/models/load_model
    if not train:
        return tf.keras.models.load_model("model.keras")

    # Size the input from the training matrix: the notebook rebinds the global
    # X later on, whereas X_train is what the model is actually fitted on.
    layers = [
        tf.keras.layers.Flatten(input_shape=(X_train.shape[1],)),
        tf.keras.layers.Dropout(0.2),
    ]
    for units in (512, 256, 128, 64, 32):
        layers.append(tf.keras.layers.Dense(units, activation='relu'))
        layers.append(tf.keras.layers.Dropout(0.2))
    layers.append(tf.keras.layers.Dense(10, activation='softmax'))
    model = tf.keras.models.Sequential(layers)
    model.summary()

    # train_model() compiles the model itself, so the configured optimizer has
    # to be handed over; passing the string 'adam' silently dropped the custom
    # learning rate.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.000146)
    model_history = train_model(model=model, epochs=epochs, optimizer=optimizer)

    test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=256)
    print("The test loss is ",test_loss)
    print("The best accuracy is: ",test_acc*100)

    Validation_plot(model_history)
    model.save("model.keras")
    return model


def cnn_model(train=True):
    """Train (or load) the smaller dense classifier.

    Despite the name, the network is fully connected: Dense 256-128-64 with
    ReLU followed by a 10-way softmax output.

    Parameters
    ----------
    train : bool, default True
        If False, load and return the model stored in ``model.keras``.
        If True, build the network, train it with ``train_model``, evaluate it
        on the test split, plot the history and save it to ``model.keras``
        (the same file ``big_cnn_model`` writes to).

    Returns
    -------
    The trained or loaded Keras model.
    """
    # https://www.tensorflow.org/api_docs/python/tf/keras/models/load_model
    if not train:
        return tf.keras.models.load_model("model.keras")

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
    for units in (128, 64):
        model.add(tf.keras.layers.Dense(units, activation='relu'))
    model.add(tf.keras.layers.Dense(10, activation='softmax'))
    model.summary()

    # train_model() compiles the model itself, so the configured optimizer has
    # to be handed over; passing the string 'adam' silently dropped the custom
    # learning rate.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.000146)
    model_history = train_model(model=model, epochs=epochs, optimizer=optimizer)

    test_loss, test_acc = model.evaluate(X_test, y_test, batch_size=256)
    print("The test loss is ",test_loss)
    print("The best accuracy is: ",test_acc*100)

    Validation_plot(model_history)
    model.save("model.keras")
    return model

## tiền xử lí dữ liệu 

In [None]:
def predat_():
    """Load the feature table stored at ``<path>/Data/features_3_sec.csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents as read by ``pd.read_csv``.
    """
    csv_path = f"{path}/Data/features_3_sec.csv"
    return pd.read_csv(csv_path)

def predat():
    """Optionally rebuild the feature CSV from the audio files, then load it.

    When the module-level ``pre_dataset`` flag is set, every file under
    ``<path>/Data/genres_original/<genre>`` (only the first five per genre in
    ``demo`` mode) is loaded with librosa, summarised by ``extractfeature``
    and written to ``<path>/try_running__features_3_sec.csv``, replacing any
    existing file. The CSV is then read back.

    Returns
    -------
    pandas.DataFrame
        Contents of ``try_running__features_3_sec.csv``.
    """
    csv_path = f'{path}/try_running__features_3_sec.csv'
    if pre_dataset:
        # Keep one handle open for the header and all rows instead of
        # reopening the file for every song.
        with open(csv_path, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            for g in genres:
                print(g)
                # Sorted so the demo subset and the row order are deterministic.
                ldir = sorted(os.listdir(f'{path}/Data/genres_original/{g}'))
                listfn = ldir[:5] if demo else ldir
                for filename in listfn:
                    songname = f'{path}/Data/genres_original/{g}/{filename}'
                    y, sr = librosa.load(songname, mono=True, duration=30)
                    to_append = extractfeature(y, sr, filename, g)
                    writer.writerow(to_append.split())
    return pd.read_csv(csv_path)

# Load the feature table through predat_() (reads Data/features_3_sec.csv)
# and preview its first rows.
df = predat_()
df.head()

In [None]:
# Show the rows whose filename is exactly 'jazz.00029.wav'.
df.loc[df['filename'] == 'jazz.00029.wav']

In [None]:
# Report which columns, if any, contain missing values.
print("Columns containing missing values",list(df.columns[df.isnull().any()]))

# Label encoding - encode the categorical classes as integers for training.
# Expected mapping, assuming the label column holds the ten genre names
# (LabelEncoder assigns indices in sorted order):
# Blues - 0
# Classical - 1
# Country - 2
# Disco - 3
# Hip-hop - 4
# Jazz - 5
# Metal - 6
# Pop - 7
# Reggae - 8
# Rock - 9
class_encod = df.iloc[:,-1]
Y = converter.fit_transform(class_encod)

# readdata() drops the filename column and standardises the features.
# Note that df is rebound to the frame without 'filename'.
X, df = readdata(df)
# df = df.drop(labels="length",axis=1)
# Fixed seed so the 70/30 split, and therefore the reported accuracy, is
# reproducible across Restart & Run All.
X_train,X_test,y_train,y_test=train_test_split(X, Y, test_size=0.3, random_state=42)

In [None]:
# NOTE(review): list() over a DataFrame yields its column labels, so this
# prints the first five column names of df.isnull(), not any null flags -
# confirm this is the intended check.
print(list(df.isnull())[:5])

## CNN model

In [None]:
# Alternative: the smaller network, cnn_model(True) to train or cnn_model(False) to load.
# model = cnn_model(False)
# Train the larger network; use big_cnn_model(False) to load the saved model.keras instead.
model = big_cnn_model(True)

Trích xuất đặc trưng, lưu vào CSV (feature extraction, saved to CSV). Reference: https://github.com/danyalimran93/Music-Emotion-Recognition/blob/master/Feature-Extraction.py

In [None]:
# Start a fresh single-song CSV containing only the header row.
with open('try_running__Extracted___Data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

# Load up to the first 30 seconds of the demo track as mono audio.
y, sr = librosa.load(songname, mono=True, duration=30)

In [None]:
# Extract the feature row for the loaded track, with placeholder filename and label.
to_append = extractfeature(y, sr, filename='_', g='_')
# NOTE(review): to_append is a string, so this is its character count, not the
# number of fields - use len(to_append.split()) to compare against the header.
len(to_append)

In [None]:
# Append the extracted feature row below the header.
with open('try_running__Extracted___Data.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(to_append.split())

# Read the single-song table back and run it through the same preprocessing.
df = pd.read_csv('try_running__Extracted___Data.csv')
# # assert len(list(df.columns[df.isnull().any()])) > 0
# NOTE(review): readdata() fits a new scaler on this frame; if it holds only
# the one row written above, every standardised feature presumably becomes 0.
# Confirm, and consider scaling with the statistics of the training data.
X, df = readdata(df)
# df = df.drop(labels="label",axis=1)
df.head()

In [None]:
# Predict class probabilities for the rows in X.
pred_x = model.predict(X)
# Index of the most probable class for each row.
pred_ind = np.argmax(pred_x, axis=1)
# print(labels[pred_ind])
# Decode the first predicted index back to its label with the fitted encoder.
print(converter.inverse_transform(pred_ind)[:1])
# print(pred_ind)
# print(class_encod)