## Importing Libraries

In [None]:
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import csv
import base64
from IPython.display import HTML
# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import sklearn.model_selection as skms
#Keras
import keras
from keras import models
from keras import layers
from tensorflow import keras
from warnings import filterwarnings
# Print the full path of every file found under the local genre folder.
for root, _subdirs, names in os.walk('Dataset/Data/genres_original/'):
    for name in names:
        print(os.path.join(root, name))

## Feature Extraction

First, set the path to the directory where the dataset is stored.

In [None]:
# Root of the GTZAN dataset; the per-genre folders live under <general_path>/genres/.
general_path = '../input/gtzan-genre-collection'
genre_folders = os.listdir(f'{general_path}/genres/')
print(genre_folders)

Before splitting the audio files, create an empty directory for each genre.

In [None]:
# The ten genre names, in alphabetical order.
genres = 'blues classical country disco hiphop jazz metal pop reggae rock'
genres = genres.split()

# One output directory per genre for the 3-second clips.
for g in genres:
    path_audio = os.path.join('/kaggle/working/content/audio3sec', f'{g}')
    # exist_ok=True keeps this cell re-runnable: without it a second run
    # raises FileExistsError because the directories are already there.
    os.makedirs(path_audio, exist_ok=True)

Now we will use `AudioSegment` from the pydub package to split each audio file into ten 3-second clips.

In [None]:
from pydub import AudioSegment

# Split every track into ten consecutive 3-second WAV clips, saved as
# <genre><track number><clip index>.wav under /kaggle/working/content/audio3sec/<genre>/.
i = 0  # running count of exported clips
for g in genres:
    j = 0  # 1-based track counter within the current genre
    # sorted() makes the track numbering independent of the arbitrary order
    # in which os.listdir returns the files.
    for filename in sorted(os.listdir(f'{general_path}/genres/{g}')):
        song = f'{general_path}/genres/{g}/{filename}'

        j = j+1
        # Decode the track once; every clip below is a slice of this audio.
        newAudio = AudioSegment.from_file(song, 'au')
        for w in range(0,10):
            i = i+1
            t1 = 3*(w)*1000  # pydub slices are indexed in milliseconds
            t2 = 3*(w+1)*1000
            new = newAudio[t1:t2]
            new.export(f'/kaggle/working/content/audio3sec/{g}/{g+str(j)+str(w)}.wav', format="wav")

Create the header row for the CSV file that will store the feature extraction results.

In [None]:
# Column names for the feature CSV: the clip name, six summary features,
# twenty MFCC means (mfcc1..mfcc20) and finally the genre label.
header = 'filename chroma_stft rmse spectral_centroid spectral_bandwidth rolloff zero_crossing_rate'.split()
header.extend(f'mfcc{idx}' for idx in range(1, 21))
header.append('label')

Now we will use librosa to extract features from the audio clips.

In [None]:
# Extract one row of summary features per 3-second clip and write it to data-3s.csv.
# The CSV is opened once for the whole run, and every row is handed to csv.writer
# as a list, so a clip name that contains spaces still lands in a single column.
with open('data-3s.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)

    for g in genres:
        clip_dir = f'/kaggle/working/content/audio3sec/{g}'
        for filename in os.listdir(clip_dir):
            songname = f'{clip_dir}/{filename}'
            y, sr = librosa.load(songname, mono=True, duration=3)
            chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
            rmse = librosa.feature.rms(y=y)
            spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
            spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            zcr = librosa.feature.zero_crossing_rate(y)
            mfcc = librosa.feature.mfcc(y=y, sr=sr)

            # Same column order as `header`: name, six means, MFCC means, label.
            row = [filename, np.mean(chroma_stft), np.mean(rmse), np.mean(spec_cent),
                   np.mean(spec_bw), np.mean(rolloff), np.mean(zcr)]
            row.extend(np.mean(e) for e in mfcc)  # one mean per row of the MFCC matrix
            row.append(g)
            writer.writerow(row)

## Data Preparation

In [None]:
# Load the per-clip feature table written to data-3s.csv and preview its first rows.
data_3s = pd.read_csv('data-3s.csv')
data_3s.head()

Show the number of rows and columns in the extracted feature data.

In [None]:
# (number of rows, number of columns) of the feature table.
data_3s.shape

Create a download link for the extracted feature data.

In [None]:
def create_download_link(df, title = "Download CSV file", filename = "data-features.csv"):
    """Return an IPython ``HTML`` anchor that downloads ``df`` as a CSV file.

    The frame is serialised with ``df.to_csv()``, base64-encoded and embedded
    in a ``data:text/csv`` URI, so no file is written to disk.

    Args:
        df: DataFrame to export.
        title: Text shown for the link.
        filename: Name suggested to the browser for the downloaded file.

    Returns:
        An ``HTML`` object wrapping the ``<a>`` element.
    """
    encoded_csv = base64.b64encode(df.to_csv().encode()).decode()
    anchor = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(anchor.format(payload=encoded_csv, title=title, filename=filename))

# Render the download link for the extracted feature table.
create_download_link(data_3s)

In [None]:
# One seed shared by NumPy, the shuffle below, the train/dev/test splits and TensorFlow.
seed = 12
np.random.seed(seed)

# sample(frac=1) returns every row in random order; reset_index(drop=True) renumbers them from 0.
data_shuffle = data_3s.sample(frac=1, random_state=seed).reset_index(drop=True)

Drop the `filename` column, which is not used as a feature.

In [None]:
# Remove the clip name so only the numeric features and the label remain.
data_shuffle = data_shuffle.drop(['filename'],axis=1)
data_shuffle.head()

## Set Independent and Dependent Values

In [None]:
# Feature matrix: every column except the last one, which holds the genre label.
X = data_shuffle.iloc[:, :-1]

In [None]:
# Target: the last column holds the genre name of each clip.
genre_list = data_shuffle.iloc[:, -1]
encoder = LabelEncoder()

# Map each genre name to an integer class code.
y = encoder.fit_transform(genre_list)
print(y)

## Splitting the dataset into the Training set and Test set

In [None]:
# First split: 70% of the rows for training, stratified by genre.
X_train, df_test_valid_X, y_train, df_test_valid_y = skms.train_test_split(X, y, train_size=0.7, random_state=seed, stratify=y)
# Second split: 66% of the remaining 30% becomes the dev set (about 20% of all rows),
# and the rest becomes the test set (about 10% of all rows).
X_dev, X_test, y_dev, y_test = skms.train_test_split(df_test_valid_X, df_test_valid_y, train_size=0.66, random_state=seed, stratify=df_test_valid_y)

In [None]:
# Report the size of each split and its share of the whole dataset.
total_records = len(data_shuffle)
for split_name, split_frame in (("Train", X_train), ("Dev", X_dev), ("Test", X_test)):
    n_records = split_frame.shape[0]
    print(f"{split_name} set has {n_records} records out of {total_records} which is {round(n_records/total_records*100)}%")

## Feature Scaling

In [None]:
import sklearn.preprocessing as skp

scaler = skp.StandardScaler()
feature_columns = X_train.columns

# The scaler is fitted on the training split only; the dev and test splits
# are transformed with those training statistics.
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_columns)
X_dev = pd.DataFrame(scaler.transform(X_dev), columns=feature_columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_columns)

## Model Building

In [None]:
import tensorflow as tf
print("TF version:-", tf.__version__)
# `k` is the alias the callback and model cells below use for Keras.
import keras as k
# Seed TensorFlow with the same seed used for the shuffle and the splits.
tf.random.set_seed(seed)

In [None]:
# Training stops as soon as validation accuracy exceeds this value.
ACCURACY_THRESHOLD = 0.94

class myCallback(k.callbacks.Callback):
    """Stop training once validation accuracy exceeds ``ACCURACY_THRESHOLD``."""

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's validation accuracy and request a stop if it is high enough.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric values reported for the epoch; may be None.
        """
        val_accuracy = (logs or {}).get('val_accuracy')
        # 'val_accuracy' is missing when there is no validation data or no accuracy
        # metric; skip the check instead of raising TypeError on `None > float`.
        if val_accuracy is None:
            return
        if val_accuracy > ACCURACY_THRESHOLD:
            print("\n\nStopping training as we have reached %2.2f%% accuracy!" %(ACCURACY_THRESHOLD*100))
            self.model.stop_training = True

def trainModel(model, epochs, optimizer, batch_size=128):
    """Compile ``model`` and fit it on the training split, validating on the dev split.

    Training uses the notebook-level ``X_train``/``y_train`` and
    ``X_dev``/``y_dev`` and stops early through ``myCallback``.

    Args:
        model: Keras model to compile and train.
        epochs: Maximum number of epochs.
        optimizer: Optimizer name or instance passed to ``compile``.
        batch_size: Mini-batch size (128 by default).

    Returns:
        The value returned by ``model.fit``.
    """
    callback = myCallback()
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  # Passed as a list: Keras 3 rejects a bare string for `metrics`.
                  metrics=['accuracy']
    )
    return model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=epochs,
                     batch_size=batch_size, callbacks=[callback])

def plotHistory(history):
    """Print the best validation accuracy and plot every metric recorded during training.

    Args:
        history: Object returned by ``model.fit``; its ``history`` attribute maps
            each metric name to its per-epoch values.
    """
    per_epoch = history.history
    best_val_accuracy = max(per_epoch["val_accuracy"])
    print("Max. Validation Accuracy", best_val_accuracy)
    pd.DataFrame(per_epoch).plot(figsize=(12,6))
    plt.show()

In [None]:
# Model 1: three hidden ReLU layers and a 10-unit softmax output, no dropout.
model_1 = k.models.Sequential([
    k.layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    k.layers.Dense(128, activation='relu'),
    k.layers.Dense(64, activation='relu'),
    k.layers.Dense(10, activation='softmax'),
])
# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line.
model_1.summary()
model_1_history = trainModel(model=model_1, epochs=70, optimizer='adam')

In [None]:
# Training curves for model_1.
plotHistory(model_1_history)

In [None]:
# Model 2: four hidden ReLU layers, each followed by 20% dropout, trained with Adam.
model_2 = k.models.Sequential([
    k.layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    k.layers.Dropout(0.2),

    k.layers.Dense(256, activation='relu'),
    k.layers.Dropout(0.2),

    k.layers.Dense(128, activation='relu'),
    k.layers.Dropout(0.2),

    k.layers.Dense(64, activation='relu'),
    k.layers.Dropout(0.2),

    k.layers.Dense(10, activation='softmax'),
])
# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line.
model_2.summary()
model_2_history = trainModel(model=model_2, epochs=100, optimizer='adam')

In [None]:
# Training curves for model_2.
plotHistory(model_2_history)

In [None]:
# Model 3: same layers as model_2, but trained with SGD for up to 700 epochs.
model_3 = k.models.Sequential([
    k.layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    k.layers.Dropout(0.2),

    k.layers.Dense(256, activation='relu'),
    k.layers.Dropout(0.2),

    k.layers.Dense(128, activation='relu'),
    k.layers.Dropout(0.2),

    k.layers.Dense(64, activation='relu'),
    k.layers.Dropout(0.2),

    k.layers.Dense(10, activation='softmax'),
])
# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line.
model_3.summary()
model_3_history = trainModel(model=model_3, epochs=700, optimizer='sgd')

In [None]:
# Training curves for model_3.
plotHistory(model_3_history)

In [None]:
# Model 4: five hidden ReLU layers (1024 down to 64 units), each followed by
# 30% dropout, trained with RMSprop for up to 500 epochs.
model_4 = k.models.Sequential([
    k.layers.Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
    k.layers.Dropout(0.3),

    k.layers.Dense(512, activation='relu'),
    k.layers.Dropout(0.3),

    k.layers.Dense(256, activation='relu'),
    k.layers.Dropout(0.3),

    k.layers.Dense(128, activation='relu'),
    k.layers.Dropout(0.3),

    k.layers.Dense(64, activation='relu'),
    k.layers.Dropout(0.3),

    k.layers.Dense(10, activation='softmax'),
])
# summary() prints the table itself and returns None, so wrapping it in print()
# would add a stray "None" line.
model_4.summary()
model_4_history = trainModel(model=model_4, epochs=500, optimizer='rmsprop')

In [None]:
# Training curves for model_4.
plotHistory(model_4_history)

## Predict the Test set results

In [None]:
# Evaluate model_4 on the held-out test split.
# NOTE(review): the message says "Best", but this cell always evaluates model_4 — confirm it is the model intended.
test_loss, test_acc  = model_4.evaluate(X_test, y_test, batch_size=128)
print("The test Loss is :",test_loss)
print("\nThe Best test Accuracy is :",test_acc*100)

In [None]:
# Softmax outputs for every test row; argmax of the first row is that clip's predicted class code.
predictions = model_4.predict(X_test)
np.argmax(predictions[0])

## Genre prediction on YouTube videos

In [None]:
pip install youtube-dl ffmpeg-python pydub

In [None]:
from __future__ import unicode_literals
import youtube_dl
import ffmpeg  # not used for the download itself: youtube-dl's post-processor does the conversion

# Download the best available audio stream and let the FFmpegExtractAudio
# post-processor convert it to WAV. With this output template the converted
# file is expected to be 'output.wav', which the next cell reads.
ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': 'output.%(ext)s',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }],
}

# Ask for the URL before opening the downloader so the context manager only wraps the download.
url = input("Enter Youtube URL: ").strip()

# No separate ffmpeg-python step is needed: an ffmpeg.input(...)/ffmpeg.output(...)
# graph does nothing until it is run, and the post-processor above already writes the WAV.
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download([url])

In [None]:
from pydub import AudioSegment

# Clip boundaries in milliseconds: keep the 30 seconds between 60 s and 90 s of the download.
t1 = 60000 # clip start (pydub slices are indexed in milliseconds)
t2 = 90000 # clip end

# NOTE(review): assumes the downloaded audio is at least 90 s long — confirm how shorter inputs should be handled.
waveFile = AudioSegment.from_file("output.wav")
waveFile = waveFile[t1:t2]
waveFile.export('output_30s.wav', format="wav")

In [None]:
def fit_feature(songname):
    """Build the feature vector of one audio file.

    Loads up to 30 seconds of mono audio and returns, in this order, the means
    of chroma STFT, RMS, spectral centroid, spectral bandwidth, spectral
    rolloff and zero-crossing rate, followed by the mean of each row of the
    MFCC matrix.

    Args:
        songname: Path of the audio file to analyse.

    Returns:
        1-D NumPy array of feature means.
    """
    signal, sample_rate = librosa.load(songname, mono=True, duration=30)
    extract = librosa.feature
    summary_frames = [
        extract.chroma_stft(y=signal, sr=sample_rate),
        extract.rms(y=signal),
        extract.spectral_centroid(y=signal, sr=sample_rate),
        extract.spectral_bandwidth(y=signal, sr=sample_rate),
        extract.spectral_rolloff(y=signal, sr=sample_rate),
        extract.zero_crossing_rate(signal),
    ]
    mfcc_rows = extract.mfcc(y=signal, sr=sample_rate)

    means = [np.mean(frames) for frames in summary_frames]
    means.extend(np.mean(row) for row in mfcc_rows)
    return np.array(means)

In [None]:
file_path = "output_30s.wav"
feature = fit_feature(file_path)

# The scaler was fitted on a DataFrame, so give it the same column names.
feature_frame = pd.DataFrame([feature], columns=X_train.columns)

# Stored under its own name so the label array `y` used by the split cell is not overwritten.
genre_probs = model_4.predict(scaler.transform(feature_frame))
ind = np.argmax(genre_probs)

# Decode the class index with the fitted LabelEncoder instead of relying on
# the `genres` list happening to be in the encoder's order.
str(encoder.classes_[ind])