# Music Genre Classification

**Import libraries**

In [37]:
import matplotlib.pyplot as plt
import numpy as np
import glob
import os
import PIL
import tensorflow as tf
import cv2
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.models import Sequential
from tensorflow.keras.applications import EfficientNetB0, EfficientNetB3, MobileNetV2, InceptionV3, ResNet152V2
from tensorflow.keras.optimizers import Adam
import numpy as np
from sklearn.model_selection import train_test_split

### Get data

In [38]:
img_data = '../input/gtzan-dataset-music-genre-classification/Data/images_original/'

### Data Loader

In [39]:
BATCH_SIZE=8
TARGET_SIZE=224 # Based on keras pew-trained models
NUM_CLASSES=10

In [40]:
train_ds = image_dataset_from_directory(
  img_data,
  validation_split=0.2,
  subset="training",
  seed=99,
  image_size=(TARGET_SIZE, TARGET_SIZE),
  batch_size=BATCH_SIZE)

In [41]:
val_ds = image_dataset_from_directory(
  img_data,
  validation_split=0.2,
  subset="validation",
  seed=99,
  image_size=(TARGET_SIZE, TARGET_SIZE),
  batch_size=BATCH_SIZE)

In [42]:
class_names = train_ds.class_names
print(class_names)

In [43]:
#GTZAN Dataset tags
TAGS = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
TAGS_CODES = dict()
i = 0
for tag in TAGS:
    i+=1
    TAGS_CODES[tag] = i
# data = pd.DataFrame(columns=["img", "class"])
DATADIR = "../input/gtzan-dataset-music-genre-classification/Data/images_original"
X = []
Y = []
for tag in TAGS:
    path = os.path.join(DATADIR, tag)
    for img in os.listdir(path):
        img_array = cv2.imread(os.path.join(path, img))
        new_data = dict()
        X.append( img_array)
        Y.append( TAGS_CODES[tag])


# The dataset content is consists out:
1- genres original - A collection of 10 genres with 100 audio files each, all having a length of 30 seconds (the famous GTZAN dataset, the MNIST of sounds)                                                                                   
2- images original - A visual representation for each audio file. One way to classify data is through neural networks. Because NNs (like CNN, what we will be using today) usually take in some sort of image representation, the audio files were converted to Mel Spectrograms to make this possible.                                                                
We will use image original folder


In [44]:
plt.figure(figsize=(25, 25))
for i in range(8):
    ax = plt.subplot(4, 4, i + 1)
    plt.imshow(X[i].astype("uint8"))
    plt.axis("off")


## Callbacks and Helper Functions

In [45]:
model_save = tf.keras.callbacks.ModelCheckpoint('./best_weights.h5', 
                             save_best_only = True, 
                             save_weights_only = True,
                             monitor = 'val_loss', 
                             mode = 'min', verbose = 1)
early_stop = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', min_delta = 0.001, 
                           patience = 10, mode = 'min', verbose = 1,
                           restore_best_weights = True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', factor = 0.3, 
                              patience = 2, min_delta = 0.001, 
                              mode = 'min', verbose = 1)

In [46]:
def plot_hist(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']

    loss = history.history['loss']
    val_loss = history.history['val_loss']

    plt.figure(figsize=(10, 5))
    
    plt.subplot(1, 2, 1)
    plt.plot(acc, label='Training Accuracy')
    plt.plot(val_acc, label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')
    plt.grid()

    plt.subplot(1, 2, 2)
    plt.plot(loss, label='Training Loss')
    plt.plot(val_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.grid()
    plt.show()

Shuffle Data and Image  (discard)

In [47]:
# numpy.array([1.2, "abc"], dtype=float)
X =  np.array(X)
Y = np.array(Y)
X = X/255
X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.33, random_state=42)

## Modeling

In [48]:
model = Sequential([
  layers.experimental.preprocessing.Rescaling(1./255, input_shape=(TARGET_SIZE, TARGET_SIZE, 3)),
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(32, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(NUM_CLASSES)
])

In [49]:
model.compile(optimizer=Adam(lr = 0.001),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [50]:
model.summary()


In [51]:
epochs=15
history = model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs,
  callbacks=[model_save, early_stop, reduce_lr],
  verbose=2
)

In [52]:
plot_hist(history)

### **We notice that the validation loss in first 3 epochs is improving. Val accuracy is about 50 %. With number of epochs increased val accuracy goes for 62 % . However, val_loss did not improve from 1.30901. call-back function that reduces learning rate is called. Best Val accuracy = 64 % while training accuracy near 100% (over fit)**

### **Dropout Is A soltion for Overfitting**

## CNN with Dropout

In [53]:
model = Sequential([
  layers.experimental.preprocessing.Rescaling(1./255, input_shape=(TARGET_SIZE, TARGET_SIZE, 3)),
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Dropout(0.4),
  layers.Conv2D(16, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Dropout(0.4),
  layers.Conv2D(32, 3, padding='same', activation='relu'),
  layers.MaxPooling2D(),
  layers.Dropout(0.4),
  layers.Flatten(),
  layers.Dense(128, activation='relu'),
  layers.Dense(NUM_CLASSES)
])

In [54]:
model.compile(optimizer=Adam(lr = 0.001),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [55]:
model.summary()

In [56]:
epochs = 15
history = model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs,
  callbacks=[model_save, early_stop, reduce_lr],
  verbose=2
)

In [57]:
plot_hist(history)

### **Also after drop-out, the same problem of overfitting still exists. The accuracy increased a little bit in range 60 % after adding three droput layers. We need a new soltion for this since the problem appeared because of the small data set amount. So, Trransfer learning will be used to use a pre-trained model.**

### 1. ResNet152V2 took a lot of time before it completes its first epoch.
### 2. I tried both of InceptionV3 and MobileNetV2 in a separate notebook. I found that InceptionV3 is better than MobileNetV2.

# Transfer learning - InceptionV3

In [67]:
def create_model():
    conv_base = InceptionV3(include_top = False, weights = "../input/keras-pretrained-models/InceptionV3_NoTop_ImageNet.h5",
                               input_shape = (TARGET_SIZE, TARGET_SIZE, 3))
    model = conv_base.output
    model = layers.GlobalAveragePooling2D()(model)
    model = layers.Dense(NUM_CLASSES, activation = "softmax")(model)
    model = models.Model(conv_base.input, model)

    model.compile(optimizer = Adam(lr = 0.001),
                  loss = "sparse_categorical_crossentropy",
                  metrics = ["accuracy"])
    return model
INCmodel = create_model()
#INCmodel.summary()

In [68]:
epochs = 15
history = INCmodel.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs,
  callbacks=[model_save, early_stop, reduce_lr],
  verbose=2
)

In [69]:
plot_hist(history)

### Validation Accuracy is around 73 %

# Transfer learning - EfficientNetB0

In [70]:
def create_model():
    conv_base = EfficientNetB0(include_top = False, weights = "../input/keras-pretrained-models/EfficientNetB0_NoTop_ImageNet.h5", drop_connect_rate=0.7,
                               input_shape = (TARGET_SIZE, TARGET_SIZE, 3))
    model = conv_base.output
    model = layers.GlobalAveragePooling2D()(model)
    model = layers.Dense(NUM_CLASSES, activation = "softmax")(model)
    model = models.Model(conv_base.input, model)

    model.compile(optimizer = Adam(lr = 0.001),
                  loss = "sparse_categorical_crossentropy",
                  metrics = ["accuracy"])
    return model
model = create_model()
#model.summary()

In [72]:
epochs = 15
history = model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs,
  callbacks=[model_save, early_stop, reduce_lr],
  verbose=2
)

In [73]:
plot_hist(history)

# **Best Validation accuracy 80 %**

## EfficientNetB0 with these custumization is better in our case than InceptionV3 & MobilNetV2 models

## Transfer learning - EfficientNetB0 (the usual way of transfer learning)

In [78]:
def create_model():
    conv_base = EfficientNetB0(include_top = False, weights = "../input/keras-pretrained-models/EfficientNetB0_NoTop_ImageNet.h5", drop_connect_rate=0.6,
                               input_shape = (TARGET_SIZE, TARGET_SIZE, 3))
    # Freeze pre-trained layers
    conv_base.trainable = False
    
    # Re-build top layers
    model = conv_base.output
    model = layers.GlobalAveragePooling2D()(model)
    model = layers.BatchNormalization()(model)
    
    dropout_rate=0.3
    model = layers.Dropout(dropout_rate, name="top_dropout")(model)
    model = layers.Dense(NUM_CLASSES, activation = "softmax")(model)
    model = models.Model(conv_base.input, model)

    model.compile(optimizer = Adam(lr = 0.01),
                  loss = "sparse_categorical_crossentropy",
                  metrics = ["accuracy"])
    return model

model = create_model()
#model.summary()

In [80]:
epochs = 15
history = model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs,
  callbacks=[model_save, early_stop, reduce_lr],
  verbose=2
)

In [81]:
plot_hist(history)

# UNFREEZE (FINE-Tuning)

**This strategy is based on not
only replace the classifier layer of the network, but also retrain part or the whole network. Through backpropagation
we can modify the weights of the pre-trained model to adapt
the model to the new data distribution. Sometimes its preferable to keep the first layers of the network fixed (or freezed)
to avoid overfitting, and only fine-tune the deeper part. This
is motivated because the lower layers of the networks capture generic features, that are similar to many tasks while the
higher layers contain features that are task and dataset oriented.**

In [82]:
weights_path = './last_finetune_weights.h5'
model.save_weights(weights_path)

### Unfreeze 100 layers

In [83]:
NUM_UNFREEZE_LAYERS = 100

last_model = tf.keras.models.clone_model(model)
last_model.load_weights(weights_path)

def unfreeze_model(model):
    # We unfreeze the top NUM_UNFREEZE_LAYERS layers while leaving BatchNorm layers frozen
    for layer in model.layers[-NUM_UNFREEZE_LAYERS:]:
        if not isinstance(layer, layers.BatchNormalization):
            layer.trainable = True

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(
        optimizer=optimizer, loss="sparse_categorical_crossentropy", 
        metrics=["accuracy"]
    )


unfreeze_model(last_model)
#cont_model.summary()

In [85]:
epochs = 30
history = model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs,
  callbacks=[model_save, early_stop, reduce_lr],
  verbose=2
)

In [86]:
plot_hist(history)

### Validation Accuracy is around 73 %

### Unfreeze ALL layers

In [87]:
all_model = tf.keras.models.clone_model(model)
all_model.load_weights(weights_path)

def fix_all(model):
    # We unfreeze the whole layers while leaving BatchNorm layers frozen
    for layer in model.layers:
        if not isinstance(layer, layers.BatchNormalization):
            layer.trainable = True

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(
        optimizer=optimizer, loss="sparse_categorical_crossentropy", 
        metrics=["accuracy"]
    )


fix_all(all_model)
#fix_all.summary()

In [89]:
epochs = 15  # @param {type: "slider", min:8, max:50}
history = all_model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs,
  callbacks=[model_save, early_stop, reduce_lr],
  verbose=2
)

In [90]:
plot_hist(history)

**--> Range of 76% accuracy is achieved after Unfreeze all of the layers**

**--> Although there is an increasing in validation accuracy,the problem of overfitting still appears as a result of lack training data.**

**--> Ordinary data augmentation may not be feasible for song data like GTZAN, because:**
*      Cannot use typical transformations like rotation, zoom, flipping because spectrogram would be non-sense
*      Cannot use audio transformation because this will distort the original song.**

### Useful Links
- https://www.tensorflow.org/tutorials/images/classification
- https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning/