Generative music to facial landmarking

 - code adapted from https://www.kaggle.com/code/ohseokkim/music-generation-let-s-enjoy-new-music/notebook
 - https://git.arts.ac.uk/lmccallum/STEM-4-Creatives-22-23/blob/main/STEM-Week-5-Task-Solutions.ipynb
- audio classification code adapted from https://git.arts.ac.uk/tbroad/AI-4-Media-23-24/blob/main/Week-6a-Audio-classification/01-train-audio-classifier-solution.ipynb

In [2]:
!pip install librosa

Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m260.1/260.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting audioread>=2.1.9
  Downloading audioread-3.0.1-py3-none-any.whl (23 kB)
Collecting pooch>=1.1
  Downloading pooch-1.8.2-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.6/64.6 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting soxr>=0.3.2
  Downloading soxr-0.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (252 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.9/252.9 kB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
Collecting numba>=0.51.0
  Downloading numba-0.60.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m86.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
Collecting lazy-loader>=0.1
  Down

In [11]:
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical

# Set dataset path
dataset_path = '/notebooks/Music dataset'

# Initialize lists for audio features and labels
audio_features = []
emotion_labels = []

# Debugging: Track files without labels
unlabeled_files = []

# Load audio files and extract features
for file in os.listdir(dataset_path):
    if file.endswith('.wav'):  # Adjust the file extension as needed
        file_path = os.path.join(dataset_path, file)
        try:
            y, sr = librosa.load(file_path, sr=None)  # Load audio file
            # Extract MFCC features
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            mfccs = np.mean(mfccs.T, axis=0)
            audio_features.append(mfccs)
            
            # Label extraction based on filename conventions
            if 'neutral' in file:
                emotion_labels.append(0)  # Neutral
            elif 'sad' in file:
                emotion_labels.append(1)  # Sad
            elif 'happy' in file:
                emotion_labels.append(2)  # Happy
            elif 'angry' in file:
                emotion_labels.append(3)  # Angry
            else:
                unlabeled_files.append(file)  # Track files with no matching label
                
        except Exception as e:
            print(f"Error loading file {file}: {e}")  # Catch loading errors

# Convert lists to numpy arrays
audio_features = np.array(audio_features)
emotion_labels = np.array(emotion_labels)

# Print details of files without labels
if unlabeled_files:
    print(f"Files without labels: {unlabeled_files}")

# Check if the lengths of features and labels match
print(f"Number of features: {len(audio_features)}, Number of labels: {len(emotion_labels)}")  # Debugging step

# If there's a mismatch, exit early and check the data
if len(audio_features) != len(emotion_labels):
    raise ValueError("The number of features and labels do not match. Please check the data.")

# Standardize the audio features
scaler = StandardScaler()
audio_features = scaler.fit_transform(audio_features)

# Convert labels to categorical (one-hot encoding)
emotion_labels_categorical = to_categorical(emotion_labels, num_classes=4)

# Build LSTM model
model = Sequential([
    LSTM(128, input_shape=(audio_features.shape[1], 1), return_sequences=True),
    Dropout(0.3),
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(4, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Reshape audio features to fit LSTM input requirements
audio_features_reshaped = audio_features[..., np.newaxis]

# Validate shapes before training
print(f"Shape of input features: {audio_features_reshaped.shape}, Shape of labels: {emotion_labels_categorical.shape}")  # Debugging step

# Fit the model (for demonstration purposes; replace with real training process)
model.fit(audio_features_reshaped, emotion_labels_categorical, epochs=10, batch_size=32)

# Generate random music snippet (placeholder for real music generation)
generated_snippet = np.random.normal(size=(1, audio_features.shape[1]))
generated_snippet_scaled = scaler.transform(generated_snippet)  # Scale the snippet
generated_snippet_reshaped = generated_snippet_scaled[..., np.newaxis]

# Predict emotion of the generated snippet
predicted_emotion = np.argmax(model.predict(generated_snippet_reshaped), axis=-1)

# Reduce dimensions with PCA for 2D visualization
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(audio_features)

# Plotting the classified snippets
plt.figure(figsize=(8, 8))
colors = ['blue', 'red', 'green', 'orange']
labels_map = {0: 'Neutral', 1: 'Sad', 2: 'Happy', 3: 'Angry'}

# Plot each emotion class in different colors
for i in np.unique(emotion_labels):
    plt.scatter(reduced_data[emotion_labels == i, 0], 
                reduced_data[emotion_labels == i, 1], 
                label=labels_map[i], c=colors[i])

plt.axhline(0, color='black', linewidth=0.5)
plt.axvline(0, color='black', linewidth=0.5)
plt.grid(True, linestyle='--', alpha=0.7)
plt.title('2D Emotion Plane of Music Snippets')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.legend()

# Highlight the position of the generated snippet
generated_point = pca.transform(generated_snippet_scaled)
plt.scatter(generated_point[:, 0], generated_point[:, 1], c='black', marker='x', s=100, label='Generated Snippet')
plt.legend()
plt.show()


Files without labels: ['Super_Mario_64_SelectaFile.wav', 'Final_Fantasy_7_VincentsTheme.wav', 'Legend_Of_Zelda_The_Majoras_Mask_AlienInvasion.wav', 'Legend_Of_Zelda_The_Ocarina_Of_Time_InsideJabuJabusBelly.wav', 'Final_Fantasy_7_CostaDelSol.wav', 'Goldeneye_Runway.wav', 'Shadow_of_the_Colossus_TheTaleofthe16Sacrifices.wav', 'Shadow_of_the_Colossus_TheOpenedWay.wav', 'BanjoKazooie_GobisValley.wav', 'Legend_Of_Zelda_The_Ocarina_Of_Time_Ganondorf.wav', 'BanjoKazooie_BoggysIglooSad.wav', 'Final_Fantasy_7_JenovaAbsolute.wav', 'Final_Fantasy_7_AHighwindtakestotheSkies.wav', 'Final_Fantasy_7_LurkingInTheDarkness.wav', 'Legend_Of_Zelda_The_Ocarina_Of_Time_Introduction.wav', 'Super_Mario_World_TitleScreen.wav', 'Legend_Of_Zelda_The_Ocarina_of_Time_TempleofTime.wav', 'Legend_Of_Zelda_The_Majoras_Mask_DekuPalace.wav', 'Goldeneye_Caverns.wav', 'Final_Fantasy_7_JudgementDay.wav', 'Final_Fantasy_7_GoldSaucer.wav', 'Legend_Of_Zelda_The_Ocarina_Of_Time_SariasSong.wav', 'Final_Fantasy_7_CidsTheme.wav',

ValueError: The number of features and labels do not match. Please check the data.