## **Survey: Evaluating AI-Generated Music Based on Emotions**  

1. **What mood would you like to hear in the music?** *(You can select multiple options)*  
   - 🎵 Joyful  
   - 🕊️ Calm  
   - 🔥 Energetic  
   - 😢 Sad  
   - 💭 Dreamy  
   - 😡 Aggressive  
   - Other (please specify) ______________  

2. **In what context would you listen to this music?**  
   - 🚶‍♂️ Walking  
   - 🎧 Background for work  
   - 🏃‍♀️ Sports / Activity  
   - 💤 Relaxation / Meditation  
   - 🎉 Party  
   - Other (please specify) ______________  

3. **Do you have a preferred music style?**  
   - Electronic  
   - Classical  
   - Rock  
   - Jazz  
   - Light instrumental  
   - No preference  
   - Other (please specify) ______________  

4. **Which instruments do you like?** *(You can select multiple options)*  
   - 🎸 Guitar  
   - 🎹 Piano  
   - 🎻 Violin / Strings  
   - 🥁 Drums  
   - 🎺 Brass / Wind instruments  
   - Synthesizers / Electronic sounds  
   - No preference  

---  


In [1]:
import os

def list_files(startpath):
    """
    Print the directory tree rooted at ``startpath``, one entry per line.

    Each directory is printed as ``name/`` and its files are listed below it;
    every nesting level adds four spaces of indentation.

    :param startpath: Directory to walk (relative or absolute, with or without
        a trailing path separator).
    """
    for root, _dirs, files in os.walk(startpath):
        # Derive the depth from the path relative to startpath. Stripping
        # startpath with str.replace() removed every occurrence of it and
        # miscounted the depth when startpath ended with a separator.
        relative = os.path.relpath(root, startpath)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath drops a trailing separator so the printed name is not empty.
        print('{}{}/'.format(indent, os.path.basename(os.path.normpath(root))))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))


# Print the tree of the notebook's current working directory.
startpath = "."
list_files(startpath=startpath)

./
    archive (1).zip
    processed_deam_data.csv
    project.ipynb
    project_DRUGOY.ipynb
    simple_model.pth
    DEAM/
        DEAM_Annotations/
            annotations/
                annotations averaged per song/
                    dynamic (per second annotations)/
                        arousal.csv
                        valence.csv
                    song_level/
                        static_annotations_averaged_songs_1_2000.csv
                        static_annotations_averaged_songs_2000_2058.csv
                annotations per each rater/
                    dynamic (per second annotations)/
                        arousal/
                            10.csv
                            1000.csv
                            1001.csv
                            1002.csv
                            1003.csv
                            1004.csv
                            1005.csv
                            1006.csv
                            1007.csv
                

In [None]:
import os
from pydub import AudioSegment

# Source folder with the processed (trimmed) clips and destination folder for
# the volume-normalized copies.
AUDIO_DIR = "DEAM/DEAM_audio/processed/"
OUTPUT_DIR = "DEAM/DEAM_audio/processed_normalized/"

# Make sure the destination folder exists before anything is exported into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Names of all entries directly inside AUDIO_DIR that end with ".mp3".
_entries = os.listdir(AUDIO_DIR)
audio_files = [name for name in _entries if name.endswith(".mp3")]

# Function to normalize audio files
def normalize_audio(file_path, output_path):
    """
    Normalize the volume of one MP3 file and save the result as MP3.

    :param file_path: Path of the MP3 file to read.
    :param output_path: Path the normalized MP3 is exported to.
    """
    # Decode the clip and normalize its volume in one chain.
    normalized_audio = AudioSegment.from_mp3(file_path).normalize()

    # Write the normalized clip back out in MP3 format.
    normalized_audio.export(output_path, format="mp3")

    # Confirm which file was written.
    print(f"Normalized and saved file: {output_path}")

# Normalize every collected clip; the output keeps the same file name, but
# lives in OUTPUT_DIR instead of AUDIO_DIR.
for file in audio_files:
    file_path, output_path = (
        os.path.join(folder, file) for folder in (AUDIO_DIR, OUTPUT_DIR)
    )
    normalize_audio(file_path, output_path)

# Report that the whole batch has been processed.
print("Normalization of all files is complete!")


In [None]:
# Function to normalize audio files
def normalize_audio(file_path, output_path):
    """
    Read an MP3 file, normalize its volume, and export it as MP3.

    NOTE: this cell repeats the ``normalize_audio`` definition from the
    previous cell and relies on ``AudioSegment`` imported there.

    :param file_path: Path of the MP3 file to load.
    :param output_path: Destination path for the normalized MP3.
    """
    source_clip = AudioSegment.from_mp3(file_path)
    leveled_clip = source_clip.normalize()
    leveled_clip.export(output_path, format="mp3")
    print(f"Normalized and saved file: {output_path}")

# Run the normalization over every file name collected in audio_files,
# reading from AUDIO_DIR and writing to OUTPUT_DIR.
for file in audio_files:
    file_path = os.path.join(AUDIO_DIR, file)
    output_path = os.path.join(OUTPUT_DIR, file)
    normalize_audio(file_path=file_path, output_path=output_path)

# Announce completion once the loop has finished.
print("Normalization of all files is complete!")


In [48]:
import torch
from transformers import AutoTokenizer, AutoModel

# Load the pretrained tokenizer and encoder once, when the cell is executed;
# both come from the same checkpoint name.
_BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = AutoModel.from_pretrained(_BERT_CHECKPOINT)

def encode_survey_response(survey_response):
    """
    Turn a survey response into a BERT embedding.

    The input is first collapsed into a single text: a tensor is converted to
    a Python list, a list made only of ints is decoded as token ids with the
    tokenizer, and any other non-string input has its items joined with
    spaces. That text is then run through the module-level ``bert_model``.

    :param survey_response: A string, a list of strings, or a tensor/list of token indices.
    :return: CPU tensor holding the last hidden state at token position 0
        (one row, since the input is collapsed into a single text).
    """
    # Run BERT on the GPU when one is available, otherwise on the CPU.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    bert_model.to(device)

    response = survey_response
    if isinstance(response, torch.Tensor):
        response = response.tolist()

    # A flat list of ints is treated as token ids and decoded back to text.
    if isinstance(response, list) and all(isinstance(token, int) for token in response):
        response = tokenizer.decode(response)

    if isinstance(response, str):
        input_text = response
    else:
        input_text = " ".join(str(part) for part in response)

    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(device)

    # Inference only: no gradients are needed for the frozen encoder pass.
    with torch.no_grad():
        model_output = bert_model(**encoded)

    first_token_state = model_output.last_hidden_state[:, 0, :]
    return first_token_state.cpu()


In [38]:
import pandas as pd
import torchaudio

def load_audio_features(file_path):
    """
    Read a CSV file of audio features (e.g. arousal, valence) into pandas.

    :param file_path: Path to the CSV file containing audio features.
    :return: DataFrame with the parsed contents of the CSV file.
    """
    features_table = pd.read_csv(file_path)
    return features_table

def extract_audio_features(audio_file_path):
    """
    Compute a Mel spectrogram for an audio file.

    :param audio_file_path: Path to the audio file.
    :return: Mel spectrogram tensor produced by torchaudio for the loaded waveform.
    """
    waveform, sample_rate = torchaudio.load(audio_file_path)

    # Build the transform for this file's sample rate, then apply it.
    to_mel = torchaudio.transforms.MelSpectrogram(sample_rate)
    return to_mel(waveform)


In [39]:
import torch.nn as nn
import torch.optim as optim

class MusicGenerator(nn.Module):
    """
    Map a user embedding plus audio features to a vector of generated features.

    The two inputs are concatenated along dim 1, projected to ``hidden_size``,
    passed through an ``nn.Transformer`` and projected to ``output_size``.

    :param input_size: Width of the user embedding. When ``audio_feature_size``
        is left at 0 this must already equal the combined width of both inputs.
    :param hidden_size: Transformer model width; must be divisible by 8 (nhead=8).
    :param output_size: Width of the generated feature vector.
    :param audio_feature_size: Width of the audio features concatenated to the
        user embedding in ``forward``. Defaults to 0, which keeps the first
        layer's input width equal to ``input_size``.
    """

    def __init__(self, input_size, hidden_size, output_size, audio_feature_size=0):
        super().__init__()

        # forward() concatenates both inputs on dim=1, so the first layer has
        # to accept their combined width.
        self.fc1 = nn.Linear(input_size + audio_feature_size, hidden_size)
        self.transformer = nn.Transformer(d_model=hidden_size, nhead=8, num_encoder_layers=6)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, user_embedding, audio_features):
        """
        Generate features for a batch of (user embedding, audio features) pairs.

        :param user_embedding: Tensor whose dim 1 holds the embedding features.
        :param audio_features: Tensor whose dim 1 holds the audio features; all
            other dimensions must match ``user_embedding``.
        :return: Tensor with ``output_size`` features in its last dimension.
        """
        # Combine user embedding and audio features
        x = torch.cat((user_embedding, audio_features), dim=1)
        x = torch.relu(self.fc1(x))

        if x.dim() == 2:
            # nn.Transformer (batch_first=False) would read a 2-D input as ONE
            # unbatched sequence, letting the samples of a batch attend to each
            # other. Present each sample as its own length-1 sequence instead.
            x = x.unsqueeze(0)
            x = self.transformer(x, x).squeeze(0)
        else:
            x = self.transformer(x, x)

        return self.fc2(x)


In [40]:
import torch
import torchaudio
import torch.nn.functional as F

def extract_audio_features(audio_file_path, num_frames=None):
    """
    Extract a Mel spectrogram from an audio file.

    :param audio_file_path: Path to the audio file.
    :param num_frames: Optional fixed size for the spectrogram's last axis
        (the time-frame axis of torchaudio's MelSpectrogram output). When
        given, the spectrogram is zero-padded or truncated to exactly this
        many frames, so clips of different durations can be stacked into one
        batch. When None (default), a single zero frame is appended, which is
        the previous behaviour.
    :return: Tensor of audio features.
    :raises ValueError: If ``num_frames`` is negative.
    """
    waveform, sample_rate = torchaudio.load(audio_file_path)

    # Extracting Mel Spectrogram as an example of features
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(sample_rate)(waveform)

    if num_frames is None:
        # Previous behaviour: append one zero frame on the last axis. This does
        # NOT normalize the values or equalize lengths across files.
        return F.pad(mel_spectrogram, (0, 1), mode='constant', value=0)

    if num_frames < 0:
        raise ValueError(f"num_frames must be non-negative, got {num_frames}")

    missing = num_frames - mel_spectrogram.shape[-1]
    if missing > 0:
        # Too short: zero-pad at the end of the last axis.
        return F.pad(mel_spectrogram, (0, missing), mode='constant', value=0)
    # Long enough: keep only the first num_frames frames.
    return mel_spectrogram[..., :num_frames]


In [41]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class MusicSurveyDataset(Dataset):
    """Pair each survey response, by position, with the features of one audio file."""

    def __init__(self, survey_responses, audio_files):
        """
        Custom Dataset for loading survey responses and corresponding audio features.
        :param survey_responses: List of survey responses (strings or lists of strings).
        :param audio_files: A list of paths to audio files; item ``i`` belongs to
            survey response ``i``. Extra trailing audio files are ignored.
        :raises ValueError: If there are fewer audio files than survey responses.
        """
        # The dataset length follows survey_responses, so a shorter audio list
        # would only fail later with an IndexError inside __getitem__.
        if len(audio_files) < len(survey_responses):
            raise ValueError(
                f"Expected one audio file per survey response, got "
                f"{len(survey_responses)} responses but only {len(audio_files)} audio files"
            )
        self.survey_responses = survey_responses
        self.audio_files = audio_files

    def __len__(self):
        """Return the number of survey responses."""
        return len(self.survey_responses)

    def __getitem__(self, idx):
        """
        Build one training pair.
        :param idx: Position of the survey response / audio file pair.
        :return: Tuple ``(user_embedding, audio_features)``.
        """
        # Get the survey responses
        survey_response = self.survey_responses[idx]

        # Compute the features from the audio file itself (not from a CSV)
        audio_features = extract_audio_features(self.audio_files[idx])

        # Get BERT embeddings for the survey response
        user_embedding = encode_survey_response(survey_response)

        return user_embedding, audio_features


In [42]:
# Example survey answers; each one is paired by position with one of the
# normalized MP3 files listed below.
survey_responses = ["Energetic", "Calm", "Sad"]

audio_files = [
    "DEAM/DEAM_audio/processed_normalized/10.mp3",
    "DEAM/DEAM_audio/processed_normalized/1000.mp3",
    "DEAM/DEAM_audio/processed_normalized/1001.mp3",
]

# Wrap the pairs in a Dataset and serve them in shuffled batches.
dataset = MusicSurveyDataset(survey_responses=survey_responses, audio_files=audio_files)
data_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)



In [None]:
# For now I have some problems with training the model, but I'm working on it
def train(model, data_loader, optimizer, criterion, num_epochs=10):
    """
    Train ``model`` to reproduce the audio features of each batch.

    :param model: Module called as ``model(user_embedding, audio_features)``.
    :param data_loader: Iterable of ``(user_response, audio_features)`` batches.
        ``user_response`` may already be an embedding (a floating-point
        tensor) or a raw response that still has to be encoded.
    :param optimizer: Optimizer that updates the model's parameters.
    :param criterion: Loss called as ``criterion(generated_features, audio_features)``.
    :param num_epochs: Number of passes over ``data_loader``.
    """
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        num_batches = 0
        for user_response, audio_features in data_loader:
            # A floating-point tensor is already an embedding; encoding it
            # again would turn the numbers into text and embed that text.
            if torch.is_tensor(user_response) and user_response.is_floating_point():
                user_embedding = user_response
            else:
                user_embedding = encode_survey_response(user_response)

            # Pass through the model
            generated_features = model(user_embedding, audio_features)

            # Compute loss
            loss = criterion(generated_features, audio_features)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Nothing was trained on; there is no loss to report.
            print(f"Epoch {epoch+1}: data_loader produced no batches")
            continue

        # Report the mean loss over the epoch rather than the last batch only.
        print(f"Epoch {epoch+1}, Loss: {total_loss / num_batches}")

# Layer widths: BERT embedding size (depends on the BERT configuration),
# hidden size, and an example output size for the audio features.
input_size, hidden_size, output_size = 768, 512, 256

# Build the model together with its optimizer and loss function.
model = MusicGenerator(input_size=input_size, hidden_size=hidden_size, output_size=output_size)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Start training; data_loader must already exist from an earlier cell.
train(model=model, data_loader=data_loader, optimizer=optimizer, criterion=criterion)


In [None]:
def generate_music(model, survey_response):
    """
    Generate audio for a survey response with a trained model.

    :param model: Module called as ``model(user_embedding, audio_features)``.
    :param survey_response: Response passed to ``encode_survey_response``.
    :return: Whatever ``features_to_audio`` returns for the generated features.
    """
    model.eval()

    # Encode user input
    user_embedding = encode_survey_response(survey_response)

    # Inference only: skip autograd bookkeeping so no graph is built or kept.
    with torch.no_grad():
        # No initial audio features are available, so zeros with the
        # embedding's shape stand in for them.
        generated_features = model(user_embedding, torch.zeros_like(user_embedding))

    # Convert the generated features into audio waveform (e.g., by inverse transformation)
    # NOTE(review): features_to_audio is a placeholder and is not defined in
    # the visible notebook; this call fails until it is implemented.
    generated_audio = features_to_audio(generated_features)
    return generated_audio


Future work:
Once the model produces audio features, they still have to be converted back into an audio signal (a waveform). This is a non-trivial task: it requires a method that maps the features back to audio, such as WaveNet or another generative model designed for audio synthesis.