In [1]:
#!pip install librosa

In [2]:
import numpy as np
import pickle
import sys
import os
import librosa
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
SOUND_SAMPLE_LENGTH = 3000000

HAMMING_SIZE = 100
HAMMING_STRIDE = 40
def preprocessingAudio(audioPath):
    print ('Prepossessing ' + audioPath)

   
    y, sr = librosa.load(audioPath)

    # Let's make and display a mel-scaled power (energy-squared) spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)

    # Convert to log scale (dB). We'll use the peak power as reference.
    log_S = librosa.power_to_db(S, ref=np.max)

    mfcc = librosa.feature.mfcc(S=log_S, sr=sr, n_mfcc=13)
    # featuresArray.append(mfcc)

    
    spectrogram_db = librosa.power_to_db(S, ref=np.max)
#     plt.figure(figsize=(10, 6))
#     librosa.display.specshow(spectrogram_db, y_axis='mel', x_axis='time', sr=sr, hop_length=512)
#     plt.colorbar(format='%+2.0f dB')
#     plt.title('Mel Spectrogram')
#     plt.xlabel('Time')
#     plt.ylabel('Frequency')
#     plt.tight_layout()
#     plt.show()
    return spectrogram_db[:, :2500]


   

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import ImageFolder

In [5]:
class SpectrogramRegressor(nn.Module):
    def __init__(self):
        super(SpectrogramRegressor, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(32 * (input_height // 4) * (input_width // 4), 128),
            nn.ReLU(),
            nn.Linear(128, 6)  # Regression output is a single value
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        x = self.fc_layers(x)
        return x

In [6]:
class CustomSpectrogramDataset(Dataset):
    def __init__(self, images, values, transform=None):
        self.images = images
        self.values = values
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        value = self.values[idx]

        if self.transform:
            image = self.transform(image)

        return image, value

In [7]:
# Hyperparameters
batch_size = 1
learning_rate = .001
input_height, input_width = (128,2500)  # Dimensions of your spectrogram images

# Create the model instance
model = SpectrogramRegressor()

# Define loss function and optimizer for regression
criterion = nn.MSELoss()  # Mean Squared Error loss for regression
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Load and preprocess dataset
transform = transforms.Compose([
    transforms.Resize((input_height, input_width)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [8]:
filepath = "Excerpts7.csv"
df = pd.read_csv(filepath,header = 0)
print(df)
list_of_intonations = []
list_of_spectrograms = []
for soundfile in os.listdir(os.getcwd()):
    if soundfile[-3:] == "mp3":
        print(soundfile)
        file_prefix = soundfile[:soundfile.index(".")]
        print(file_prefix)
        index = df.index[df['Name'] == file_prefix].tolist()[0]
        intonation=df.iloc[index]['Intonation']
        dynamics=df.iloc[index]['Dynamics']
        note_accuracy=df.iloc[index]['Note_Accuracy']
        expressiveness=df.iloc[index]['Expressiveness']
        articulation=df.iloc[index]['Articulation']
        tone_quality=df.iloc[index]['Tone_Quality']
            
        print(intonation)
        list_of_intonations += [[intonation,dynamics,note_accuracy,expressiveness,articulation,tone_quality]]
        spectrogram = preprocessingAudio(soundfile)
        #print(spectrogram)
        #print(spectrogram.shape)
        list_of_spectrograms += [spectrogram]
stacked_arrays = np.stack(list_of_spectrograms)
print(stacked_arrays.shape)
list_of_intonations = np.array(list_of_intonations)
print(list_of_intonations.shape)
    #prepossessingAudio("Daphnis1.mp3")

         Name  Intonation  Dynamics  Note_Accuracy  Expressiveness  \
0    Daphnis1           8         7              4               7   
1    Daphnis2           9         8              6              10   
2    Daphnis3           5         3              5               2   
3    Daphnis4           6         3              5               8   
4    Daphnis5           7         6             10               7   
5    Daphnis6           9        10             10              10   
6    Daphnis7           8         7              6               8   
7    Daphnis8          10        10             10              10   
8    Daphnis9           9        10             10               8   
9   Daphnis10           8        10             10               7   
10  Daphnis11           4         3              5               2   
11  Daphnis12          10         9             10              10   
12  Daphnis13           7         5              8               4   
13  Daphnis14       

In [9]:
# Transform numpy arrays to tensors
images = torch.tensor(stacked_arrays, dtype=torch.float32)
values = torch.tensor(list_of_intonations, dtype=torch.float32)

# Create a custom dataset
transform = transforms.Compose([
    transforms.ToPILImage(),  # Convert tensor to PIL image
    transforms.Resize((input_height, input_width)),
    transforms.ToTensor(),
    #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

dataset = CustomSpectrogramDataset(images, values, transform=transform)
from torch.utils.data import random_split
dataset_size = len(dataset)
train_size = int(0.7 * dataset_size)  # 70% for training
val_size = int(0.15 * dataset_size)   # 15% for validation
test_size = dataset_size - train_size - val_size  # Remaining for testing
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [10]:
# Training loop
num_epochs = 20
min_val_loss = 1000000
for epoch in range(num_epochs):
    training_losses = []
    validation_losses = []
    for images, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels.float())  # Convert labels to float for regression
        loss.backward()
        optimizer.step()
        training_losses += [loss.item()]
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {sum(training_losses)/len(training_losses):.4f}')
    for images,labels in val_dataloader:
        outputs = model(images)
        loss = criterion(outputs, labels.float())  # Convert labels to float for regression
        validation_losses += [loss.item()]
    avg_val_loss = sum(validation_losses)/len(validation_losses)
    if avg_val_loss < min_val_loss:
        min_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'spectrogram_regressor.pth')
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Validated loss: {sum(validation_losses)/len(validation_losses):.4f}') 
# Save the trained model
#torch.save(model.state_dict(), 'spectrogram_regressor.pth')

Epoch [1/20], Loss: 38.5839
Epoch [1/20], Validated loss: 7.0685
Epoch [2/20], Loss: 6.2680
Epoch [2/20], Validated loss: 6.9386
Epoch [3/20], Loss: 4.8675
Epoch [3/20], Validated loss: 6.0951
Epoch [4/20], Loss: 5.6073
Epoch [4/20], Validated loss: 6.5643
Epoch [5/20], Loss: 4.5337
Epoch [5/20], Validated loss: 7.1646
Epoch [6/20], Loss: 4.8210
Epoch [6/20], Validated loss: 10.4092
Epoch [7/20], Loss: 3.3327
Epoch [7/20], Validated loss: 7.5238
Epoch [8/20], Loss: 1.6142
Epoch [8/20], Validated loss: 9.7218
Epoch [9/20], Loss: 1.3046
Epoch [9/20], Validated loss: 7.0423
Epoch [10/20], Loss: 1.1593
Epoch [10/20], Validated loss: 8.1491
Epoch [11/20], Loss: 0.8485
Epoch [11/20], Validated loss: 8.6935
Epoch [12/20], Loss: 0.4695
Epoch [12/20], Validated loss: 9.1516
Epoch [13/20], Loss: 0.5355
Epoch [13/20], Validated loss: 9.6553
Epoch [14/20], Loss: 0.2532
Epoch [14/20], Validated loss: 10.2735
Epoch [15/20], Loss: 0.3464
Epoch [15/20], Validated loss: 7.9627
Epoch [16/20], Loss: 0.40

In [11]:
intonation_error = []
dynamics_error = []
note_accuracy_error = []
express_error = []
articulation_error = []
tone_quality_error = []
for images, labels in test_dataloader:
        outputs = model(images)
        outputs_lists = outputs.tolist()
        labels_list = labels.tolist()
        intonation_error += [abs(outputs_lists[0][0]-labels_list[0][0])]
        dynamics_error = [abs(outputs_lists[0][1]-labels_list[0][1])]
        note_accuracy_error = [abs(outputs_lists[0][2]-labels_list[0][2])]
        express_error = [abs(outputs_lists[0][3]-labels_list[0][3])]
        articulation_error = [abs(outputs_lists[0][4]-labels_list[0][4])]
        tone_quality_error = [abs(outputs_lists[0][5]-labels_list[0][5])]
        print(outputs)
        print(labels)
avg_intonation_error = np.mean(intonation_error)
avg_dynamics_error = np.mean(dynamics_error)
avg_note_accuracy_error = np.mean(note_accuracy_error)
avg_express_error=np.mean(express_error)
avg_articulation_error=np.mean(articulation_error)
avg_tone_quality_error =np.mean(tone_quality_error)



tensor([[7.5589, 7.7046, 7.8953, 6.9671, 7.8390, 6.6785]],
       grad_fn=<AddmmBackward0>)
tensor([[10., 10., 10.,  9., 10., 10.]])
tensor([[7.3837, 7.5762, 7.9376, 6.6399, 7.7180, 6.5907]],
       grad_fn=<AddmmBackward0>)
tensor([[4., 3., 5., 2., 6., 2.]])
tensor([[7.1121, 7.4152, 7.9709, 6.8275, 7.8637, 6.4789]],
       grad_fn=<AddmmBackward0>)
tensor([[7., 7., 9., 8., 8., 8.]])
tensor([[7.3874, 7.5207, 8.0014, 6.6458, 7.7444, 6.5985]],
       grad_fn=<AddmmBackward0>)
tensor([[ 9., 10., 10., 10., 10., 10.]])
tensor([[7.4038, 7.6762, 8.0885, 6.8938, 8.0887, 6.8910]],
       grad_fn=<AddmmBackward0>)
tensor([[ 9., 10., 10.,  8., 10.,  6.]])


In [12]:
!pip install openai



In [13]:
import openai

openai.api_key = "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"
messages = [
 
    {"role": "system", "content": """You are a helpful and kind AI Assistant. I will be giving you 6 values. the first is intonation, the second is dynamics, the third is note accuracy, the fourth is expressiveness,
     the fifth is articulation, the sixth is tone quality. I am a flute performer playing the flute solo from rehersal number 176-179 in Daphnis et Chloe by Ravel. This numbers are evaluations of my playing out of 10. Give me feeback on how I can improve my performance of this piece in each category relative to the score I recieved. Please provide references to this composition in your feedback. Be very specific to the excerpt such as maintaining good pitch in the high register or being able to remain quiet even in the high register. If the intonation is low mention to watch the pitch on the high G# because many players tend to play it sharp. If the tone quality is below a 7 mention to keep a round ambechure and open throat, and mention to have a more spinning vibrato. """},
]
def chatbot(input):
    if input:
        messages.append({"role": "user", "content": input})
        chat = openai.ChatCompletion.create(
            model="gpt-3.5-turbo", messages=messages
        )
        reply = chat.choices[0].message.content
        messages.append({"role": "assistant", "content": reply})
        return reply

In [14]:
answer = chatbot(str(outputs_lists))

In [15]:
print(answer)

Intonation (7.40/10):
Your intonation is generally good, but there are a few areas where you can improve. In measures 177-178, there are several high G#s that need to be played with caution. Make sure to watch the pitch on these notes, as many players tend to play them sharp. Use your embouchure and air support to bring these notes in tune with the rest of the ensemble. Pay attention to the overall pitch tendencies in this section and make necessary adjustments to maintain good intonation.

Dynamics (7.68/10):
Your dynamics are well-controlled for the most part, but there are opportunities to bring out more contrast in this section. In measures 176-177, there is a piano marking, indicating a soft dynamic. Ensure that your sound remains controlled and delicate throughout these measures while still maintaining a clear tone. In measure 178, where the music begins to crescendo, make sure to gradually increase the volume while keeping the tone quality consistent. Pay attention to the marked