In [14]:
import pandas as pd
import muspy
import os
extract_dir = "./EMOPIA_1.0"
label_path = os.path.join(extract_dir, "label.csv")
midi_dir = os.path.join(extract_dir, "midis")


df = pd.read_csv(label_path)

# Step 3: Load MIDI + event tokens
data = []
for _, row in df.iterrows():
    midi_file = row['ID'].strip() + ".mid"
    emotion = int(row['4Q'])
    midi_path = os.path.join(midi_dir, midi_file)
    
    if not os.path.exists(midi_path):
        continue

    try:
        score = muspy.read_midi(midi_path)
        tokens = muspy.to_event_representation(score, encode_velocity=True)
        data.append({
            "emotion": emotion,
            "tokens": tokens
        })
        if len(tokens) == 0: print("bad file")
    except Exception as e:
        print(f"Skipping {midi_file}: {e}")

In [5]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.2, stratify=[d['emotion'] for d in data], random_state=42)


In [6]:
from torch.utils.data import Dataset, DataLoader
import torch

class EmopiaDataset(Dataset):
    def __init__(self, data, max_len=512):
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        tokens = item["tokens"][:self.max_len - 1]
        x = [item["emotion"]] + tokens[:-1]
        y = tokens
        return torch.tensor(x), torch.tensor(y)

train_loader = DataLoader(EmopiaDataset(train_data), batch_size=8, shuffle=True)
test_loader = DataLoader(EmopiaDataset(test_data), batch_size=8)


In [7]:
import torch.nn as nn

class EmotionMusicModel(nn.Module):
    def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=4):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.transformer = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model, nhead), num_layers
        )
        self.pos_enc = nn.Parameter(torch.randn(512, d_model))
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embed(x) + self.pos_enc[:x.size(1)]
        x = x.transpose(0, 1)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(x.size(0)).to(x.device)
        x = self.transformer(x, x, tgt_mask=tgt_mask)
        x = x.transpose(0, 1)
        return self.out(x)


In [10]:
device = "cuda" if torch.cuda.is_available() else "cpu"

vocab_size = max(max(d['tokens']) for d in data) + 1
model = EmotionMusicModel(vocab_size=vocab_size)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

for epoch in range(10):
    model.train()
    total_loss = 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        pred = model(x)
        loss = criterion(pred.view(-1, pred.size(-1)), y.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}: loss={total_loss / len(train_loader):.4f}")

TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


In [None]:
def generate(emotion_token, model, max_len=300):
    model.eval()
    tokens = [emotion_token]
    with torch.no_grad():
        for _ in range(max_len):
            inp = torch.tensor(tokens).unsqueeze(0).to(device)
            out = model(inp)
            next_token = out[0, -1].argmax().item()
        