# Emotion Conditioned Music Generation
This notebook provides the code for implementing a Transformer-GAN for the dissertation. The objective of the model is to produce sentimental music given an input emotion

## ADDING EMOTION LOSS

## TODOs

1. Implement CP word transformer

2. Add emotions to discriminators

3. Change loss functions

4. Run more epochs

5. Use BERT embeddings

6. TSNE visualisation

7. Padding with last token instead of 0

## Importing libraries

In [1]:
# !pip install music21 miditoolkit miditok

In [None]:
# %pip install --user torch==1.7.0 torchvision==0.8.1 -f https://download.pytorch.org/whl/cu102/torch_stable.html

In [1]:
%pip install torch torchvision

Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [1]:
import numpy as np 
import pandas as pd 
from io import open
import tensorflow as tf
import glob
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import time
from miditok import get_midi_programs, REMI, CPWord
from miditoolkit import MidiFile
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split

In [2]:
torch.__version__

'1.7.0'

In [6]:
device = 'cpu'

In [4]:
torch.cuda.empty_cache()

In [5]:
torch.cuda.is_available()

True

In [6]:
# Seed
# seed = 22
# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)
# np.random.seed(seed)
# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.deterministic = True

## Loading the Dataset

In [7]:
# how a midi file looks like
midi = MidiFile('archive/EMOPIA_1.0 (1)/EMOPIA_1.0/midis/Q1__8v0MFBZoco_0.mid')
midi

ticks per beat: 384
max tick: 46051
tempo changes: 1
time sig: 1
key sig: 0
markers: 0
lyrics: False
instruments: 1

In [8]:
# for now, we will only be using for piano right since it determines the melody
midi.instruments

[Instrument(program=0, is_drum=False, name="")]

In [9]:
# file path to the MIDI files
files_paths = list(glob.glob('archive/EMOPIA_1.0 (1)/EMOPIA_1.0/midis/*.mid'))
# reading labels
labels_df = pd.read_csv('archive/EMOPIA_1.0 (1)/EMOPIA_1.0/label.csv')
labels_df = list(labels_df['4Q'])

In [10]:
import muspy

In [11]:
tempos = []
for file in files_paths:
    # print(file)
    music = muspy.read_midi(file)
    print(music.tempos)
    tempos.append(music.tempos[0].qpm)

[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[Tempo(time=0, qpm=120.0)]
[

In [12]:
torch.unique(torch.Tensor(tempos))

tensor([120.])

In [13]:
additional_tokens = {'Chord': True, 'Rest': True, 'Tempo': True, 'Program': False,
                     'rest_range': (2, 8),  # (half, 8 beats)
                     'nb_tempos': 32,  # nb of tempo bins
                     'tempo_range': (40, 250),
                     'TimeSignature':None}  # (min, max)

In [14]:
# encoder = CPWord()

In [15]:
# create a list of notes
# this stores the REMI encoded tokens of the midi files

def load_files(files_paths, encoder = REMI(additional_tokens)):
    assert len(files_paths) > 0
    notes = []


    for file in files_paths:
        # file_name = os.path.basename(file)

        # read the MIDI file
        midi = MidiFile(file)

        # Converts MIDI to tokens
        tokens = encoder.midi_to_tokens(midi)
        
        # The EMOPIA dataset has midi files with only one instrument, i.e. the piano 
        # hence we just add those tokens
        # print(tokens)
        notes.append(tokens[0])

    return notes, encoder

In [16]:
notes, cp_enc = load_files(files_paths, CPWord(additional_tokens = additional_tokens))

In [17]:
print("There are",len(cp_enc.vocab),"unique tokens in the files")

There are 285 unique tokens in the files


Adding emotions as an extra type of family, this will help with notes with just the emotion

In [18]:
# Create a dataset corpus from the notes and labels

class REMICorpus(object):
    def __init__(self, notes, labels, encoder, seq_length, split_size = 0.2):
        self.encoder = encoder
        self.seq_len = seq_length

        # ntrain, ntest, ltrain, ltest = train_test_split(notes, labels, test_size=split_size, random_state=42, shuffle=True, stratify=labels)
 
        self.xtrain, self.xtrainencoded, self.raw_to_enc, self.enc_to_raw = self.tokenize(notes, labels)
        # self.xvalid = self.tokenize(ntest, ltest)
    
    def __len__(self):
        return len(self.encoder.vocab)
    
    def tokenize(self, notes, labels):
        assert len(notes) > 0
        assert len(labels) > 0

        # create a set of notes
        # they should all be padded to have sequence of len seq_len
        songss = []
        labelss = []

        for song, label in zip(notes, labels):
            song = torch.tensor(song).type(torch.int64)
            songs = list(song.split(self.seq_len))

            for i in range(len(songs)):
                # removing sequences that have < seq len/4 tokens
                if len(songs[i]) < self.seq_len/4:
                    del songs[i]
                    continue
                labelss.append(label-1)
            songss.extend(songs)
        
        # padding songs to be of same length
        songs = pad_sequence(songss)

        corpus = []

        # adding emotion values to the sequences
        for song, label in zip(songs.view(songs.size(1), songs.size(0), songs.size(2)), labelss):
            l = torch.full((self.seq_len,1), label)
            inp = torch.cat([song, l], dim=-1)
            corpus.append(inp)

        corpus = torch.stack(corpus)

        # creates the range of each type of token
        # for eg. family is [0, 2, 3]
        token_ranges = [corpus[:,:,i].squeeze().unique() for i in range(8)]
        
        # creates a reverse dictionary for each token
        # for eg. family is {0: 0, 2: 1, 3: 2}
        token_dicts = [dict(zip(tokens.tolist(), range(len(tokens)))) for tokens in token_ranges]

        new_corpus = corpus.clone().detach()
        for i in range(len(corpus)):
            for k in range(8):
                new_corpus[i,:,k] = torch.tensor([token_dicts[k][l.item()] for l in corpus[i,:,k]])
            

        # converting all the tokens in each type to new values:


        return corpus, new_corpus, token_ranges, token_dicts

In [19]:
corpus = REMICorpus(notes, labels_df, cp_enc, 101, split_size=0.01)

In [20]:
raw_data = corpus.xtrain.to(device)
train_data = corpus.xtrainencoded.to(device)

tokens_to_raw = corpus.raw_to_enc
raw_to_tokens = corpus.enc_to_raw

# train_emo = corpus.ytrain.to(device)
# val_emo = corpus.yvalid.to(device)

print("X train data shape:", train_data.shape)
# print("emo train data shape:", train_emo.shape)
print("X valid data shape:", raw_data.shape)
# print("emo valid data shape:", val_emo.shape)

X train data shape: torch.Size([4823, 101, 9])
X valid data shape: torch.Size([4823, 101, 9])


In [21]:
print("There are total",len(notes), "songs and a total of", train_data.shape[0], "sequences extracted")

There are total 1078 songs and a total of 4823 sequences extracted


In [22]:
print("There are",len(corpus), "unique tokens")

There are 285 unique tokens


In [23]:
train_data[77,:,1]

tensor([ 2, 27,  2,  2, 11,  2,  9, 20,  2,  7,  2,  2, 32,  2,  2, 14,  2,  7,
         3,  2, 10,  2, 27,  1, 11, 34, 19,  9,  2, 31,  8,  4,  2,  2, 17,  4,
         2,  2,  2,  8,  2, 26,  6, 12, 11,  2,  8,  2, 26, 20,  2,  2,  3, 20,
         3,  2,  2,  3,  2,  2,  3,  5, 26,  9,  1, 12,  2,  2,  5, 20,  2,  5,
         2,  2,  2,  8,  2, 18,  2,  4,  2,  9, 21,  6,  2,  2,  2,  4,  7, 19,
        12,  2,  2, 11,  2,  4,  2,  2, 10,  2,  4])

In [24]:
ntokens = []
for i in range(9):
    # getting the unique values in the type
    # and the maximum value
    # we dont use length because for eg. the first type is : [0,2,3] so len is 3 but these range from 0-3 (4)
    ntokens.append(len(train_data[:,:,i].squeeze().unique()))

In [25]:
ntokens

[3, 35, 86, 32, 66, 16, 11, 3, 4]

## Model Building

### Constants

In [26]:
# NEW for every type of token: corpus and emotion
# ntokens = [len(corpus), 4]

emsize = 512
nhead = 4


nhid = 512
nlayer = 8
dropout = 0.2
# Loop over epochs.
lr = 0.0001
best_val_loss = None
epochs = 2
save = './model.pt'
criterion = nn.CrossEntropyLoss()
device = device


In [27]:
emsize

512

### Position Encoding

In [28]:
# adapted from the pytorch positional encoding class
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # PE is the Positional Encoding matrix 
        # THIS STORES THE POSITIONS OF THE SEQUENCE
        pe = torch.zeros(max_len, d_model)

        # Arange - RETURNS A RANGE BETWEEN VALUES, HERE IT IS 0 - max_len
        # unsqueeze - adds a dimension, 1 means that each element in the first list is now in a list
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # division term, here it is (10000 ** ((2 * i)/d_model))
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # calculating the position encoding for the even and odd terms        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Unsqueeze 0 will put PE in one list
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # make embeddings relatively larger
        # This is so we do not lose the importance of the embedding
        # we add the embedding to the PE 
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [29]:
class Generator(nn.Module):
    """Container module with an encoder, a recurrent or transformer module, and a decoder."""

    def __init__(self, ntoken, d_model, nhead, nlayers, dropout=0.5, max_length = 2048, device = device):
        super(Generator, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')

        # original mask
        self.src_mask = None
        self.max_length = max_length
        self.d_model = d_model
        self.nlayers = nlayers
        self.ntokens = ntoken

        self.device = device

        # NEW criterion and embedding size
        self.criterion = nn.CrossEntropyLoss(reduction='none')
        # CHANGED: using embedding size and reshaping vector
        self.embed_siz = [32, 64, 128, 128, 512, 128, 128, 128, 128]

        # embedding encoding
        self.embedding_family  = nn.Embedding(self.ntokens[0], self.embed_siz[0])
        self.embedding_bar  = nn.Embedding(self.ntokens[1], self.embed_siz[1])
        self.embedding_pitch  = nn.Embedding(self.ntokens[2], self.embed_siz[2])
        self.embedding_velocity  = nn.Embedding(self.ntokens[3], self.embed_siz[3])
        self.embedding_duration  = nn.Embedding(self.ntokens[4], self.embed_siz[4])
        self.embedding_chord  = nn.Embedding(self.ntokens[5], self.embed_siz[5])
        self.embedding_rest  = nn.Embedding(self.ntokens[6], self.embed_siz[6])
        self.embedding_tempo  = nn.Embedding(self.ntokens[7], self.embed_siz[7])
        self.embedding_emotion   = nn.Embedding(self.ntokens[8], self.embed_siz[8])

        self.in_linear = nn.Linear(np.sum(self.embed_siz), self.d_model)
        
        # positional encoding
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # in linear layer
        # CHANGED: using this to convert one hot encoding of emotions batch * 5 -> linear transformation of emotions batch * 
        # TODO
        self.linear = nn.Linear(np.sum(self.embed_siz), self.d_model)
        
        # encoder
        encoder_layer = TransformerEncoderLayer(d_model = d_model, nhead = nhead, dropout = dropout)
        self.encoder = TransformerEncoder(encoder_layer, nlayers)

        # output layers
        self.project_family = nn.Linear(d_model, ntoken[0])
        self.project_bar = nn.Linear(d_model, ntoken[1])
        self.project_pitch = nn.Linear(d_model, ntoken[2])
        self.project_velocity = nn.Linear(d_model, ntoken[3])
        self.project_duration = nn.Linear(d_model, ntoken[4])
        self.project_chord = nn.Linear(d_model, ntoken[5])
        self.project_rest = nn.Linear(d_model, ntoken[6])
        self.project_tempo = nn.Linear(d_model, ntoken[7])
        self.project_emo = nn.Linear(d_model, ntoken[8])

        # size is the d model plus the type
        self.proj_cat = nn.Linear(d_model + self.embed_siz[0], d_model)
        
        
        self.init_weights()
    
    def compute_loss(self, predict, target):
        loss = self.criterion(predict, target)
        return torch.sum(loss)
            

    def _generate_square_subsequent_mask(self, sz):
        mask = torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
        return mask

    def init_weights(self):
        initrange = 0.1

        nn.init.uniform_(self.embedding_family.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_bar.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_pitch.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_velocity.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_duration.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_chord.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_rest.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_tempo.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_emotion.weight, -initrange, initrange)
     
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.project_family.bias.data.zero_()
        self.project_family.weight.data.uniform_(-initrange, initrange)
        self.project_bar.bias.data.zero_()
        self.project_bar.weight.data.uniform_(-initrange, initrange)
        self.project_pitch.bias.data.zero_()
        self.project_pitch.weight.data.uniform_(-initrange, initrange)
        self.project_velocity.bias.data.zero_()
        self.project_velocity.weight.data.uniform_(-initrange, initrange)
        self.project_duration.bias.data.zero_()
        self.project_duration.weight.data.uniform_(-initrange, initrange)
        self.project_chord.bias.data.zero_()
        self.project_chord.weight.data.uniform_(-initrange, initrange)
        self.project_rest.bias.data.zero_()
        self.project_rest.weight.data.uniform_(-initrange, initrange)
        self.project_tempo.bias.data.zero_()
        self.project_tempo.weight.data.uniform_(-initrange, initrange)
        self.project_emo.bias.data.zero_()
        self.project_emo.weight.data.uniform_(-initrange, initrange)

    def forward(self, x_family, x_bar, x_pitch, x_velocity, x_duration, x_chord ,x_rest, x_tempo, x_emo, src_mask):
        # creating embedding for all tokens and emotions
        x_family = self.embedding_family(x_family)
        x_bar = self.embedding_bar(x_bar)
        x_pitch = self.embedding_pitch(x_pitch)
        x_velocity = self.embedding_velocity(x_velocity)
        x_duration = self.embedding_duration(x_duration)
        x_chord = self.embedding_chord(x_chord)
        x_rest = self.embedding_rest(x_rest)
        x_tempo = self.embedding_tempo(x_tempo)
        # print(x_emo.shape)
        # print(x_emo)
        x_emo = self.embedding_emotion(x_emo)

        # normalising the input for the position encoding
        x_family = x_family * math.sqrt(self.d_model)
        x_bar = x_bar * math.sqrt(self.d_model)
        x_pitch = x_pitch * math.sqrt(self.d_model)
        x_velocity = x_velocity * math.sqrt(self.d_model)
        x_duration = x_duration * math.sqrt(self.d_model)
        x_chord = x_chord * math.sqrt(self.d_model)
        x_rest = x_rest * math.sqrt(self.d_model)
        x_tempo = x_tempo * math.sqrt(self.d_model)
        x_emo = x_emo * math.sqrt(self.d_model)

        # concatenating as one input
        x = torch.cat([x_family, x_bar, x_pitch, x_velocity, x_duration, x_chord, x_rest, x_tempo, x_emo], dim=-1)

        # sending through linear layer
        x = self.in_linear(x)

        x = self.pos_encoder(x)

        # print(x.shape)
        # print(x.view(x.size(1), x.size(0), x.size(2)).shape)
        # print(x)
        # print()

        if src_mask == None:
            src_mask = self._generate_square_subsequent_mask(x.size(1)).to(self.device)
            
        self.src_mask = src_mask

        output = self.encoder(x.view(x.size(1), x.size(0), x.size(2)), self.src_mask)

        y_family = self.project_family(output)

        type_prob = F.softmax(y_family, dim=-1)
        # y_type = torch.multinomial(type_prob[-1].squeeze().exp(), 1)

        # print('y type shape before', y_type.shape)
        n,s,t = type_prob.shape
        y_type = torch.multinomial(type_prob.view(-1, t), 1, replacement=True).view(n, s)
        # print('y type shape after', y_type.shape)

        tf_skip_family = self.embedding_family(y_type)

        # print(output.view(output.size(1), output.size(0), output.size(2)).shape, tf_skip_family.shape)
        y_concat_family = torch.cat([output, tf_skip_family], dim=-1)
        y_ = self.proj_cat(y_concat_family)

        y_bar = self.project_bar(y_)
        y_pitch = self.project_pitch(y_)
        y_velocity = self.project_velocity(y_)
        y_duration = self.project_duration(y_)
        y_chord = self.project_chord(y_)
        y_rest = self.project_rest(y_)
        y_tempo = self.project_tempo(y_)
        y_emo = self.project_emo(y_)

        outputs = [F.log_softmax(y_family, dim=-1), F.log_softmax(y_bar, dim=-1), F.log_softmax(y_pitch, dim=-1), F.log_softmax(y_velocity, dim=-1), F.log_softmax(y_duration, dim=-1), F.log_softmax(y_chord, dim=-1), F.log_softmax(y_rest, dim=-1), F.log_softmax(y_tempo, dim=-1)]

        # y_notes = F.log_softmax(y_notes, dim=-1)
        # y_emo = F.log_softmax(y_emo, dim=-1)

        return outputs, F.log_softmax(y_emo, dim=-1)

In [31]:
class Discriminator(nn.Module):
    """
    Discriminator based on a pytorch TransformerEncoder.
    """
    
    def __init__(self, ntokens, d_model, nhead, nhid, nlayers, dropout=0.5, max_length = 2048):
        super(Discriminator, self).__init__()
        
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoderLayer, TransformerDecoder
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')

        self.d_model = d_model

        # default embedding sizes:
        self.embed_siz = [32, 64, 128, 128, 512, 128, 128, 128, 128]
        self.ntokens = ntokens
        # embedding encoding
        # print(self.ntokens[0], self.embed_siz[0])
        self.embedding_family  = nn.Embedding(self.ntokens[0], self.embed_siz[0])
        self.embedding_bar  = nn.Embedding(self.ntokens[1], self.embed_siz[1])
        self.embedding_pitch  = nn.Embedding(self.ntokens[2], self.embed_siz[2])
        self.embedding_velocity  = nn.Embedding(self.ntokens[3], self.embed_siz[3])
        self.embedding_duration  = nn.Embedding(self.ntokens[4], self.embed_siz[4])
        self.embedding_chord  = nn.Embedding(self.ntokens[5], self.embed_siz[5])
        self.embedding_rest  = nn.Embedding(self.ntokens[6], self.embed_siz[6])
        self.embedding_tempo  = nn.Embedding(self.ntokens[7], self.embed_siz[7])
        self.embedding_emotion   = nn.Embedding(self.ntokens[8], self.embed_siz[8])
        
        # linear layer for converting the extra dimension to a linear vector
        self.linear = nn.Linear(np.sum(self.embed_siz), self.d_model)
        
        # encoding positional information using position encoder
        # with default drop out of 0.2
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        # encoding layers
        encoder_layers = TransformerEncoderLayer(d_model, nhead, nhid, dropout)
        self.encoder = TransformerEncoder(encoder_layers, nlayers)
        
        # final classification layer
        self.classifier = nn.Linear(d_model, 2)

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.embedding_family.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_bar.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_pitch.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_velocity.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_duration.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_chord.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_rest.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_tempo.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_emotion.weight, -initrange, initrange)
     
     
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.classifier.bias.data.zero_()
        self.classifier.weight.data.uniform_(-initrange, initrange)
        # self.project_emo.bias.data.zero_()
        # self.project_emo.weight.data.uniform_(-initrange, initrange)
    def forward(self, x_family, x_bar, x_pitch, x_velocity, x_duration, x_chord ,x_rest, x_tempo, x_emo):
        # creating embedding for all tokens and emotions
        print(x_family.shape)
        
        x_family = self.embedding_family(x_family)
        # print(x_family)
        x_bar = self.embedding_bar(x_bar)
        x_pitch = self.embedding_pitch(x_pitch)
        x_velocity = self.embedding_velocity(x_velocity)
        x_duration = self.embedding_duration(x_duration)
        x_chord = self.embedding_chord(x_chord)
        x_rest = self.embedding_rest(x_rest)
        x_tempo = self.embedding_tempo(x_tempo)
        x_emo = self.embedding_emotion(x_emo)

        # normalising the input for the position encoding
        x_family = x_family * math.sqrt(self.d_model)
        x_bar = x_bar * math.sqrt(self.d_model)
        x_pitch = x_pitch * math.sqrt(self.d_model)
        x_velocity = x_velocity * math.sqrt(self.d_model)
        x_duration = x_duration * math.sqrt(self.d_model)
        x_chord = x_chord * math.sqrt(self.d_model)
        x_rest = x_rest * math.sqrt(self.d_model)
        x_tempo = x_tempo * math.sqrt(self.d_model)
        x_emo = x_emo * math.sqrt(self.d_model)

        # concatenating as one input
        x = torch.cat([x_family, x_bar, x_pitch, x_velocity, x_duration, x_chord, x_rest, x_tempo, x_emo], dim=-1)
        # print(x.shape, "disc shape for linear")

        # sending through linear layer
        x = self.linear(x)

        # encoding positions
        x = self.pos_encoder(x)

        # sending through transformer encoder
        x = self.encoder(x)
        
        # classification
        x = x.mean(dim=1)
        x = self.classifier(x)
        return nn.Sigmoid()(x)
        

In [32]:
dictionary = cp_enc.vocab.token_to_event

In [33]:
class MidiTransGAN(nn.Module):
    def __init__(self, generator, discriminator, noise_fn, data_fn,
                 batch_size=2, device='cuda', lr_d=0.0001, lr_g=0.0001):
        """A GAN class for holding and training a generator and discriminator
        Args:
            generator: a Ganerator network
            discriminator: A Discriminator network
            noise_fn: function f(num: int) -> pytorch tensor, (latent vectors)
            data_fn: function f(num: int) -> pytorch tensor, (real samples)
            batch_size: training batch size
            device: cpu or CUDA
            lr_d: learning rate for the discriminator
            lr_g: learning rate for the generator
        """
        super(MidiTransGAN, self).__init__()
        self.generator = generator.to(device)
        # self.generator = self.generator.to(device)
        self.discriminator = discriminator.to(device)
        # self.discriminator = self.discriminator.to(device)
        self.noise_fn = noise_fn
        self.data_fn = data_fn
        self.batch_size = batch_size
        self.device = device
        self.criterion = nn.CrossEntropyLoss(reduction='mean')
        self.optim_d = torch.optim.SGD(discriminator.parameters(),
                                  lr=lr_d,)
        self.optim_g = torch.optim.SGD(generator.parameters(),
                                  lr=lr_g)
        self.target_ones = torch.ones((1, 218)).to(device)
        self.target_zeros = torch.zeros((256, 218)).to(device)
        self.seq_len = 100
        # self.src_mask = self.src_mask = torch.triu(torch.ones(511, 511) * float('-inf'), diagonal=1).to(device)

    def compute_accuracy(self, predicted_weights, target):
        predicted = predicted_weights.argmax(dim=1)
        return torch.sum(predicted == target) / len(target)

    def calc_gradient_penalty(self, real, fake, LAMBDA=10):
        temp_notes = torch.rand([real.shape[0], 1]).to(device)
        # expand into the shape
        temp_notes = temp_notes.expand(real[:,:,0].size())

        # interpolation
        mid = temp_notes * real[:,:,0] + ((1 - temp_notes) * fake[:,:,0])

        mid = mid.type(torch.LongTensor)
        mid = mid.type(torch.FloatTensor).to(device)

        mid = torch.autograd.Variable(mid, requires_grad=True)
        # print(mid.shape)
    
        mid = torch.einsum(
            "ve,bn -> bne",
            self.discriminator.embedding_notes.weight,
            mid,
        )

        # print(mid.type(torch.LongTensor))
        classification = self.discriminator(emb_note = mid, x_emo = real[:,:,1].to(device))
        

        gradients = torch.autograd.grad(outputs=classification, inputs=mid,
                                        grad_outputs=torch.ones(classification.size(), device=device),
                                        create_graph=True, retain_graph=True, allow_unused = True)[0]
        # print(gradients)
        gradients = gradients.view(real.shape[0], -1)

        # https://github.com/igul222/improved_wgan_training/blob/master/gan_language.py
        slopes = torch.sqrt(torch.sum(gradients ** 2, dim=1) + 1e-12)
        gradient_penalty = ((slopes - 1.) ** 2).mean() * LAMBDA

        return gradient_penalty

    def weighted_sampling(self, probs):
        probs /= sum(probs)
        sorted_probs = np.sort(probs)[::-1]
        sorted_index = np.argsort(probs)[::-1]
        word = np.random.choice(sorted_index, size=1, p=sorted_probs)[0]
        return word

    def sampling(self, logits, p=None, t=1.0):
        logits = logits[-1].squeeze().cpu().numpy()
        probs = np.exp(logits / t) / np.sum(np.exp(logits / t))
        # print(probs)
        cur_word = self.weighted_sampling(probs)
        return cur_word

    def generate_samples(self, latent_vec=None, emotion=None, num=None, src_mask = None, display = False):
        """Sample from the generator.
        Args:
            latent_vec: A pytorch latent vector or None
            num: The number of samples to generate if latent_vec is None
        If latent_vec and num are None then use self.batch_size random latent
        vectors.
        """
        num = self.batch_size if num is None else num
        latent_vec = self.noise_fn(self.seq_len,1, emotion) if latent_vec is None else latent_vec

        last_metric = torch.LongTensor([1, len(tokens_to_raw[0]), len(tokens_to_raw[0]), len(tokens_to_raw[0]), len(tokens_to_raw[0]), len(tokens_to_raw[5]), len(tokens_to_raw[6]), len(tokens_to_raw[7]), 0])
        last_note = torch.LongTensor([2, 3, 6, 2, 2, 2, 2, 2, 0])

        # 2 for note and 3 for metric

        if emotion == None:
            emotion = latent_vec[:,:,8][0][0]
        
        if src_mask == None:
            src_mask = generate_square_subsequent_mask((latent_vec.size(0))).to(device)
        # print(src_mask.shape)

        # since we are not training
        # we fix no gradients
        with torch.no_grad():
            # learning for 3 sequences at a time
            # this is purely due to resource constraints
            count = 0
            while(count <= num):

                # generating fake samples
                fake_samples, _ = generator(latent_vec[:,:,0], latent_vec[:,:,1], latent_vec[:,:,2], latent_vec[:,:,3], latent_vec[:,:,4], latent_vec[:,:,5], latent_vec[:,:,6], latent_vec[:,:,7], latent_vec[:,:,8], src_mask = None)
                

                cur_family =    self.sampling(fake_samples[0], t=1)
                cur_bar =  self.sampling(fake_samples[1], t=2)
                cur_pitch =    self.sampling(fake_samples[2], t=1)
                cur_velocity =    self.sampling(fake_samples[3], t=5)
                cur_duration = self.sampling(fake_samples[4], t=5)
                cur_chord =    self.sampling(fake_samples[5], t=2)
                cur_rest =    self.sampling(fake_samples[6], t=1)
                cur_tempo = self.sampling(fake_samples[7], t=1)

                cur_family_corrected = tokens_to_raw[0][cur_family.item()].item()
                cur_bar_corrected = tokens_to_raw[1][cur_bar.item()].item()
                cur_pitch_corrected = tokens_to_raw[2][cur_pitch.item()].item()
                cur_velocity_corrected = tokens_to_raw[3][cur_velocity.item()].item()
                cur_duration_corrected = tokens_to_raw[4][cur_duration.item()].item()
                cur_chord_corrected = tokens_to_raw[5][cur_chord.item()].item()
                cur_rest_corrected = tokens_to_raw[6][cur_rest.item()].item()
                cur_tempo_corrected = tokens_to_raw[7][cur_tempo.item()].item()

                good_token = False
                # if it is a note family
                if cur_family_corrected == 2:
                    # if this does not contain any ignores
                    # The ignore tokens are as follows:
                    
                    # 93: Veloctiy Ignore
                    # 126: Duration Ignore 
                    # 191: Position Ignore
                    
                    # if it does not contain any ignores, it is a perfect prediction, hence we can
                    # use this as the last note
                    together = [cur_pitch_corrected, cur_velocity_corrected, cur_duration_corrected]
                    if not (set([0,4,93,126]) & set(together)):
                        count += 1
                        good_token = True
                    # if cur_pitch_corrected != 4 and cur_velocity_corrected != 93 and cur_duration_corrected != 126 and cur_bar_corrected != 191:
                    #     last_note = torch.LongTensor([cur_family, cur_bar, cur_pitch, cur_velocity, cur_duration, cur_chord, cur_rest, cur_tempo, emotion])
                    # in case pitch is ignored for note, use the last one
                    # if cur_pitch_corrected == 4 or cur_pitch_corrected == 0:
                    #     cur_pitch = last_note[2]
                    # else: 
                    #     last_note[2] = cur_pitch
                    # if cur_velocity_corrected == 93 or cur_velocity_corrected == 0:
                    #     cur_velocity = last_note[3]
                    # else: 
                    #     last_note[3] = cur_velocity
                    # if cur_duration_corrected == 126 or cur_duration_corrected == 0:
                    #     cur_duration = last_note[4] 
                    # else: 
                    #     last_note[4] = cur_duration
                    
                elif cur_family_corrected == 3:
                    # if this does not contain any ignores
                    # The ignore tokens are as follows:
                    # 224: Chord Ignore
                    # 242: Rest Ignore
                    # 252: Tempo Ignore 
                    # 4: Pitch Ignore
                    # 1: Bar Ignore

                    
                    # if it does not contain any ignores, it is a perfect prediction, hence we can
                    # use this as the last note
                    # if cur_rest_corrected != 242 or cur_rest_corrected != 0:
                    #     cur_bar = 1
                    #     cur_bar_corrected = 4
                    #     count += 1
                    #     good_token = True
                    together = [cur_chord_corrected, cur_rest_corrected, cur_tempo_corrected, cur_bar_corrected]
                    if not (set([0, 252]) & set(together)):
                        # next_tokens = torch.LongTensor([cur_family, cur_bar, cur_pitch, cur_velocity, cur_duration, cur_chord, cur_rest, cur_tempo, emotion])
                        count += 1
                        good_token = True
                    # if cur_chord_corrected != 224 and cur_rest_corrected != 242 and cur_tempo_corrected != 252:
                    #     last_metric = torch.LongTensor([cur_family, cur_bar, cur_pitch, cur_velocity, cur_duration, cur_chord, cur_rest, cur_tempo, emotion])
                    # in case pitch is ignored for note, use the last one
                    # if cur_chord_corrected == 224 or cur_chord_corrected == 0:
                    #     cur_chord = last_metric[5]
                    # else:
                    #     last_metric[5] = cur_chord
                    # if cur_rest_corrected == 242 or cur_rest_corrected == 0:
                    #     cur_rest = last_metric[6]
                    # else:
                    #     last_metric[6] = cur_rest
                    # if cur_tempo_corrected == 252 or cur_tempo_corrected == 0:
                    #     cur_tempo = last_metric[7] 
                    # else:
                    #     last_metric[7] = cur_tempo
                    # if cur_bar_corrected == 191 or cur_bar_corrected == 0 or cur_bar_corrected == 1:
                    #     cur_bar = last_metric[1] 
                    # else:
                    #     last_metric[1] = cur_bar

                # next_tokens = torch.LongTensor([cur_family, cur_bar, cur_pitch, cur_velocity, cur_duration, cur_chord, cur_rest, cur_tempo, emotion])
                if good_token:
                    next_tokens = torch.LongTensor([cur_family, cur_bar, cur_pitch, cur_velocity, cur_duration, cur_chord, cur_rest, cur_tempo, emotion])
                    # cur_family_corrected = tokens_to_raw[0][cur_family.item()].item()
                    # cur_bar_corrected = tokens_to_raw[1][cur_bar.item()].item()
                    # cur_pitch_corrected = tokens_to_raw[2][cur_pitch.item()].item()
                    # cur_velocity_corrected = tokens_to_raw[3][cur_velocity.item()].item()
                    # cur_duration_corrected = tokens_to_raw[4][cur_duration.item()].item()
                    # cur_chord_corrected = tokens_to_raw[5][cur_chord.item()].item()
                    # cur_rest_corrected = tokens_to_raw[6][cur_rest.item()].item()
                    # cur_tempo_corrected = tokens_to_raw[7][cur_tempo.item()].item()
                    
                    if(display):
                        print('| ', dictionary[cur_family_corrected], dictionary[cur_bar_corrected], dictionary[cur_pitch_corrected], dictionary[cur_velocity_corrected], dictionary[cur_duration_corrected], dictionary[cur_chord_corrected], dictionary[cur_rest_corrected], dictionary[cur_tempo_corrected])
                    # concatenate vector to a fix length of seq len (here it is set as 4)
                    # shape -> seq_len * batch_size * 9
                    # print(latent_vec.shape, next_token.shape)
                    latent_vec = torch.cat([latent_vec, next_tokens.view(1,1,next_tokens.size(0)).to(device)], dim=1)
    
            
        # with torch.no_grad():
        #     samples = self.generator(latent_vec, emotion, src_mask = None)
        return latent_vec

    def train_step_generator(self):
        """Train the generator one step and return the loss."""
        self.generator.zero_grad()

        # latent_vec = self.noise_fn(10,self.batch_size)
        # latent_vec = latent_vec.to(device)

        # emotion = self.emotions[:,:self.batch_size].to(device)
        
        # generated samples
        # starting with a sequence of length 4
        latent_vec = self.noise_fn(20,self.batch_size)
        target = latent_vec[:,:,8].T[0]
        emotion = latent_vec[:,:,8][0][0]
        loss_emotions = 0
        acc_emotions = 0
        # since we are not traning generator
        # we fix no gradients

        # learning for 10 length sequences at a time
        # this is purely due to resource constraints
        for i in range(20):

            # generating fake samples
            fake_samples, out_emo = generator(latent_vec[:,:,0], latent_vec[:,:,1], latent_vec[:,:,2], latent_vec[:,:,3], latent_vec[:,:,4], latent_vec[:,:,5], latent_vec[:,:,6], latent_vec[:,:,7], latent_vec[:,:,8], src_mask = None)
            

            # getting the weights and converting them to notes
            emo_weights = out_emo.mean(dim=0)
            loss_emotion = self.criterion(emo_weights, target)
            acc_emotion = self.compute_accuracy(emo_weights, target)
            loss_emotions += loss_emotion
            acc_emotions += acc_emotion

            word_tensor = []
            for k, output in enumerate(fake_samples):
                # For Notes:
                # getting the weights and converting them to notes
                # print(output.shape)
                word_weights = output[-1].squeeze().exp().cpu()
                
            # for Emotions:
                # getting the values from the distribution from 218 (num of possible notes)
                word = torch.multinomial(word_weights, 1)
                # batch size * 1 -> 1 * batch_size
                # word_notes = word.view(1, word.size(0))
                # print(word_notes.shape)
                word_notes = word.view(word.size(0), 1)

                word_tensor.append(word_notes.to(device))
                # = torch.stack([word_notes, word_tensor], dim=-1)

            # emotions = torch.full((word_notes.size(0),1), emotion)
            word_tensor.append(target.view(target.size(0), 1).to(device))

            # stack the emotions to the final shape: seq_len * batch_size * 2 (1 for emotion and 1 for notes)
            # here seq_len = 1
            # word_tensor.append(emotions)
            word_tensor = torch.stack(word_tensor, dim=-1)
        
        
        classifications = self.discriminator(latent_vec[:,:,0].to(device), latent_vec[:,:,1].to(device), latent_vec[:,:,2].to(device), latent_vec[:,:,3].to(device), latent_vec[:,:,4].to(device), latent_vec[:,:,5].to(device), latent_vec[:,:,6].to(device), latent_vec[:,:,7].to(device), latent_vec[:,:,8].to(device))
 
        # loss for generator
        # loss_gen = self.criterion(classifications, torch.zeros((classifications.size(0)), dtype=torch.int64).to(device))
        loss_gen = -torch.mean(classifications)
        # print(out_emo)
        # loss for emotions
        loss_emotions = loss_emotions / 20
        acc_emotions = acc_emotions / 20
        # loss_emo = self.criterion(out_emo, torch.full((out_emo.size(0), out_emo.size(1)), emotion.item()))
        # loss = (loss_gen + loss_emotions) / 2
        loss = loss_gen
        # print(loss_gen)
        # loss_gen.retain_grad()
        loss.backward()
        self.optim_g.step()
        return loss.item(), acc_emotions

    def train_step_discriminator(self):
        """Train the discriminator one step and return the losses."""
        self.discriminator.zero_grad()

        # getting real samples
        # this is using the data_fn or the get batch function
        # here, the data is the sequence with shape batch_size * seq_len * num of tokens
        # in general that is 32 * 100 * 2
        # this batch is randomly sampled from the corpus
        # the target sequence is the same shape, and is the next step in the sequence
        real_samples, real_target = self.data_fn(train_data, self.batch_size)
        emotions = real_samples[:,:,8].T[0]
        real_samples = real_samples.to(device)
        # real_target = real_target.to(device)
        # print(real_samples[:,:,0])

        # the discrimiator
        # [:,:,0] -> notes
        # [:,:,1] -> emotion
        pred_real = self.discriminator(real_samples[:,:,0].to(device), real_samples[:,:,1].to(device), real_samples[:,:,2].to(device), real_samples[:,:,3].to(device), real_samples[:,:,4].to(device), real_samples[:,:,5].to(device), real_samples[:,:,6].to(device), real_samples[:,:,7].to(device), real_samples[:,:,8].to(device))
        
        # loss_real = self.criterion(pred_real, torch.ones(pred_real.size(0), dtype=torch.int64).to(device))

        # generated samples
        # starting with a sequence of length 4
        latent_vec = self.noise_fn(real_samples.size(1),self.batch_size, emotions)
        target = latent_vec[:,:,8].T[0]
        emotion = latent_vec[:,:,8][0][0]
        loss_emotions = 0
        acc_emotions = 0
        nll_loss = 0

        # since we are not traning generator
        # we fix no gradients
        with torch.no_grad():
            # learning for 3 sequences at a time
            # this is purely due to resource constraints
            for i in range(20):

                # generating fake samples
                # print(latent_vec[:,:,8])
                fake_samples, out_emo = generator(latent_vec[:,:,0], latent_vec[:,:,1], latent_vec[:,:,2], latent_vec[:,:,3], latent_vec[:,:,4], latent_vec[:,:,5], latent_vec[:,:,6], latent_vec[:,:,7], latent_vec[:,:,8], src_mask = None)
                
                # for Emotions:
                # getting the weights and converting them to notes
                emo_weights = out_emo.mean(dim=0)
                loss_emotion = self.criterion(emo_weights, target)
                acc_emotion = self.compute_accuracy(emo_weights, target)
                loss_emotions += loss_emotion
                acc_emotions += acc_emotion

                word_tensor = []
                for k, output in enumerate(fake_samples):
                    # For Notes:
                    # getting the weights and converting them to notes
                    # print(output[-1].shape)
                    word_weights = output[-1].squeeze().exp().cpu()
                    # getting the values from the distribution from 218 (num of possible notes)
                    if i == 0:
                        nll_loss += self.criterion(output.view(output.size(1), output.size(2), output.size(0)).cpu(), real_target[:,:,k].cpu())
                    word = torch.multinomial(word_weights, 1)
                    # batch size * 1 -> 1 * batch_size
                    # word_notes = word.view(1, word.size(0))
                    # print(word_notes.shape)
                    word_notes = word.view(word.size(0), 1)

                    word_tensor.append(word_notes.to(device))
                    # = torch.stack([word_notes, word_tensor], dim=-1)

                # emotions = torch.full((word_notes.size(0),1), emotion)
                # emotions.repeat(seq_len, 1).T.to(device)

                # stack the emotions to the final shape: seq_len * batch_size * 2 (1 for emotion and 1 for notes)
                # here seq_len = 1
                word_tensor.append(target.view(target.size(0), 1).to(device))
                word_tensor = torch.stack(word_tensor, dim=-1)

                if i == 0:
                    nll_loss_emotion = nll_loss + self.criterion(out_emo.view(out_emo.size(1), out_emo.size(2), out_emo.size(0)).cpu(), real_target[:,:,8].cpu())
                    
                
                
                # concatenate vector to a fix length of seq len (here it is set as 4)
                # shape -> seq_len * batch_size * 9
                # IMP: [;,1;,:] to keep input size fixed
                latent_vec = torch.cat([latent_vec[:,1:,:], word_tensor.to(device)], dim=1)
        nll_loss = nll_loss / 8
        nll_loss_emotion = nll_loss_emotion / 9
        # predict on the fake samples
        pred_fake = self.discriminator(latent_vec[:,:,0].to(device), latent_vec[:,:,1].to(device), latent_vec[:,:,2].to(device), latent_vec[:,:,3].to(device), latent_vec[:,:,4].to(device), latent_vec[:,:,5].to(device), latent_vec[:,:,6].to(device), latent_vec[:,:,7].to(device), latent_vec[:,:,8].to(device))
        # loss on fake
        # loss_fake = self.criterion(pred_fake, torch.zeros((pred_fake.size(0)), dtype=torch.int64).to(device))
        # loss on emotions
        loss_emotions = loss_emotions / 20
        acc_emotions = acc_emotions / 20
        # loss_emo = criterion(out_emo.cpu(), emotion.T[:,:5].cpu())

        loss_fake = torch.mean(pred_fake)
        loss_real = -torch.mean(pred_real)
        # combine
        loss = (loss_real + loss_fake + loss_emotions) 
        loss.backward()
        # print(out_emo)
        # print(emotion[:out_emo.size(0)])
        # loss_real.backward()
        # loss_fake.backward()
        # loss_emo.backward()
        self.optim_d.step()
        return loss_real.item(), loss_fake.item(), acc_emotions, nll_loss.item(), nll_loss_emotion.item()

    def train_step(self):
        """Train both networks and return the losses."""
        loss_d = self.train_step_discriminator()
        loss_g = self.train_step_generator()
        return loss_g, loss_d

In [34]:
# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.
def get_batch(source, batch_size):
    rand_columns = torch.randperm(source.size(0))[:batch_size]
    # batch_size = min(batch_size, len(source) - 1 - i)
    data = source[rand_columns,:source.size(1)-1, :]
    target = source[rand_columns,1:source.size(1), :]
    return data, target

In [35]:
def generate_square_subsequent_mask(sz):
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [36]:
ntokens

[3, 35, 86, 32, 66, 16, 11, 3, 4]

In [37]:
def noise_fn(seq_len, batch_size, emotions=None):
    notes = []
    for token in ntokens[:-1]:
        # print(token)
        notes.append(torch.randint(1,token, (batch_size, seq_len), dtype=torch.long).to(device))
    
    if emotions != None:
        emotions = emotions.repeat(seq_len, 1).T.to(device)
    else:
        emotion = torch.randint(0,4, (1,), dtype=torch.long)
        emotions = torch.full((batch_size, seq_len), emotion.item()).to(device)
    
    notes.append(emotions)
    return torch.stack(notes, dim=-1)
    

In [40]:
batch_size = 8

In [41]:
generator = Generator(ntokens, emsize, nhead, nlayer, dropout)
discriminator = Discriminator(ntokens, emsize, nhead, nhid, nlayer, dropout)

gan = MidiTransGAN(generator, discriminator, noise_fn, get_batch, batch_size=batch_size, device=device)

## Training

In [42]:
import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [43]:
from time import time

gan.train()

def train():
    epochs = 120
    batches = 10
    
    loss_gs, acc_gs, loss_d_reals, loss_d_fakes, acc_ds, nll_losses, emo_losses = [], [], [], [], [], [], []
    start = time()
    
    for epoch in range(epochs):
        total_loss_g, total_acc_g, total_loss_d_real, total_loss_d_fake, total_acc_d, total_nll, total_nll_emo  = 0, 0, 0, 0, 0, 0, 0

        for batch in range(batches):
            (loss_g, accuracy_g), (loss_d_real, loss_d_fake, accuracy_d, nll_loss, nll_loss_emo) = gan.train_step()
            total_loss_g += loss_g
            total_loss_d_real += loss_d_real
            total_loss_d_fake += loss_d_fake
            total_acc_g += accuracy_g
            total_acc_d += accuracy_d
            total_nll += nll_loss
            total_nll_emo += nll_loss_emo

        loss_gs.append(total_loss_g / batches)
        loss_d_reals.append(total_loss_d_real / batches)
        loss_d_fakes.append(total_loss_d_fake / batches)
        acc_gs.append(total_acc_g / batches)
        acc_ds.append(total_acc_d / batches)
        nll_losses.append(total_nll / batches)
        emo_losses.append(total_nll_emo / batches)

        writer.add_scalar("Generator Loss", loss_gs[-1], epoch)
        writer.add_scalar("Discriminator Loss (Real)", loss_d_reals[-1], epoch)
        writer.add_scalar("Discriminator Loss (Fake)", loss_d_fakes[-1], epoch)
        writer.add_scalar("Generator Accuracy", acc_gs[-1], epoch)
        writer.add_scalar("Discriminator Accuracy", acc_ds[-1], epoch)
        writer.add_scalar("NLL", nll_losses[-1], epoch)
        writer.add_scalar("NLL (Emo)", emo_losses[-1], epoch)
        
        print(f"Epoch {epoch+1}/{epochs} ({int(time() - start)}s):"
              f" Gen Loss: {loss_gs[-1]:.3f},"
              f" Dis Loss (Real): {loss_d_reals[-1]:.3f},"
              f" Dis Loss (Fake): {loss_d_fakes[-1]:.3f}",
              f" Gen Accuracy: {acc_gs[-1]:.3f}",
              f" Dis Accuracy: {acc_ds[-1]:.3f}",
              f" NLL: {nll_losses[-1]:.3f}",
              f" NLL (Emo): {emo_losses[-1]:.3f}")
train()

torch.Size([8, 100])


KeyboardInterrupt: 

In [42]:
# Print model's state_dict
print("Model's state_dict:")
for param_tensor in gan.state_dict():
    print(param_tensor, "\t", gan.state_dict()[param_tensor].size())

Model's state_dict:
generator.embedding_family.weight 	 torch.Size([3, 32])
generator.embedding_bar.weight 	 torch.Size([35, 64])
generator.embedding_pitch.weight 	 torch.Size([86, 128])
generator.embedding_velocity.weight 	 torch.Size([32, 128])
generator.embedding_duration.weight 	 torch.Size([66, 512])
generator.embedding_chord.weight 	 torch.Size([16, 128])
generator.embedding_rest.weight 	 torch.Size([11, 128])
generator.embedding_tempo.weight 	 torch.Size([3, 128])
generator.embedding_emotion.weight 	 torch.Size([4, 128])
generator.in_linear.weight 	 torch.Size([512, 1376])
generator.in_linear.bias 	 torch.Size([512])
generator.pos_encoder.pe 	 torch.Size([5000, 1, 512])
generator.linear.weight 	 torch.Size([512, 1376])
generator.linear.bias 	 torch.Size([512])
generator.encoder.layers.0.self_attn.in_proj_weight 	 torch.Size([1536, 512])
generator.encoder.layers.0.self_attn.in_proj_bias 	 torch.Size([1536])
generator.encoder.layers.0.self_attn.out_proj.weight 	 torch.Size([512, 5

In [43]:
torch.save(gan.state_dict(), './models/cp_trans_gan_v2_emotion_changes.pt')

## Generate

In [44]:
gan = MidiTransGAN(generator, discriminator, noise_fn, get_batch, device=device)
gan.load_state_dict(torch.load('./models/cp_trans_gan_v2_emotion_changes.pt'))
gan.eval()

MidiTransGAN(
  (generator): Generator(
    (criterion): CrossEntropyLoss()
    (embedding_family): Embedding(3, 32)
    (embedding_bar): Embedding(35, 64)
    (embedding_pitch): Embedding(86, 128)
    (embedding_velocity): Embedding(32, 128)
    (embedding_duration): Embedding(66, 512)
    (embedding_chord): Embedding(16, 128)
    (embedding_rest): Embedding(11, 128)
    (embedding_tempo): Embedding(3, 128)
    (embedding_emotion): Embedding(4, 128)
    (in_linear): Linear(in_features=1376, out_features=512, bias=True)
    (pos_encoder): PositionalEncoding(
      (dropout): Dropout(p=0.2, inplace=False)
    )
    (linear): Linear(in_features=1376, out_features=512, bias=True)
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=Tru

In [45]:
# tensorboard
# https://pytorch.org/tutorials/recipes/recipes/tensorboard_with_pytorch.html?msclkid=ce0b97e5b41911ec9d2e71bb3c7d0f90

In [46]:
# !pip install muspy
import muspy

In [47]:
# TODO: fix the generate sample function to handle batch size = 1
sequences = []
for emo in range(0,1):
    n_generate = 200
    temperature = 1
    log_interval = 4000 # interval between logs

    notes = []
    for token in ntokens[:-1]:
        # print(token)
        notes.append(torch.randint(token, (1, 2), dtype=torch.long).to(device))
    

    emotions = torch.full((1, 2), emo).to(device)
    
    notes.append(emotions)

    # stacked input
    inputs = torch.stack(notes, dim=-1)
    print(len(inputs))
        
    src_mask = generate_square_subsequent_mask(len(inputs)).to(device)

    output = gan.generate_samples(latent_vec=inputs, emotion=emo, num=n_generate, src_mask=None)

    for i in range(len(output)):
        # output[i,:,0] = torch.tensor([tokens_to_raw[0][l] for l in output[i,:,0]])
        for k in range(0,8):
            output[i,:,k] = torch.tensor([tokens_to_raw[k][l] for l in output[i,:,k]])
    # print(output)
    # if i % log_interval == 0:
    print('| Generated {} notes'.format(n_generate))
    sequences.append([output[:,2:,:-1].squeeze().cpu().tolist()])

1
| Generated 200 notes


In [48]:
import muspy

In [49]:
sequences

[[[[2, 207, 78, 121, 132, 225, 245, 265],
   [2, 209, 85, 119, 158, 232, 244, 0],
   [2, 207, 49, 120, 177, 238, 244, 0],
   [2, 1, 42, 108, 177, 232, 244, 265],
   [2, 217, 63, 97, 130, 227, 248, 0],
   [2, 200, 87, 105, 174, 238, 245, 265],
   [3, 209, 84, 100, 169, 236, 251, 265],
   [3, 210, 68, 0, 146, 225, 247, 265],
   [3, 199, 53, 102, 188, 230, 251, 265],
   [2, 217, 41, 117, 177, 238, 249, 265],
   [3, 195, 24, 119, 152, 238, 245, 265],
   [2, 206, 63, 110, 184, 225, 0, 0],
   [3, 205, 62, 116, 145, 231, 251, 265],
   [3, 209, 48, 123, 186, 230, 244, 265],
   [2, 206, 73, 102, 162, 238, 248, 0],
   [3, 216, 13, 102, 131, 225, 248, 265],
   [3, 218, 27, 113, 147, 233, 244, 265],
   [3, 211, 42, 95, 137, 229, 246, 265],
   [3, 219, 39, 100, 161, 230, 242, 265],
   [3, 209, 75, 0, 177, 232, 251, 265],
   [3, 213, 41, 98, 143, 234, 245, 265],
   [3, 192, 42, 102, 133, 228, 248, 265],
   [3, 223, 17, 99, 166, 234, 251, 265],
   [3, 209, 6, 119, 143, 233, 249, 265],
   [2, 223, 19,

In [50]:
date = '13_04_'
pitch_ranges = []
n_pitches = []
polyphonies = []
empty_beat_rates = []
for i,seq in enumerate(sequences):
    # TODO: remove this
    # seq = seq[0]

    converted_back_midi = cp_enc.tokens_to_midi(seq, get_midi_programs(midi))
    file_name = 'cptransgan_v2_emotion_temp_' + date  + str(i+1) + '.mid'
    converted_back_midi.dump(file_name)
    music = muspy.read_midi(file_name)
    pitch_range = muspy.pitch_range(music)
    n_pitches_used = muspy.n_pitches_used(music)
    polyphony = muspy.polyphony(music) # average number of pitches being played concurrently.
    empty_beat_rate = muspy.empty_beat_rate(music)

    # music = muspy.read_midi(file_name)
    pitch_ranges.append(muspy.pitch_range(music))
    n_pitches.append(muspy.n_pitches_used(music))
    polyphonies.append(muspy.polyphony(music)) # average number of pitches being played concurrently.
    empty_beat_rates.append(muspy.empty_beat_rate(music))

cptransgan_v2_emotion_temp_13_04_1.mid


In [65]:
cp_enc.vocab.token_to_event

{0: 'PAD_None',
 1: 'Bar_None',
 2: 'Family_Note',
 3: 'Family_Metric',
 4: 'Pitch_Ignore',
 5: 'Pitch_21',
 6: 'Pitch_22',
 7: 'Pitch_23',
 8: 'Pitch_24',
 9: 'Pitch_25',
 10: 'Pitch_26',
 11: 'Pitch_27',
 12: 'Pitch_28',
 13: 'Pitch_29',
 14: 'Pitch_30',
 15: 'Pitch_31',
 16: 'Pitch_32',
 17: 'Pitch_33',
 18: 'Pitch_34',
 19: 'Pitch_35',
 20: 'Pitch_36',
 21: 'Pitch_37',
 22: 'Pitch_38',
 23: 'Pitch_39',
 24: 'Pitch_40',
 25: 'Pitch_41',
 26: 'Pitch_42',
 27: 'Pitch_43',
 28: 'Pitch_44',
 29: 'Pitch_45',
 30: 'Pitch_46',
 31: 'Pitch_47',
 32: 'Pitch_48',
 33: 'Pitch_49',
 34: 'Pitch_50',
 35: 'Pitch_51',
 36: 'Pitch_52',
 37: 'Pitch_53',
 38: 'Pitch_54',
 39: 'Pitch_55',
 40: 'Pitch_56',
 41: 'Pitch_57',
 42: 'Pitch_58',
 43: 'Pitch_59',
 44: 'Pitch_60',
 45: 'Pitch_61',
 46: 'Pitch_62',
 47: 'Pitch_63',
 48: 'Pitch_64',
 49: 'Pitch_65',
 50: 'Pitch_66',
 51: 'Pitch_67',
 52: 'Pitch_68',
 53: 'Pitch_69',
 54: 'Pitch_70',
 55: 'Pitch_71',
 56: 'Pitch_72',
 57: 'Pitch_73',
 58: 'Pitch_

In [184]:
tokens_to_raw

[tensor([0, 2, 3]),
 tensor([  0,   1, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202,
         203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216,
         217, 218, 219, 220, 221, 222, 223]),
 tensor([ 0,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
         22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
         40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
         58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
         76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89]),
 tensor([  0,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
         106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
         120, 121, 122, 123]),
 tensor([  0, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
         139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152,
         153, 154, 155, 156, 157, 158,

In [179]:
seq_copies = sequences

In [183]:
seq_copies[seq_copies != 0]

[[[3, 206, 57, 113, 167, 226, 244, 0],
  [2, 217, 25, 123, 140, 238, 244, 0],
  [3, 216, 25, 102, 190, 238, 244, 252],
  [2, 213, 46, 116, 167, 234, 243, 252],
  [3, 213, 34, 116, 173, 234, 244, 0],
  [3, 213, 53, 104, 167, 224, 244, 252],
  [2, 221, 66, 113, 174, 235, 244, 252],
  [2, 192, 66, 111, 167, 237, 244, 252],
  [3, 213, 64, 116, 145, 229, 244, 252],
  [2, 219, 64, 113, 189, 236, 248, 0],
  [3, 216, 53, 116, 153, 238, 250, 252],
  [3, 222, 64, 113, 147, 234, 247, 252],
  [3, 213, 34, 117, 167, 238, 247, 252],
  [3, 213, 73, 112, 140, 0, 249, 252],
  [3, 206, 73, 121, 182, 236, 248, 252],
  [3, 191, 54, 113, 168, 235, 244, 252],
  [3, 214, 32, 113, 167, 236, 249, 252],
  [3, 0, 66, 0, 141, 236, 243, 0],
  [3, 202, 53, 116, 164, 234, 244, 252],
  [3, 210, 11, 113, 167, 229, 247, 252],
  [3, 207, 86, 113, 128, 236, 249, 252],
  [3, 213, 6, 113, 167, 232, 246, 252],
  [3, 206, 80, 121, 141, 235, 244, 252],
  [3, 191, 73, 113, 182, 236, 243, 252],
  [3, 222, 66, 116, 150, 236, 249

In [187]:
count = 0
for seq in sequences:
    for token in seq[0]:
        if token[1] > 191:
            if token[7] == 0 or token[6] == 0 or token[5] == 0:
                count += 1
                

In [188]:
count

2161

In [186]:
for seq in sequences:
    for token in seq[0][9:]:
        print(token)
        print(cp_enc.tokens_to_track([token]))

[3, 206, 43, 113, 176, 234, 244, 265]
(Instrument(program=0, is_drum=False, name="Acoustic Grand Piano"), [TempoChange(tempo=121, time=0)])
[3, 192, 25, 115, 149, 231, 249, 0]


ValueError: invalid literal for int() with base 10: 'None'

In [None]:
sequences = []
date = '29_03_'
pitch_ranges = []
n_pitches = []
polyphonies = []
empty_beat_rates = []

for k in range(25):
    print(k)
    for emo in range(1,5):
        n_generate = 4000
        temperature = 1
        sequence = []
        log_interval = 4000 # interval between logs
        input = torch.randint(218, (1, 2), dtype=torch.long).to(device)
        emotion = torch.zeros((1, 2), dtype=int).to(device)
        emotion[:,0] = emo


        src_mask = generate_square_subsequent_mask(len(input)).to(device)
        with open('./output', 'w') as outf:
            with torch.no_grad():  # no tracking history
                for i in range(n_generate):

                    output, _ = gan.generate_samples(latent_vec=input, emotion=emotion)

                    word_weights = output[-1].squeeze().div(temperature).exp().cpu()
                    word = torch.multinomial(word_weights, 1)[0].tolist()
                    word_tensor = torch.Tensor([word]).long().to(device)
                    
                    input = torch.cat([input, word_tensor], 1)
                    emotion = torch.cat([emotion, torch.zeros((1,1), dtype=int).to(device)], -1)

                    outf.write(str(word) + ('\n' if i % 20 == 19 else ' '))
                    
                    sequence.extend(word)

                    if i % log_interval == 0:
                        print('| Generated {}/{} notes'.format(i, n_generate))
        sequences.append([sequence])
        converted_back_midi = remi_enc.tokens_to_midi([sequence], get_midi_programs(midi))
        file_name = 'transgan_' + date + str(k) + '_' + str(emo) + '.mid'
        converted_back_midi.dump(file_name)

        music = muspy.read_midi(file_name)
        pitch_ranges.append(muspy.pitch_range(music))
        n_pitches.append(muspy.n_pitches_used(music))
        polyphonies.append(muspy.polyphony(music)) # average number of pitches being played concurrently.
        empty_beat_rates.append(muspy.empty_beat_rate(music))

In [None]:
results_transgan = {'Pitch_range': pitch_ranges, 'Num_pitches': n_pitches, 'Polyphony': polyphonies, 'Empty_beat_rates': empty_beat_rates}
results_df = pd.DataFrame(results_transgan)
results_df.to_csv('remi_ransgan_results_v2_emo.csv')

In [None]:
converted_back_midi

ticks per beat: 384
max tick: 0
tempo changes: 1
time sig: 0
key sig: 0
markers: 0
lyrics: False
instruments: 1

## Metrics

### BLEU Score

In [None]:
train_check = train_data[:,:,0]
train_check.shape

torch.Size([10372, 101])

In [None]:
gen_check = []
for sequence in sequences:
    # print(sequence[0])
    for i in range(0, len(sequence[0])-101, 101):
        gen_check.append(sequence[0][i:i+101])

In [None]:
torch.Tensor(gen_check).shape

torch.Size([156, 101])

In [None]:
from nltk.translate.bleu_score import corpus_bleu

score = corpus_bleu([train_check], [torch.Tensor(gen_check)])


0.0


### MusPy metrics

In [None]:
results_df.describe()

Unnamed: 0,Pitch_range,Num_pitches,Polyphony,Empty_beat_rates
count,4.0,4.0,4.0,4.0
mean,85.25,45.75,10.413417,0.038571
std,0.957427,5.5,4.625193,0.053883
min,84.0,41.0,5.19863,0.0
25%,84.75,41.0,7.192225,0.0
50%,85.5,45.5,10.901519,0.02
75%,86.0,50.25,14.122711,0.058571
max,86.0,51.0,14.652,0.114286


In [None]:
music = muspy.read_midi('conditioned_17_03_4.mid')
pitch_range = muspy.pitch_range(music)
n_pitches_used = muspy.n_pitches_used(music)
polyphony = muspy.polyphony(music) # average number of pitches being played concurrently.
empty_beat_rate = muspy.empty_beat_rate(music)

print("The pitch range is", pitch_range)
print("The number of unique pitches used is", n_pitches_used)
print("The polyphony is", polyphony)
print("The empty beat rate is", empty_beat_rate)

## Extra

In [None]:
class MidiBert(nn.Module):
    def __init__(self, bert_model_path, ntokens, hidden_size=200):
        super().__init__()
        
        # self.bert = BertModel(max_position_embeddings= max_position_embeddings, position_embedding_type=position_embedding_type, hidden_size=hidden_size)
        self.bert = BertForSequenceClassification.from_pretrained(bert_model_path, problem_type="multi_label_classification", num_labels = 4)
        self.d_model = 768
        self.hidden_size = hidden_size
        # self.bertConfig = bertConfig

        # token types: [Bar, Position, Pitch, Duration]
        self.n_token = ntokens
        self.emb_size = 256
        
        # word_emb: embeddings to change token ids into embeddings
        self.word_emb = nn.Embedding(self.n_token, self.emb_size) 

        # linear layer to merge embeddings from different token types 
        self.in_linear = nn.Linear(self.emb_size, self.d_model)

        self.proj = nn.Linear(hidden_size, ntokens)


    def forward(self, input_id, attn_mask=None):
        # convert input_ids into embeddings and merge them through linear layer
        emb = self.word_emb(input_id) * math.sqrt(self.d_model)
        # emb_squared = emb 
        emb_linear = self.in_linear(emb)
        
        # feed to bert 
        y = self.bert(inputs_embeds=emb_linear, attention_mask=attn_mask, output_hidden_states=True)
        # y = y.hidden_states[-1]        # (batch_size, seq_len, 768)
        # y = self.proj(y) 
        return y