# Emotion Conditioned Music Generation
This notebook provides the code for implementing a Transformer-GAN for the dissertation. The objective of the model is to produce sentimental music given an input emotion


## Importing libraries
Please install these libraries, especially torch and torch vision since this code runs on Pytorch 1.7.0

In [1]:
# !pip install music21 miditoolkit miditok

In [2]:
# %pip install --user torch==1.7.0 torchvision==0.8.1 -f https://download.pytorch.org/whl/cu102/torch_stable.html

In [None]:
# %pip install torch torchvision

In [186]:
import numpy as np 
import pandas as pd 
from io import open
import tensorflow as tf
import glob
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import time
from miditok import get_midi_programs, REMI, CPWord
from miditoolkit import MidiFile
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler

In [187]:
torch.__version__

'1.7.0'

In [188]:
device = 'cuda'

In [189]:
torch.cuda.empty_cache()

In [190]:
torch.cuda.is_available()

True

In [191]:
# Seed
# seed = 22
# torch.manual_seed(seed)
# torch.cuda.manual_seed(seed)
# torch.cuda.manual_seed_all(seed)
# np.random.seed(seed)
# torch.backends.cudnn.benchmark = False
# torch.backends.cudnn.deterministic = True

## Loading the Dataset

In [192]:
# how a midi file looks like
midi = MidiFile('archive/EMOPIA_1.0 (1)/EMOPIA_1.0/midis/Q1__8v0MFBZoco_0.mid')
midi

ticks per beat: 384
max tick: 46051
tempo changes: 1
time sig: 1
key sig: 0
markers: 0
lyrics: False
instruments: 1

In [193]:
# for now, we will only be using for piano right since it determines the melody
midi.instruments

[Instrument(program=0, is_drum=False, name="")]

In [194]:
# file path to the MIDI files
files_paths = list(glob.glob('archive/EMOPIA_1.0 (1)/EMOPIA_1.0/midis/*.mid'))
# reading labels
labels_df = pd.read_csv('archive/EMOPIA_1.0 (1)/EMOPIA_1.0/label.csv')
labels_df = list(labels_df['4Q'])

In [195]:
import muspy

def return_range(music):
    h = 0
    l = 127
    for track in music.tracks:
        for note in track.notes:
            if note.pitch > h:
                h = note.pitch
            if note.pitch < l:
                l = note.pitch
    return [h, l]

tempos = []
pitches = []

for file in files_paths:
    music = muspy.read_midi(file)
    tempos.append(music.tempos[0].qpm)
    pitches.extend(return_range(music))

In [196]:
print("The unique tempos found in the dataset are:", set(tempos))
print('minimum pitch found', min(pitches))
print('maximum pitch found', max(pitches))

The unique tempos found in the dataset are: {120.0}
minimum pitch found 22
maximum pitch found 105


In [197]:
pitch_range = range(22, 105)
additional_tokens = {'Chord': True, 'Rest': True, 'Tempo': True, 'Program': False,
                     'rest_range': (2, 4),  # (half, 8 beats)
                     'nb_tempos': 32,  # nb of tempo bins
                     'tempo_range': (100, 140),
                     'TimeSignature':None}  # (min, max)

In [198]:
# create a list of notes
# this stores the REMI encoded tokens of the midi files

def load_files(files_paths, encoder = REMI(additional_tokens)):
    assert len(files_paths) > 0
    notes = []


    for file in files_paths:
        # file_name = os.path.basename(file)

        # read the MIDI file
        midi = MidiFile(file)

        # Converts MIDI to tokens
        tokens = encoder.midi_to_tokens(midi)
        
        # The EMOPIA dataset has midi files with only one instrument, i.e. the piano 
        # hence we just add those tokens
        # print(tokens)
        notes.append(tokens[0])

    return notes, encoder

In [199]:
notes, cp_enc = load_files(files_paths, CPWord(pitch_range, additional_tokens = additional_tokens))

In [200]:
print("There are",len(cp_enc.vocab),"unique tokens in the files")

There are 276 unique tokens in the files


In [201]:
# Create a dataset corpus from the notes and labels
class REMICorpus(Dataset):
    def __init__(self, notes, labels, encoder, seq_length):
        self.encoder = encoder
        self.seq_len = seq_length

        # ntrain, ntest, ltrain, ltest = train_test_split(notes, labels, test_size=split_size, random_state=42, shuffle=True, stratify=labels)
 
        self.xtrain, self.ytrain, self.raw_to_enc, self.enc_to_raw = self.tokenize(notes, labels)
        # self.xvalid = self.tokenize(ntest, ltest)
    
    def __len__(self):
        return len(self.encoder.vocab)

    def len_dataset(self):
        return len(self.xtrain)
    
    def __getitem__(self, index):
        return self.xtrain[index], self.ytrain[index]
    
    def tokenize(self, notes, labels):
        assert len(notes) > 0
        assert len(labels) > 0

        # create a set of notes
        # they should all be padded to have sequence of len seq_len
        songss = []
        labelss = []

        for song, label in zip(notes, labels):
            song = torch.tensor(song).type(torch.int64)
            songs = list(song.split(self.seq_len))

            for i in range(len(songs)):
                # removing sequences that have < seq len/4 tokens
                if len(songs[i]) < self.seq_len/4:
                    del songs[i]
                    continue
                labelss.append(label-1)
            songss.extend(songs)
        
        # padding songs to be of same length
        songs = pad_sequence(songss)

        corpus = []

        # adding emotion values to the sequences
        for song, label in zip(songs.view(songs.size(1), songs.size(0), songs.size(2)), labelss):
            l = torch.full((self.seq_len,1), label)
            inp = torch.cat([song, l], dim=-1)
            corpus.append(inp)

        corpus = torch.stack(corpus)

        # creates the range of each type of token
        # for eg. family is [0, 2, 3]
        token_ranges = [corpus[:,:,i].squeeze().unique() for i in range(8)]
        
        # creates a reverse dictionary for each token
        # for eg. family is {0: 0, 2: 1, 3: 2}
        token_dicts = [dict(zip(tokens.tolist(), range(len(tokens)))) for tokens in token_ranges]

        new_corpus = corpus.clone().detach()
        for i in range(len(corpus)):
            for k in range(8):
                new_corpus[i,:,k] = torch.tensor([token_dicts[k][l.item()] for l in corpus[i,:,k]])

        data = new_corpus[:,:self.seq_len - 1, :]
        target = new_corpus[:,1:self.seq_len, :]
            

        # converting all the tokens in each type to new values:
        return data, target, token_ranges, token_dicts

In [202]:
corpus = REMICorpus(notes, labels_df, cp_enc, 21)

In [203]:
train_target = corpus.ytrain
train_data = corpus.xtrain

tokens_to_raw = corpus.raw_to_enc
raw_to_tokens = corpus.enc_to_raw

# train_emo = corpus.ytrain.to(device)
# val_emo = corpus.yvalid.to(device)

print("train data shape:", train_data.shape)
print("train target shape:", train_target.shape)

train data shape: torch.Size([22241, 20, 9])
train target shape: torch.Size([22241, 20, 9])


In [204]:
batch_size = 32
# creating a dataloader
train_dataloader = DataLoader(
    corpus,
    sampler=SequentialSampler(train_data),
    batch_size=batch_size,
)

In [205]:
print("There are total",len(notes), "songs and a total of", train_data.shape[0], "sequences extracted")

There are total 1078 songs and a total of 22241 sequences extracted


In [206]:
print("There are",len(corpus), "unique tokens")

There are 276 unique tokens


In [207]:
ntokens = []
for i in range(9):
    # and the number of tokens per type
    ntokens.append(len(train_data[:,:,i].squeeze().unique()))

In [208]:
print("There are", ntokens[0], "family tokens")
print("There are", ntokens[1], "bar/position tokens")
print("There are", ntokens[2], "pitch tokens")
print("There are", ntokens[3], "velocity tokens")
print("There are", ntokens[4], "duration tokens")
print("There are", ntokens[5], "chord tokens")
print("There are", ntokens[6], "rest tokens")
print("There are", ntokens[7], "tempo tokens")
print("There are", ntokens[8], "emotion tokens")

There are 3 family tokens
There are 35 bar/position tokens
There are 85 pitch tokens
There are 32 velocity tokens
There are 66 duration tokens
There are 16 chord tokens
There are 7 rest tokens
There are 3 tempo tokens
There are 4 emotion tokens


## Model Building

### Constants

In [209]:
# size of the model
emsize = 256

# parameters for the transformers
nhead = 4
nhid = 512
nlayer = 4

# dropout
dropout = 0.4

# learning rates for each
lr_g = 0.0001
lr_d = 0.0001


### Position Encoding

In [210]:
# adapted from the pytorch positional encoding class
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # PE is the Positional Encoding matrix 
        # THIS STORES THE POSITIONS OF THE SEQUENCE
        pe = torch.zeros(max_len, d_model)

        # Arange - RETURNS A RANGE BETWEEN VALUES, HERE IT IS 0 - max_len
        # unsqueeze - adds a dimension, 1 means that each element in the first list is now in a list
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # division term, here it is (10000 ** ((2 * i)/d_model))
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # calculating the position encoding for the even and odd terms        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Unsqueeze 0 will put PE in one list
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # make embeddings relatively larger
        # This is so we do not lose the importance of the embedding
        # we add the embedding to the PE 
        # print(x.shape)
        # print(self.pe.shape)
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)

In [211]:
class Generator(nn.Module):
    """Container module with an encoder, a recurrent or transformer module, and a decoder."""

    def __init__(self, ntoken, d_model, nhead, nlayers, dropout=0.5, max_length = 2048, device = device):
        super(Generator, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')

        # original mask
        self.src_mask = None
        self.max_length = max_length
        self.d_model = d_model
        self.nlayers = nlayers
        self.ntokens = ntoken

        self.device = device

        # NEW criterion and embedding size
        self.criterion = nn.CrossEntropyLoss(reduction='none')
        # There are 3 family tokens
        # There are 35 bar/position tokens
        # There are 85 pitch tokens
        # There are 32 velocity tokens
        # There are 66 duration tokens
        # There are 16 chord tokens
        # There are 7 rest tokens
        # There are 3 tempo tokens
        # There are 4 emotion tokens
        # the embedding sizes are reflectibe of the number of tokens
        self.embed_siz = [32, 128, 512, 128, 256, 64, 64, 32, 512]

        # embedding encoding
        self.embedding_family  = nn.Embedding(self.ntokens[0], self.embed_siz[0])
        self.embedding_bar  = nn.Embedding(self.ntokens[1], self.embed_siz[1])
        self.embedding_pitch  = nn.Embedding(self.ntokens[2], self.embed_siz[2])
        self.embedding_velocity  = nn.Embedding(self.ntokens[3], self.embed_siz[3])
        self.embedding_duration  = nn.Embedding(self.ntokens[4], self.embed_siz[4])
        self.embedding_chord  = nn.Embedding(self.ntokens[5], self.embed_siz[5])
        self.embedding_rest  = nn.Embedding(self.ntokens[6], self.embed_siz[6])
        self.embedding_tempo  = nn.Embedding(self.ntokens[7], self.embed_siz[7])
        self.embedding_emotion   = nn.Embedding(self.ntokens[8], self.embed_siz[8])

        # this to project the concatenated input to a uniform d_model space
        self.in_linear = nn.Linear(np.sum(self.embed_siz), self.d_model)
        
        # positional encoding
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        # encoder
        encoder_layer = TransformerEncoderLayer(d_model = d_model, nhead = nhead, dropout = dropout)
        self.encoder = TransformerEncoder(encoder_layer, nlayers)

        # output layers
        self.project_family = nn.Linear(d_model, ntoken[0])
        self.project_bar = nn.Linear(d_model, ntoken[1])
        self.project_pitch = nn.Linear(d_model, ntoken[2])
        self.project_velocity = nn.Linear(d_model, ntoken[3])
        self.project_duration = nn.Linear(d_model, ntoken[4])
        self.project_chord = nn.Linear(d_model, ntoken[5])
        self.project_rest = nn.Linear(d_model, ntoken[6])
        self.project_tempo = nn.Linear(d_model, ntoken[7])
        self.project_emo = nn.Linear(d_model, ntoken[8])

        # size is the d model plus the type
        self.proj_cat = nn.Linear(d_model + self.embed_siz[0], d_model)
        
        self.init_weights()
            
    def _generate_square_subsequent_mask(self, sz):
        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

    def init_weights(self):
        initrange = 0.1

        nn.init.uniform_(self.embedding_family.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_bar.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_pitch.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_velocity.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_duration.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_chord.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_rest.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_tempo.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_emotion.weight, -initrange, initrange)
     
        self.in_linear.bias.data.zero_()
        self.in_linear.weight.data.uniform_(-initrange, initrange)
        self.project_family.bias.data.zero_()
        self.project_family.weight.data.uniform_(-initrange, initrange)
        self.project_bar.bias.data.zero_()
        self.project_bar.weight.data.uniform_(-initrange, initrange)
        self.project_pitch.bias.data.zero_()
        self.project_pitch.weight.data.uniform_(-initrange, initrange)
        self.project_velocity.bias.data.zero_()
        self.project_velocity.weight.data.uniform_(-initrange, initrange)
        self.project_duration.bias.data.zero_()
        self.project_duration.weight.data.uniform_(-initrange, initrange)
        self.project_chord.bias.data.zero_()
        self.project_chord.weight.data.uniform_(-initrange, initrange)
        self.project_rest.bias.data.zero_()
        self.project_rest.weight.data.uniform_(-initrange, initrange)
        self.project_tempo.bias.data.zero_()
        self.project_tempo.weight.data.uniform_(-initrange, initrange)
        self.project_emo.bias.data.zero_()
        self.project_emo.weight.data.uniform_(-initrange, initrange)

    def forward(self, x_family, x_bar, x_pitch, x_velocity, x_duration, x_chord ,x_rest, x_tempo, x_emo, src_mask):
        # creating embedding for all tokens and emotions
        x_family = self.embedding_family(x_family)
        x_bar = self.embedding_bar(x_bar)
        x_pitch = self.embedding_pitch(x_pitch)
        x_velocity = self.embedding_velocity(x_velocity)
        x_duration = self.embedding_duration(x_duration)
        x_chord = self.embedding_chord(x_chord)
        x_rest = self.embedding_rest(x_rest)
        x_tempo = self.embedding_tempo(x_tempo)
        # print(x_emo.shape)
        # print(x_emo)
        x_emo = self.embedding_emotion(x_emo)

        # normalising the input for the position encoding
        x_family = x_family * math.sqrt(self.d_model)
        x_bar = x_bar * math.sqrt(self.d_model)
        x_pitch = x_pitch * math.sqrt(self.d_model)
        x_velocity = x_velocity * math.sqrt(self.d_model)
        x_duration = x_duration * math.sqrt(self.d_model)
        x_chord = x_chord * math.sqrt(self.d_model)
        x_rest = x_rest * math.sqrt(self.d_model)
        x_tempo = x_tempo * math.sqrt(self.d_model)
        x_emo = x_emo * math.sqrt(self.d_model)

        # concatenating as one input
        x = torch.cat([x_family, x_bar, x_pitch, x_velocity, x_duration, x_chord, x_rest, x_tempo, x_emo], dim=-1)

        # sending through linear layer
        x = self.in_linear(x)

        x = self.pos_encoder(x)

        if src_mask == None:
            src_mask = self._generate_square_subsequent_mask(x.size(1)).to(self.device)
            
        self.src_mask = src_mask

        output = self.encoder(x.view(x.size(1), x.size(0), x.size(2)), self.src_mask)

        # first get the family of the tokens
        y_family = self.project_family(output)

        # getting the y type again from the probabilities
        type_prob = F.softmax(y_family, dim=-1)
        n,s,t = type_prob.shape
        y_type = torch.multinomial(type_prob.view(-1, t), 1, replacement=True).view(n, s)

        # this is usally for target family type, which is the same as source
        tf_skip_family = self.embedding_family(y_type)

        y_concat_family = torch.cat([output, tf_skip_family], dim=-1)

        # creating a concatenated projection
        y_ = self.proj_cat(y_concat_family)

        # projecting for each token
        y_bar = self.project_bar(y_)
        y_pitch = self.project_pitch(y_)
        y_velocity = self.project_velocity(y_)
        y_duration = self.project_duration(y_)
        y_chord = self.project_chord(y_)
        y_rest = self.project_rest(y_)
        y_tempo = self.project_tempo(y_)
        y_emo = self.project_emo(y_)

        outputs = [y_family, y_bar, y_pitch, y_velocity, y_duration, y_chord, y_rest, y_tempo]

        return outputs, y_emo

In [212]:
class Discriminator(nn.Module):
    """
    Discriminator based on a pytorch TransformerEncoder.
    """
    
    def __init__(self, ntokens, d_model, nhead, nhid, nlayers, dropout=0.5, max_length = 2048):
        super(Discriminator, self).__init__()
        
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoderLayer, TransformerDecoder
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')

        self.d_model = d_model

        # default embedding sizes:
        self.embed_siz = [32, 128, 512, 128, 256, 64, 64, 32, 512]
        self.ntokens = ntokens
        # embedding encoding
        # print(self.ntokens[0], self.embed_siz[0])
        self.embedding_family  = nn.Embedding(self.ntokens[0], self.embed_siz[0])
        self.embedding_bar  = nn.Embedding(self.ntokens[1], self.embed_siz[1])
        self.embedding_pitch  = nn.Embedding(self.ntokens[2], self.embed_siz[2])
        self.embedding_velocity  = nn.Embedding(self.ntokens[3], self.embed_siz[3])
        self.embedding_duration  = nn.Embedding(self.ntokens[4], self.embed_siz[4])
        self.embedding_chord  = nn.Embedding(self.ntokens[5], self.embed_siz[5])
        self.embedding_rest  = nn.Embedding(self.ntokens[6], self.embed_siz[6])
        self.embedding_tempo  = nn.Embedding(self.ntokens[7], self.embed_siz[7])
        self.embedding_emotion   = nn.Embedding(self.ntokens[8], self.embed_siz[8])
        
        # linear layer for converting the extra dimension to a linear vector
        self.linear = nn.Linear(np.sum(self.embed_siz), self.d_model)
        
        # encoding positional information using position encoder
        # with default drop out of 0.2
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        # encoding layers
        encoder_layers = TransformerEncoderLayer(d_model, nhead, nhid, dropout)
        self.encoder = TransformerEncoder(encoder_layers, nlayers)

        # out linear
        # self.out_linear = nn.Linear()
        
        # final classification layer
        self.classifier = nn.Linear(d_model, 1)

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.embedding_family.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_bar.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_pitch.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_velocity.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_duration.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_chord.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_rest.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_tempo.weight, -initrange, initrange)
        nn.init.uniform_(self.embedding_emotion.weight, -initrange, initrange)
     
     
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-initrange, initrange)
        self.classifier.bias.data.zero_()
        self.classifier.weight.data.uniform_(0, initrange)
        # self.project_emo.bias.data.zero_()
        # self.project_emo.weight.data.uniform_(-initrange, initrange)
    def forward(self, x_family  = None, x_bar  = None, x_pitch  = None, x_velocity  = None, x_duration = None, x_chord = None ,x_rest = None, x_tempo = None, x_emo = None, embs = None, token = None):
        # creating embedding for all tokens and emotions
        # print(x_family.shape)
        # print(x_family)
        # print()
    
        if token == 0:
            x_family = embs * math.sqrt(self.d_model)
        else:
            
            x_family = self.embedding_family(x_family)
            x_family = x_family * math.sqrt(self.d_model)
        if token == 1:
            x_bar = embs * math.sqrt(self.d_model)
        else:
            x_bar = self.embedding_bar(x_bar)        
            x_bar = x_bar * math.sqrt(self.d_model)
        if token == 2:
            x_pitch = embs * math.sqrt(self.d_model)
        else:
            x_pitch = self.embedding_pitch(x_pitch)
            x_pitch = x_pitch * math.sqrt(self.d_model)
        if token == 3:
            x_velocity = embs * math.sqrt(self.d_model)
        else:
            x_velocity = self.embedding_velocity(x_velocity)
            x_velocity = x_velocity * math.sqrt(self.d_model)
        if token == 4:
            x_duration = embs * math.sqrt(self.d_model)
        else:
            x_duration = self.embedding_duration(x_duration)
            x_duration = x_duration * math.sqrt(self.d_model)
        if token == 5:
            x_chord = embs * math.sqrt(self.d_model)
        else:
            x_chord = self.embedding_chord(x_chord)
            x_chord = x_chord * math.sqrt(self.d_model)
        if token == 6:
            x_rest = embs * math.sqrt(self.d_model)
        else:
            x_rest = self.embedding_rest(x_rest)
            x_rest = x_rest * math.sqrt(self.d_model)
        if token == 7:
            x_tempo = embs * math.sqrt(self.d_model)
        else:
            x_tempo = self.embedding_tempo(x_tempo)
            x_tempo = x_tempo * math.sqrt(self.d_model)
        if token == 8:
            x_emo = embs * math.sqrt(self.d_model)
        else:
            x_emo = self.embedding_emotion(x_emo)
            x_emo = x_emo * math.sqrt(self.d_model)
        

        # concatenating as one input
        x = torch.cat([x_family, x_bar, x_pitch, x_velocity, x_duration, x_chord, x_rest, x_tempo, x_emo], dim=-1)
        # print(x.shape, "disc shape for linear")
        # print(x.shape)
        # sending through linear layer
        x = self.linear(x)

        # encoding positions
        x = self.pos_encoder(x)

        # sending through transformer encoder
        x = self.encoder(x)
        
        # classification
        x = x.mean(dim=1)
        x = self.classifier(x)
        return nn.Sigmoid()(x)
        

In [213]:
dictionary = cp_enc.vocab.token_to_event

In [214]:
dictionary

{0: 'PAD_None',
 1: 'Bar_None',
 2: 'Family_Note',
 3: 'Family_Metric',
 4: 'Pitch_Ignore',
 5: 'Pitch_22',
 6: 'Pitch_23',
 7: 'Pitch_24',
 8: 'Pitch_25',
 9: 'Pitch_26',
 10: 'Pitch_27',
 11: 'Pitch_28',
 12: 'Pitch_29',
 13: 'Pitch_30',
 14: 'Pitch_31',
 15: 'Pitch_32',
 16: 'Pitch_33',
 17: 'Pitch_34',
 18: 'Pitch_35',
 19: 'Pitch_36',
 20: 'Pitch_37',
 21: 'Pitch_38',
 22: 'Pitch_39',
 23: 'Pitch_40',
 24: 'Pitch_41',
 25: 'Pitch_42',
 26: 'Pitch_43',
 27: 'Pitch_44',
 28: 'Pitch_45',
 29: 'Pitch_46',
 30: 'Pitch_47',
 31: 'Pitch_48',
 32: 'Pitch_49',
 33: 'Pitch_50',
 34: 'Pitch_51',
 35: 'Pitch_52',
 36: 'Pitch_53',
 37: 'Pitch_54',
 38: 'Pitch_55',
 39: 'Pitch_56',
 40: 'Pitch_57',
 41: 'Pitch_58',
 42: 'Pitch_59',
 43: 'Pitch_60',
 44: 'Pitch_61',
 45: 'Pitch_62',
 46: 'Pitch_63',
 47: 'Pitch_64',
 48: 'Pitch_65',
 49: 'Pitch_66',
 50: 'Pitch_67',
 51: 'Pitch_68',
 52: 'Pitch_69',
 53: 'Pitch_70',
 54: 'Pitch_71',
 55: 'Pitch_72',
 56: 'Pitch_73',
 57: 'Pitch_74',
 58: 'Pitch_

In [215]:
class MidiTransGAN(nn.Module):
    def __init__(self, generator, discriminator, noise_fn,
                 batch_size=2, device='cuda', lr_d=lr_d, lr_g=lr_g):
        """A GAN class for holding and training a generator and discriminator
        Args:
            generator: a Ganerator network
            discriminator: A Discriminator network
            noise_fn: function f(num: int) -> pytorch tensor, (latent vectors)
            data_fn: function f(num: int) -> pytorch tensor, (real samples)
            batch_size: training batch size
            device: cpu or CUDA
            lr_d: learning rate for the discriminator
            lr_g: learning rate for the generator
        """
        super(MidiTransGAN, self).__init__()
        self.generator = generator.to(device)
        # self.generator = self.generator.to(device)
        self.discriminator = discriminator.to(device)
        # self.discriminator = self.discriminator.to(device)
        self.noise_fn = noise_fn
        self.batch_size = batch_size
        self.device = device
        self.criterion = nn.CrossEntropyLoss(reduction='mean')
        self.optim_d = torch.optim.Adam(discriminator.parameters(), lr=lr_d)
        self.optim_g = torch.optim.Adam(generator.parameters(), lr=lr_g)
        self.seq_len = 100
        # self.src_mask = self.src_mask = torch.triu(torch.ones(511, 511) * float('-inf'), diagonal=1).to(device)

    def compute_accuracy(self, predicted_weights, target):
        predicted = predicted_weights.argmax(dim=1)
        return torch.sum(predicted == target) / len(target)

    def calc_gradient_penalty(self, real, fake, LAMBDA=0.02):

        embedding = [self.discriminator.embedding_family.weight, self.discriminator.embedding_bar.weight, self.discriminator.embedding_pitch.weight, self.discriminator.embedding_velocity.weight, self.discriminator.embedding_duration.weight, self.discriminator.embedding_chord.weight, self.discriminator.embedding_rest.weight, self.discriminator.embedding_tempo.weight, self.discriminator.embedding_emotion.weight]
        
        penalties = []
        for i in range(9):
            temp = torch.rand([real.shape[0], 1]).to(device)

            # interpolation
            mid = temp * real[:,:,i] + ((1 - temp) * fake[:,:,i])
            mid = mid.type(torch.LongTensor)
            mid = mid.type(torch.FloatTensor).to(device)
            mid = torch.autograd.Variable(mid, requires_grad=True)    
            mid = torch.einsum(
                "ve,bn -> bne",
                embedding[i],
                mid,
            )
        
            # print(mid.type(torch.LongTensor))
            classification = self.discriminator(real[:,:,0].to(device), real[:,:,1].to(device), real[:,:,2].to(device), real[:,:,3].to(device), real[:,:,4].to(device), real[:,:,5].to(device), real[:,:,6].to(device), real[:,:,7].to(device), real[:,:,8].to(device), embs = mid, token = i)
            

            gradients = torch.autograd.grad(outputs=classification, inputs=mid,
                                            grad_outputs=torch.ones(classification.size(), device=device),
                                            create_graph=True, retain_graph=True, allow_unused = True)[0]
            # print(gradients)
            gradients = gradients.view(real.shape[0], -1)

            # https://github.com/igul222/improved_wgan_training/blob/master/gan_language.py
            slopes = torch.sqrt(torch.sum(gradients ** 2, dim=1) + 1e-12)
            gradient_penalty = ((slopes - 1.) ** 2).mean() 

            penalties.append(gradient_penalty)

        # just adding can explode gradients
        return torch.mean(torch.Tensor(penalties)) * LAMBDA

    def weighted_sampling(self, probs):
        probs /= sum(probs)
        sorted_probs = np.sort(probs)[::-1]
        sorted_index = np.argsort(probs)[::-1]
        word = np.random.choice(sorted_index, size=1, p=sorted_probs)[0]
        return word

    def sampling(self, logits, p=None, t=1.0):
        logits = logits[-1].squeeze().cpu().numpy()
        probs = np.exp(logits / t) / np.sum(np.exp(logits / t))
        # print(probs)
        cur_word = self.weighted_sampling(probs)
        return cur_word

    # Cross Entropy loss with label smoothing
    def label_smoothing_loss(self, x, target, smoothing = 0.5):
        confidence = 1 - smoothing
        logprobs = F.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = confidence * nll_loss + smoothing * smooth_loss
        return loss.mean()

    def wgan_loss(self, x):
        return torch.mean(x)

    def generate_samples(self, latent_vec=None, emotion=None, num=None, src_mask = None, display = False):
        """Sample from the generator.
        Args:
            latent_vec: A pytorch latent vector or None
            num: The number of samples to generate if latent_vec is None
        If latent_vec and num are None then use self.batch_size random latent
        vectors.
        """
        num = self.batch_size if num is None else num
        latent_vec = self.noise_fn(self.seq_len,1, emotion) if latent_vec is None else latent_vec

        # 2 for note and 3 for metric

        if emotion == None:
            emotion = latent_vec[:,:,8][0][0]
        
        if src_mask == None:
            src_mask = generate_square_subsequent_mask((latent_vec.size(0))).to(device)
        # print(src_mask.shape)

        # since we are not training
        # we fix no gradients
        with torch.no_grad():
            # learning for 3 sequences at a time
            # this is purely due to resource constraints
            count = 0
            while(count <= num):

                # generating fake samples
                fake_samples, _ = generator(latent_vec[:,:,0], latent_vec[:,:,1], latent_vec[:,:,2], latent_vec[:,:,3], latent_vec[:,:,4], latent_vec[:,:,5], latent_vec[:,:,6], latent_vec[:,:,7], latent_vec[:,:,8], src_mask = None)
                
                cur_family =    self.sampling(fake_samples[0], t=1)
                cur_bar =  self.sampling(fake_samples[1], t=1)
                cur_pitch =    self.sampling(fake_samples[2], t=1)
                cur_velocity =    self.sampling(fake_samples[3], t=2)
                cur_duration = self.sampling(fake_samples[4], t=2)
                cur_chord =    self.sampling(fake_samples[5], t=2)
                cur_rest =    self.sampling(fake_samples[6], t=1)
                cur_tempo = self.sampling(fake_samples[7], t=1)

                # getting the original token IDs
                cur_family_corrected = tokens_to_raw[0][cur_family.item()].item()
                cur_bar_corrected = tokens_to_raw[1][cur_bar.item()].item()
                cur_pitch_corrected = tokens_to_raw[2][cur_pitch.item()].item()
                cur_velocity_corrected = tokens_to_raw[3][cur_velocity.item()].item()
                cur_duration_corrected = tokens_to_raw[4][cur_duration.item()].item()
                cur_chord_corrected = tokens_to_raw[5][cur_chord.item()].item()
                cur_rest_corrected = tokens_to_raw[6][cur_rest.item()].item()
                cur_tempo_corrected = tokens_to_raw[7][cur_tempo.item()].item()

                good_token = False
                # if it is a note family
                if cur_family_corrected == 2:
                    # if this does not contain any ignores
                    # The ignore tokens are as follows:
                    # 4: Pitch Ignore
                    # 93: Veloctiy Ignore
                    # 126: Duration Ignore 
                    # 191: Position Ignore
                    
                    # if it does not contain any ignores, it is a perfect prediction, hence we can
                    # use this as the last note
                    together = [cur_pitch_corrected, cur_velocity_corrected, cur_duration_corrected]
                    if not (set([0,4,88,121]) & set(together)):
                        count += 1
                        good_token = True
                    
                elif cur_family_corrected == 3:
                    # if this does not contain any ignores
                    # The ignore tokens are as follows:
                    # 224: Chord Ignore
                    # 242: Rest Ignore
                    # 248: Tempo Ignore 

                    together = [cur_chord_corrected, cur_rest_corrected, cur_tempo_corrected, cur_bar_corrected]
                    if not (set([0, 243]) & set(together)):
                        # next_tokens = torch.LongTensor([cur_family, cur_bar, cur_pitch, cur_velocity, cur_duration, cur_chord, cur_rest, cur_tempo, emotion])
                        count += 1
                        good_token = True

                # next_tokens = torch.LongTensor([cur_family, cur_bar, cur_pitch, cur_velocity, cur_duration, cur_chord, cur_rest, cur_tempo, emotion])
                if good_token:
                    next_tokens = torch.LongTensor([cur_family, cur_bar, cur_pitch, cur_velocity, cur_duration, cur_chord, cur_rest, cur_tempo, emotion])
                    
                    if(display):
                        print('| ', dictionary[cur_family_corrected], dictionary[cur_bar_corrected], dictionary[cur_pitch_corrected], dictionary[cur_velocity_corrected], dictionary[cur_duration_corrected], dictionary[cur_chord_corrected], dictionary[cur_rest_corrected], dictionary[cur_tempo_corrected])
        
                    latent_vec = torch.cat([latent_vec, next_tokens.view(1,1,next_tokens.size(0)).to(device)], dim=1)
            
        return latent_vec

    def train_step_generator(self, real_samples, real_target):
        """Train the generator one step and return the loss."""
        self.generator.zero_grad()
        self.optim_g.zero_grad()

        # latent_vec = self.noise_fn(10,self.batch_size)
        # latent_vec = latent_vec.to(device)
        # real_samples, real_target = self.data_fn(train_data, self.batch_size)
        emotions = real_samples[:,:,8].T[0]
        real_samples = real_samples.to(device)
        # emotion = self.emotions[:,:self.batch_size].to(device)
        
        # generated samples
        # starting with a sequence of length 4
        latent_vec = self.noise_fn(20,real_samples.size(0), emotions)
        target = latent_vec[:,:,8].T[0]
        loss_emotions = 0
        acc_emotions = 0
        nll_loss = 0
        nll_loss_emotion = 0.0
        # since we are not traning generator
        # we fix no gradients

        # learning for 10 length sequences at a time
        # this is purely due to resource constraints
        for i in range(20):

            # generating fake samples
            fake_samples, out_emo = generator(latent_vec[:,:,0], latent_vec[:,:,1], latent_vec[:,:,2], latent_vec[:,:,3], latent_vec[:,:,4], latent_vec[:,:,5], latent_vec[:,:,6], latent_vec[:,:,7], latent_vec[:,:,8], src_mask = None)
            

            # getting the weights and converting them to notes
            emo_weights = out_emo.mean(dim=0)
            loss_emotion = self.criterion(emo_weights, target)
            acc_emotion = self.compute_accuracy(emo_weights, target)
            loss_emotions += loss_emotion
            acc_emotions += acc_emotion

            word_tensor = []
            for k, output in enumerate(fake_samples):

                if i == 0:
                    # print(output.shape)
                    # print(real_target.shape)
                    nll_loss += self.criterion(output.view(output.size(1), output.size(2), output.size(0)).cpu(), real_target[:,:,k].cpu())
                    
                # For Notes:
                # getting the weights and converting them to notes
                # print(output.shape)
                output = F.log_softmax(output, dim=-1)
                word_weights = output[-1].squeeze().exp().cpu()
                
            # for Emotions:
                # getting the values from the distribution from 218 (num of possible notes)
                word = torch.multinomial(word_weights, 1)
                # batch size * 1 -> 1 * batch_size
                # word_notes = word.view(1, word.size(0))
                # print(word_notes.shape)
                word_notes = word.view(word.size(0), 1)

                word_tensor.append(word_notes.to(device))
                # = torch.stack([word_notes, word_tensor], dim=-1)

            if i == 0:
                nll_loss_emotion = self.criterion(out_emo.view(out_emo.size(1), out_emo.size(2), out_emo.size(0)).cpu(), real_target[:,:,8].cpu())
                nll_loss += nll_loss_emotion
            # emotions = torch.full((word_notes.size(0),1), emotion)
            word_tensor.append(target.view(target.size(0), 1).to(device))

            # stack the emotions to the final shape: seq_len * batch_size * 2 (1 for emotion and 1 for notes)
            # here seq_len = 1
            # word_tensor.append(emotions)
            word_tensor = torch.stack(word_tensor, dim=-1)

            latent_vec = torch.cat([latent_vec[:,1:,:], word_tensor.to(device)], dim=1)
        
        
        classifications = self.discriminator(latent_vec[:,:,0].to(device), latent_vec[:,:,1].to(device), latent_vec[:,:,2].to(device), latent_vec[:,:,3].to(device), latent_vec[:,:,4].to(device), latent_vec[:,:,5].to(device), latent_vec[:,:,6].to(device), latent_vec[:,:,7].to(device), latent_vec[:,:,8].to(device))
        # nll_loss_emotion += self.criterion(out_emo.view(out_emo.size(1), out_emo.size(2), out_emo.size(0)).cpu(), real_target[:,:,8].cpu())
        # loss for generator
        # loss_gen = - self.criterion(classifications, torch.zeros((classifications.size(0)), dtype=torch.int64).to(device))
        loss_gen = -torch.mean(classifications, dim=1)
        loss_gen = torch.mean(classifications)# batch size
        # print(out_emo)
        # loss for emotions
        loss_emotions = loss_emotions / 20
        acc_emotions = acc_emotions / 20
        nll_loss = nll_loss / 9
        # loss_emo = self.criterion(out_emo, torch.full((out_emo.size(0), out_emo.size(1)), emotion.item()))
        # loss = (loss_gen + loss_emotions) / 2
        loss = loss_gen
        # print(nll_loss)
        # print(loss_gen)
        # loss_gen.retain_grad()
        loss.backward()
        # loss_emotions.backward(retain_graph=True)
        nll_loss.backward()
        nn.utils.clip_grad_norm_(generator.parameters(), 3)
        self.optim_g.step()
        return loss.item(), acc_emotions, nll_loss.item(), nll_loss_emotion.item()

    def train_step_discriminator(self, real_samples, real_target):
        """Train the discriminator one step and return the losses."""
        self.discriminator.zero_grad()
        self.optim_d.zero_grad()

        # getting real samples
        # this is using the data_fn or the get batch function
        # here, the data is the sequence with shape batch_size * seq_len * num of tokens
        # in general that is 32 * 100 * 2
        # this batch is randomly sampled from the corpus
        # the target sequence is the same shape, and is the next step in the sequence
        # loss_real = 0.0
        # for i in range(10):
        # real_samples, real_target = self.data_fn(train_data, self.batch_size)
        emotions = real_samples[:,:,8].T[0]
        # print(real_samples.shape)
        # print(emotions)
        real_samples = real_samples.to(device)
        real_target = real_target.to(device)
        # print(real_samples[:,:,0])

        # the discrimiator
        # [:,:,0] -> notes
        # [:,:,1] -> emotion
        pred_real = self.discriminator(real_samples[:,:,0].to(device), real_samples[:,:,1].to(device), real_samples[:,:,2].to(device), real_samples[:,:,3].to(device), real_samples[:,:,4].to(device), real_samples[:,:,5].to(device), real_samples[:,:,6].to(device), real_samples[:,:,7].to(device), real_samples[:,:,8].to(device))
        
        loss_real = -torch.mean(pred_real)
        # loss_real = torch.mean(loss_real)


        loss_real.backward()
        # loss_real = self.criterion(pred_real, torch.ones(pred_real.size(0), dtype=torch.int64).to(device))

        # generated samples
        # starting with a sequence of length 4
        latent_vec = self.noise_fn(real_samples.size(1),real_samples.size(0), emotions)
        target = latent_vec[:,:,8].T[0]
        # emotion = latent_vec[:,:,8][0][0]
        loss_emotions = 0.0

        acc_emotions = 0

        predicted_vector = None
    
        # since we are not traning generator
        # we fix no gradients
        with torch.no_grad():
            # learning for 3 sequences at a time
            # this is purely due to resource constraints
            for i in range(10):

                # generating fake samples
                # print(latent_vec[:,:,8])
                fake_samples, out_emo = generator(latent_vec[:,:,0], latent_vec[:,:,1], latent_vec[:,:,2], latent_vec[:,:,3], latent_vec[:,:,4], latent_vec[:,:,5], latent_vec[:,:,6], latent_vec[:,:,7], latent_vec[:,:,8], src_mask = None)
                
                
                # for Emotions:
                # getting the weights and converting them to notes
                emo_weights = out_emo.mean(dim=0)
                loss_emotion = self.criterion(out_emo.view(out_emo.size(1), out_emo.size(2), out_emo.size(0)).cpu(), real_target[:,:,8].cpu())
                acc_emotion = self.compute_accuracy(emo_weights, target)
                loss_emotions += loss_emotion
                acc_emotions += acc_emotion

                word_tensor = []
                for k, output in enumerate(fake_samples):
                    # For Notes:
                    # getting the weights and converting them to notes
                    # print(output[-1].shape)
                    
                    output = F.log_softmax(output, dim=-1)
                    word_weights = output[-1].squeeze().exp().cpu()
                    # getting the values from the distribution from 218 (num of possible notes)
                    
                    word = torch.multinomial(word_weights, 1)
                    # batch size * 1 -> 1 * batch_size
                    # word_notes = word.view(1, word.size(0))
                    # print(word_notes.shape)
                    word_notes = word.view(word.size(0), 1)

                    word_tensor.append(word_notes.to(device))
                    # = torch.stack([word_notes, word_tensor], dim=-1)

                # emotions = torch.full((word_notes.size(0),1), emotion)
                # emotions.repeat(seq_len, 1).T.to(device)

                # stack the emotions to the final shape: seq_len * batch_size * 2 (1 for emotion and 1 for notes)
                # here seq_len = 1
                word_tensor.append(target.view(target.size(0), 1).to(device))
                word_tensor = torch.stack(word_tensor, dim=-1)                    
                    
                
                
                # concatenate vector to a fix length of seq len (here it is set as 4)
                # shape -> seq_len * batch_size * 9
                # IMP: [;,1;,:] to keep input size fixed
                latent_vec = torch.cat([latent_vec[:,1:,:], word_tensor.to(device)], dim=1)

                if i == 0:
                    notes_check = latent_vec
        # nll_loss_emotion = nll_loss_emotion / 9
        # predict on the fake samples
        # nll_loss_emotion = self.criterion(emotion_check.view(emotion_check.size(1), emotion_check.size(2), emotion_check.size(0)), real_target[:,:,8])
        pred_fake = self.discriminator(latent_vec[:,:,0].to(device), latent_vec[:,:,1].to(device), latent_vec[:,:,2].to(device), latent_vec[:,:,3].to(device), latent_vec[:,:,4].to(device), latent_vec[:,:,5].to(device), latent_vec[:,:,6].to(device), latent_vec[:,:,7].to(device), latent_vec[:,:,8].to(device))
        # loss on fake
        # loss_fake = self.criterion(pred_fake, torch.zeros((pred_fake.size(0)), dtype=torch.int64).to(device))
        # loss on emotions
        loss_emotions = loss_emotions / 10
        acc_emotions = acc_emotions / 10
        # loss_emo = criterion(out_emo.cpu(), emotion.T[:,:5].cpu())
        gp = self.calc_gradient_penalty(real_samples.to(device), notes_check.detach().to(device))

        # loss_fake = torch.mean(pred_fake, dim=1)
        # loss_fake = torch.mean(loss_fake)
        loss_fake = torch.mean(pred_fake) + gp

        loss_fake.backward()
        
        # combine
        # print(loss_fake , loss_real , gp)
        loss = loss_fake + loss_real
        # loss.backward()
        # nll_loss_emotion.retain_grad()
        # nll_loss_emotion.backward()
        # print(out_emo)
        # print(emotion[:out_emo.size(0)])
        # loss_real.backward()
        # loss_fake.backward()
        # loss_emotions.backward()
        # to avoid explosion
        # nn.utils.clip_grad_norm_(discriminator.parameters(), 3)
        self.optim_d.step()
        return loss_real.item(), loss_fake.item(), acc_emotions, loss.item()

    def train_step(self, real_samples, real_target):
        """Train both networks and return the losses."""
        loss_d = self.train_step_discriminator(real_samples, real_target)
        loss_g = self.train_step_generator(real_samples, real_target)
        # loss_g = self.train_step_generator(real_samples, real_target)
        # loss_d = self.train_step_discriminator(real_samples, real_target)
        
        return loss_g, loss_d

In [216]:
# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.
def get_batch(source, batch_size):
    rand_columns = torch.randperm(source.size(0))[:batch_size]
    # batch_size = min(batch_size, len(source) - 1 - i)
    data = source[rand_columns,:source.size(1)-1, :]
    target = source[rand_columns,1:source.size(1), :]
    return data, target

In [217]:
def generate_square_subsequent_mask(sz):
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [218]:
ntokens

[3, 35, 85, 32, 66, 16, 7, 3, 4]

In [219]:
def noise_fn(seq_len, batch_size, emotions=None):
    notes = []
    for token in ntokens[:-1]:
        # print(token)
        notes.append(torch.randint(token, (batch_size, seq_len), dtype=torch.long).to(device))
    
    if emotions != None:
        emotions = emotions.repeat(seq_len, 1).T.to(device)
    else:
        emotion = torch.randint(0,4, (1,), dtype=torch.long)
        emotions = torch.full((batch_size, seq_len), emotion.item()).to(device)
    
    notes.append(emotions)
    return torch.stack(notes, dim=-1)
    

In [220]:
generator = Generator(ntokens, emsize, nhead, nlayer, dropout)
discriminator = Discriminator(ntokens, emsize, nhead, nhid, nlayer, dropout=0.5)

gan = MidiTransGAN(generator, discriminator, noise_fn, batch_size=batch_size, device=device)

In [221]:
def network_paras(model):
    # compute only trainable params
    param = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in param])
    return params

In [222]:
print("There are",network_paras(generator),"parameters in generator")
print("There are",network_paras(discriminator),"parameters in discriminator")

There are 5914107 parameters in generator
There are 2624001 parameters in discriminator


In [223]:
network_paras(gan) - 5876973 # parameters in EMOPIA transformer with similar hyper parameters

2661135

## Training

In [224]:
import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [225]:
len(train_dataloader)

696

## IMP: Please make sure the training script is running

In [226]:
from time import time

gan.train()

def train():
    epochs = 120
    batches = len(train_dataloader)
    
    loss_gs, acc_gs, loss_d_reals, loss_d_fakes, acc_ds, nll_losses, emo_losses, loss_ds = [], [], [], [], [], [], [], []
    start = time()
    
    for epoch in range(epochs):
        total_loss_g, total_acc_g, total_loss_d_real, total_loss_d_fake, total_acc_d, total_nll, total_nll_emo, total_loss_d = 0, 0, 0, 0, 0, 0, 0, 0

        for bidx, (xtrain, ytrain)  in enumerate(train_dataloader): 
            xtrain.to(device)
            ytrain.to(device)

            # print(xtrain.shape)

            (loss_g, accuracy_g, nll_loss, nll_loss_emo), (loss_d_real, loss_d_fake, accuracy_d, loss_d) = gan.train_step(xtrain, ytrain)
            
            total_loss_g += loss_g
            total_loss_d_real += loss_d_real
            total_loss_d_fake += loss_d_fake
            total_acc_g += accuracy_g
            total_acc_d += accuracy_d
            total_nll += nll_loss
            total_nll_emo += nll_loss_emo
            total_loss_d += loss_d

        loss_gs.append(total_loss_g / batches)
        loss_d_reals.append(total_loss_d_real / batches)
        loss_d_fakes.append(total_loss_d_fake / batches)
        acc_gs.append(total_acc_g / batches)
        acc_ds.append(total_acc_d / batches)
        nll_losses.append(total_nll / batches)
        emo_losses.append(total_nll_emo / batches)
        loss_ds.append(total_loss_d / batches)

        writer.add_scalar("Generator Loss", loss_gs[-1], epoch)
        writer.add_scalar("Discriminator Loss (Real)", loss_d_reals[-1], epoch)
        writer.add_scalar("Discriminator Loss (Fake)", loss_d_fakes[-1], epoch)
        writer.add_scalar("Generator Accuracy", acc_gs[-1], epoch)
        writer.add_scalar("Discriminator Accuracy", acc_ds[-1], epoch)
        writer.add_scalar("NLL", nll_losses[-1], epoch)
        writer.add_scalar("NLL (Emo)", emo_losses[-1], epoch)
        writer.add_scalar("Discriminator Loss", loss_ds[-1], epoch)
        
        print(f"Epoch {epoch+1}/{epochs} ({int(time() - start)}s):"
              f" Gen Loss: {loss_gs[-1]:.3f},"
              f" Dis Loss (Real): {loss_d_reals[-1]:.3f},"
              f" Dis Loss (Fake): {loss_d_fakes[-1]:.3f}",
              f" Gen Accuracy: {acc_gs[-1]:.3f}",
              f" Dis Accuracy: {acc_ds[-1]:.3f}",
              f" NLL: {nll_losses[-1]:.3f}",
              f" NLL (Emo): {emo_losses[-1]:.3f}",
              f" Dis Loss: {loss_ds[-1]:.3f}")
train()

Epoch 1/120 (343s): Gen Loss: 0.007, Dis Loss (Real): -0.992, Dis Loss (Fake): 0.027  Gen Accuracy: 0.320  Dis Accuracy: 0.319  NLL: 1.905  NLL (Emo): 0.691  Dis Loss: -0.965
Epoch 2/120 (687s): Gen Loss: 0.000, Dis Loss (Real): -1.000, Dis Loss (Fake): 0.095  Gen Accuracy: 0.395  Dis Accuracy: 0.394  NLL: 1.546  NLL (Emo): 0.395  Dis Loss: -0.905
Epoch 3/120 (1019s): Gen Loss: 0.000, Dis Loss (Real): -1.000, Dis Loss (Fake): 0.021  Gen Accuracy: 0.345  Dis Accuracy: 0.344  NLL: 1.487  NLL (Emo): 0.382  Dis Loss: -0.979
Epoch 4/120 (1347s): Gen Loss: 0.000, Dis Loss (Real): -1.000, Dis Loss (Fake): 0.020  Gen Accuracy: 0.259  Dis Accuracy: 0.259  NLL: 1.469  NLL (Emo): 0.389  Dis Loss: -0.980
Epoch 5/120 (1681s): Gen Loss: 0.000, Dis Loss (Real): -1.000, Dis Loss (Fake): 0.020  Gen Accuracy: 0.303  Dis Accuracy: 0.303  NLL: 1.455  NLL (Emo): 0.358  Dis Loss: -0.980
Epoch 6/120 (2019s): Gen Loss: 0.000, Dis Loss (Real): -1.000, Dis Loss (Fake): 0.020  Gen Accuracy: 0.287  Dis Accuracy: 

KeyboardInterrupt: 

In [None]:
# Print model's state_dict
print("Model's state_dict:")
for param_tensor in gan.state_dict():
    print(param_tensor, "\t", gan.state_dict()[param_tensor].size())

Model's state_dict:
generator.embedding_family.weight 	 torch.Size([3, 32])
generator.embedding_bar.weight 	 torch.Size([35, 128])
generator.embedding_pitch.weight 	 torch.Size([85, 512])
generator.embedding_velocity.weight 	 torch.Size([32, 128])
generator.embedding_duration.weight 	 torch.Size([66, 256])
generator.embedding_chord.weight 	 torch.Size([16, 64])
generator.embedding_rest.weight 	 torch.Size([7, 64])
generator.embedding_tempo.weight 	 torch.Size([3, 32])
generator.embedding_emotion.weight 	 torch.Size([4, 512])
generator.in_linear.weight 	 torch.Size([256, 1728])
generator.in_linear.bias 	 torch.Size([256])
generator.pos_encoder.pe 	 torch.Size([1, 5000, 256])
generator.encoder.layers.0.self_attn.in_proj_weight 	 torch.Size([768, 256])
generator.encoder.layers.0.self_attn.in_proj_bias 	 torch.Size([768])
generator.encoder.layers.0.self_attn.out_proj.weight 	 torch.Size([256, 256])
generator.encoder.layers.0.self_attn.out_proj.bias 	 torch.Size([256])
generator.encoder.lay

In [None]:
torch.save(gan.state_dict(), './models/cp_trans_gan_wgan_gp_final_1.pt')

## Generate

In [60]:
gan = MidiTransGAN(generator, discriminator, noise_fn, get_batch, device=device)
gan.load_state_dict(torch.load('./models/cp_trans_gan_wgan_gp_final_1.pt'))
gan.eval()

MidiTransGAN(
  (generator): Generator(
    (criterion): CrossEntropyLoss()
    (embedding_family): Embedding(3, 32)
    (embedding_bar): Embedding(35, 128)
    (embedding_pitch): Embedding(85, 512)
    (embedding_velocity): Embedding(32, 128)
    (embedding_duration): Embedding(66, 256)
    (embedding_chord): Embedding(16, 64)
    (embedding_rest): Embedding(7, 64)
    (embedding_tempo): Embedding(3, 32)
    (embedding_emotion): Embedding(4, 512)
    (in_linear): Linear(in_features=1728, out_features=256, bias=True)
    (pos_encoder): PositionalEncoding(
      (dropout): Dropout(p=0.4, inplace=False)
    )
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=2048, bias=True)
          (dropout): Dropout(p=0.4, inplace=False)
          (linear

In [61]:
# tensorboard
# https://pytorch.org/tutorials/recipes/recipes/tensorboard_with_pytorch.html?msclkid=ce0b97e5b41911ec9d2e71bb3c7d0f90

In [227]:
# !pip install muspy
import muspy

In [228]:
# TODO: fix the generate sample function to handle batch size = 1
sequences = []

for k in range(3):
    for emo in range(0,4):
        n_generate = 200
        temperature = 1
        log_interval = 4000 # interval between logs

        notes = []
        for token in ntokens[:-1]:
            # print(token)
            notes.append(torch.randint(token, (1, 2), dtype=torch.long).to(device))
        

        emotions = torch.full((1, 2), emo).to(device)
        
        notes.append(emotions)

        # stacked input
        inputs = torch.stack(notes, dim=-1)
        print(len(inputs))
            
        src_mask = generate_square_subsequent_mask(len(inputs)).to(device)

        output = gan.generate_samples(latent_vec=inputs, emotion=emo, num=n_generate, src_mask=None, display=False)

        for i in range(len(output)):
            # output[i,:,0] = torch.tensor([tokens_to_raw[0][l] for l in output[i,:,0]])
            for k in range(0,8):
                output[i,:,k] = torch.tensor([tokens_to_raw[k][l] for l in output[i,:,k]])
        # print(output)
        # if i % log_interval == 0:
        print('| Generated {} notes'.format(n_generate))
        sequences.append([output[:,2:,:-1].squeeze().cpu().tolist()])

1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes
1
| Generated 200 notes


In [233]:
q1 = [sequences[0], sequences[4], sequences[8]]
q2 = [sequences[1], sequences[5], sequences[9]]
q3 = [sequences[2], sequences[6], sequences[10]]
q4 = [sequences[3], sequences[7], sequences[11]]
collected = [q1, q2, q3, q4]

In [234]:
date = '16_04_'
pitch_ranges = []
n_pitches = []
polyphonies = []
empty_beat_rates = []

for k, sequence in enumerate(collected):
    
    i = 0
    for seq in (sequence):
        i = i + 1
        # TODO: remove this
        # seq = seq[0]

        converted_back_midi = cp_enc.tokens_to_midi(seq, get_midi_programs(midi))
        file_name = 'cp_transgan_wgan_gp_final_change_init_' + date  + str(k) + '_' + str(i) + '.mid'
        converted_back_midi.dump(file_name)
        music = muspy.read_midi(file_name)

        # music = muspy.read_midi(file_name)
        pitch_ranges.append(muspy.pitch_range(music))
        n_pitches.append(muspy.n_pitch_classes_used(music))
        polyphonies.append(muspy.polyphony(music)) # average number of pitches being played concurrently.
        empty_beat_rates.append(muspy.empty_beat_rate(music))

cp_transgan_wgan_gp_final_change_init_16_04_0_1.mid
cp_transgan_wgan_gp_final_change_init_16_04_0_2.mid
cp_transgan_wgan_gp_final_change_init_16_04_0_3.mid
cp_transgan_wgan_gp_final_change_init_16_04_1_1.mid
cp_transgan_wgan_gp_final_change_init_16_04_1_2.mid
cp_transgan_wgan_gp_final_change_init_16_04_1_3.mid
cp_transgan_wgan_gp_final_change_init_16_04_2_1.mid
cp_transgan_wgan_gp_final_change_init_16_04_2_2.mid
cp_transgan_wgan_gp_final_change_init_16_04_2_3.mid
cp_transgan_wgan_gp_final_change_init_16_04_3_1.mid
cp_transgan_wgan_gp_final_change_init_16_04_3_2.mid
cp_transgan_wgan_gp_final_change_init_16_04_3_3.mid


In [76]:
results_transgan = {'Pitch_range': pitch_ranges, 'Num_pitches': n_pitches, 'Polyphony': polyphonies, 'Empty_beat_rates': empty_beat_rates}
results_df = pd.DataFrame(results_transgan)
results_df.to_csv('cp_transgan_final.csv')

In [92]:
converted_back_midi

ticks per beat: 384
max tick: 0
tempo changes: 1
time sig: 0
key sig: 0
markers: 0
lyrics: False
instruments: 1

## Metrics

### BLEU Score

In [None]:
# smoothing_function=SmoothingFunction().method1

In [67]:
train_check = train_data[:,:,0]
train_check.shape

torch.Size([22241, 20])

In [70]:
gen_check = []
for sequence in sequences:
    # print(sequence[0])
    for i in range(0, len(sequence[0])-20, 20):
        gen_check.append(sequence[0][i:i+20])

In [71]:
torch.Tensor(gen_check).shape

torch.Size([0])

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# score = corpus_bleu([train_check], [torch.Tensor(gen_check)])


0.0


### MusPy metrics

In [77]:
results_df.describe()

Unnamed: 0,Pitch_range,Num_pitches,Polyphony,Empty_beat_rates
count,9.0,9.0,9.0,9.0
mean,61.111111,4.777778,3.186148,0.0
std,16.2207,0.971825,0.495314,0.0
min,38.0,3.0,2.6,0.0
25%,46.0,4.0,2.828571,0.0
50%,63.0,5.0,3.210526,0.0
75%,74.0,5.0,3.4,0.0
max,79.0,6.0,3.978261,0.0
