In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import re

from torch.utils.tensorboard import SummaryWriter
import datetime as dt

### data

In [2]:
# This dataframe is the result of the scraping procedure; its `abc` column holds the
# tokenized tunes (tokens separated by single spaces), `length` their token count.
# NOTE(review): pickle files execute code on load -- only read files you produced yourself.
slp_df = pd.read_pickle("./dataset/slangpolska_df_C.pickle")
# Flag tunes whose body contains a key token ("K:..." or inline "[K:...]").
# The pattern is a raw string so that `\[` and `\w` reach the regex engine untouched
# (non-raw invalid escapes trigger a warning on recent Python versions).
slp_df['changekey'] = slp_df['abc'].apply(lambda x: len(re.findall(r'\[?K:\w+\]?',x)) >= 1 )
slp_df = slp_df[
    (slp_df["changekey"]==False) & # a few tunes have multiple key tokens, we discard them
    (slp_df["length"]<512) # we also discard a few tunes longer than 512 tokens
               ]
slp_df

Unnamed: 0,X,R,L,M,K,abc,link,length,changekey
0,X:0,R:Slängpolska,L:1/16,M:3/4,K:Cmaj,"G 2 F 2 E 4 C 4 | F 2 E 2 D 4 G, 4 | C 2 > B, ...",http://www.folkwiki.se/Musik/251,182,False
1,X:1,R:Polska,L:1/8,M:3/4,K:Cmaj,g > f e 2 e 2 | f > e d 2 G 2 | c > B c d e f ...,http://www.folkwiki.se/Musik/251,127,False
2,X:2,R:Slängpolska,L:1/16,M:3/4,K:Cmaj,C 2 > E 2 G 2 A 2 G 2 E 2 | C 2 E 2 G 2 c 2 e ...,http://www.folkwiki.se/Musik/1972,294,False
3,X:3,R:Slängpolska,L:1/16,M:3/4,K:Cmin,"D 4 |: E F G 2 B, 4 B, 4 | C B, A, G, A, 4 G, ...",http://www.folkwiki.se/Musik/183,98,False
4,X:4,R:Slängpolska,L:1/16,M:3/4,K:Cmaj,"c 2 G 2 E G E C G, 2 A, B, | C 2 C E G 2 G B c...",http://www.folkwiki.se/Musik/2368,174,False
...,...,...,...,...,...,...,...,...,...
600,X:600,R:Slängpolska,L:1/8,M:3/4,K:Cmaj,g ^f /2 g /2 a g e c | (3 e f e d c d e | (3 f...,http://www.folkwiki.se/Musik/3137,330,False
601,X:601,R:Slängpolska,L:1/8,M:3/4,K:Cmaj,e 2 e f d f | e c c E G c | c B B d G B | B c ...,http://www.folkwiki.se/Musik/3138,311,False
602,X:602,R:Slängpolska,L:1/8,M:3/4,K:Cmaj,"C B, C D D E | E C D E F 2 | E D E C E G | A /...",http://www.folkwiki.se/Musik/Vevlireettan,126,False
603,X:603,R:Slängpolska,L:1/8,M:3/4,K:Cmaj,G E F G A F | E F G 2 F C | F E F G A F | F /2...,http://www.folkwiki.se/Musik/Vevlire2an,117,False


In [3]:
path = "./dataset/data_v2.txt"
# The file handle is only needed for the read; everything below works on the in-memory text.
with open(path,"r",encoding="utf-8") as f:
    print("\nOPENING: ",path)
    data = f.read()

# Insert a space between a duration (/2, 2, 3, 4) and a directly following '<' or '>',
# and split the fused '=b=B' pair, so that whitespace tokenization yields separate tokens.
# The replacements are applied in the same order as before.
for glued, spaced in [
    ('/2<', '/2 <'), ('/2>', '/2 >'),
    ('2<', '2 <'), ('2>', '2 >'),
    ('3<', '3 <'), ('3>', '3 >'),
    ('4<', '4 <'), ('4>', '4 >'),
    ('=b=B', '=b =B'),
]:
    data = data.replace(glued, spaced)

# Vocabulary of the whole raw file (header tokens included), framed by the special tokens.
# NOTE: TOKENS_V2 is rebound by the next cell, which builds the final vocabulary.
TOKENS_V2 = sorted(list(set(data.split())))
TOKENS_V2 = ['<pad>','<s>'] + TOKENS_V2 + ['</s>']

# Records are separated by a blank line.
data = data.split("\n\n")

pieces = []
keys = []
meters = []
lengths = []

for d in tqdm(data[:-1]): #there's a trailing newline
    # every record is exactly three lines: meter, key, tune body
    m,k,p = d.split('\n')

    pieces.append(p)
    keys.append(k)
    meters.append(m)
    lengths.append(len(p.split()))

df_v2 = pd.DataFrame.from_dict({
    'piece':pieces,
    'mode':keys,
    'meter':meters,
    'length':lengths
})
# all tunes of this corpus get the same unit-note-length header
df_v2['L'] = 'L:1/8'

# Flag pieces containing a "(n" group with n in {2,4,5,6,7,9} or a "/3" duration.
# Raw string: `\(` must reach the regex engine as an escaped parenthesis.
df_v2['tuplets'] = df_v2['piece'].apply(lambda x: len(re.findall(r'\([245679]',x)) > 0 )
df_v2['third'] = df_v2['piece'].apply(lambda x: len(re.findall('/3',x)) > 0 )
# Drop the flagged pieces and those of 512 tokens or more, then align the column names
# with the fine-tuning dataframe (abc / K / M).
df_v2 = df_v2[
    (df_v2["tuplets"]==False) &
    (df_v2["third"]==False) &
    (df_v2["length"]<512)
               ].rename(columns={'meter':'M','mode':'K','piece':'abc',})

df_v2


OPENING:  ./dataset/data_v2.txt


  0%|          | 0/23635 [00:00<?, ?it/s]

Unnamed: 0,abc,K,M,length,L,tuplets,third
0,G E E E 2 D E D C | G E E E F G A B c | G E E ...,K:Cmaj,M:9/8,166,L:1/8,False,False
1,f B B c f B c c | f B B c a f e c | f B B c f ...,K:Cmin,M:4/4,103,L:1/8,False,False
3,|: c | f 3/2 a /2 c f 2 a | a 3/2 b /2 c' a 3/...,K:Cmix,M:6/8,301,L:1/8,False,False
4,f > a c f 2 a | a > b c' a > g f | e 2 g c > e...,K:Cmix,M:6/8,253,L:1/8,False,False
5,c < e G c 2 G | e > f g e d c | B < d G B 2 d ...,K:Cmaj,M:6/8,248,L:1/8,False,False
...,...,...,...,...,...,...,...
23630,G |: c 3 d 3 | e 3 e 2 d | e f g f e d | c 2 B...,K:Cmaj,M:6/8,364,L:1/8,False,False
23631,G B | c 2 c _B G C E G | F 2 A F _B F A F | G ...,K:Cmaj,M:4/4,147,L:1/8,False,False
23632,C D C C D E | F E F F G A | G 2 E C D E | D 2 ...,K:Cmaj,M:6/8,289,L:1/8,False,False
23633,|: c' 2 c' b c' b | g 2 c' c' 2 b | g e e d e ...,K:Cmin,M:6/8,166,L:1/8,False,False


In [4]:
# Distinct tokens of each corpus (tokens are separated by single spaces in `abc`).
TOKENS_SLP = {tt for abc in slp_df.abc for tt in abc.split(' ')}
TOKENS_V2 = {tt for abc in df_v2.abc for tt in abc.split(' ')}

# Joint vocabulary of both corpora plus the special and header tokens.
TOKENS = TOKENS_SLP.union(TOKENS_V2)
TOKENS.update(['<s>','</s>','L:1/8', 'L:1/16','M:12/8', 'M:2/4', 'M:3/2', 'M:3/4', 'M:4/4', 'M:6/8', 'M:9/8','K:Cdor', 'K:Cmaj', 'K:Cmin', 'K:Cmix','<pad>'])

# Splitting on ' ' yields '' for doubled/leading/trailing spaces; `discard` (unlike `remove`)
# does not raise when no empty token was produced.
TOKENS.discard('')
# Sorted so that every run maps each token to the same integer id.
TOKENS = sorted(TOKENS)
assert len(TOKENS) == 128, "unexpected vocabulary size: %d" % len(TOKENS)

### datasets

In [5]:
from torch.utils.data import Dataset
  
class TokenDataset(Dataset):
    """Map-style dataset of token sequences for next-token prediction.

    Args:
        dataset: indexable collection (e.g. a 2-D numpy array) whose rows are
            sequences of token strings.
        block_size: context length of the model; only stored on the instance.
        TOKENS: ordered vocabulary; a token's position is its integer id.

    Each item is a pair ``(x, y)`` of ``torch.long`` tensors: ``x`` is the row
    without its last token, ``y`` the row without its first token.
    """

    def __init__(self, dataset, block_size, TOKENS):
        self.dataset = dataset
        data_size, vocab_size = len(self.dataset), len(TOKENS)
        print('data has %d pieces, %d unique tokens.' % (data_size, vocab_size))
        # token string <-> integer id lookup tables
        self.stoi = { tk:i for i,tk in enumerate(TOKENS) }
        self.itos = { i:tk for i,tk in enumerate(TOKENS) }
        self.block_size = block_size
        self.vocab_size = vocab_size

    def __len__(self):
        """Number of pieces (rows) in the dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the (input, target) id tensors of piece ``idx``.

        Raises:
            KeyError: if the piece contains a token missing from the vocabulary.
        """
        # Index the row directly: the previous `dataset[idx:idx+1][0]` built an
        # empty slice for idx == -1 and therefore raised IndexError.
        piece = self.dataset[idx]
        # encode every token to an integer
        dix = [self.stoi[s] for s in piece]
        x = torch.tensor(dix[:-1], dtype=torch.long)
        y = torch.tensor(dix[1:], dtype=torch.long)
        return x, y
    
from sklearn.model_selection import train_test_split

def generateDatasetSplit(df,max_len=512, split=0.05):
    """Turn a tune dataframe into a padded token matrix, optionally split train/test.

    Args:
        df: dataframe with the columns ``length``, ``L``, ``M``, ``K`` and ``abc``.
        max_len: number of tokens of every output row (shorter rows are padded).
        split: test fraction handed to ``train_test_split``; a falsy value
            (``None``, ``0``) returns the whole matrix unsplit.

    Returns:
        ``[train, test]`` as produced by ``train_test_split`` when ``split`` is
        truthy, otherwise a single array of shape ``(n_pieces, max_len)``.
    """
    # five slots are reserved for <s>, </s> and the L, M, K header tokens
    kept = df[df.length <= max_len-5]
    # one string per piece: start token, the three headers, the body, end token
    texts = (
        '<s> ' + kept['L'].map(str) + '\n'
        + kept['M'].map(str) + '\n'
        + kept['K'].map(str) + '\n'
        + kept['abc'].map(str) + ' </s>'
    )
    rows = texts.values.reshape(-1,1)

    def padding(array,max_len):
        """Tokenize the single string held in ``array`` and right-pad with '<pad>'."""
        tokens = array[0].split()
        missing = max_len - len(tokens)
        padded = np.append(tokens, ['<pad>'] * missing)
        assert len(padded) == max_len
        return np.array(padded)

    dataset = np.asarray([padding(row,max_len) for row in tqdm(rows[:])])

    if not split:
        return dataset
    # fixed random_state keeps the split identical across runs
    return train_test_split(dataset,test_size=split,random_state=1080)

In [6]:
# Pre-training corpus: split with the function's default test fraction.
train_split, test_split = generateDatasetSplit(df=df_v2)
print(train_split.shape, test_split.shape)

# Fine-tuning corpus: kept whole (no held-out part).
finetune_data = generateDatasetSplit(df=slp_df, split=None)
print(finetune_data.shape)

  0%|          | 0/23362 [00:00<?, ?it/s]

(22193, 512) (1169, 512)


  0%|          | 0/593 [00:00<?, ?it/s]

(593, 512)


In [7]:
block_size = 512 # spatial extent of the model for its context

train_dataset = TokenDataset(train_split, block_size,TOKENS)
test_dataset = TokenDataset(test_split, block_size, TOKENS)
finetune_dataset = TokenDataset(finetune_data, block_size, TOKENS)

# All three datasets must share one token <-> id mapping.
assert train_dataset.itos == test_dataset.itos
assert train_dataset.stoi == test_dataset.stoi
# (the second fine-tune check used to repeat the `stoi` comparison, leaving `itos` unchecked)
assert finetune_dataset.itos == test_dataset.itos
assert finetune_dataset.stoi == test_dataset.stoi

# Persist the filtered dataframes and the vocabulary for the cells below.
df_v2.to_pickle("./dataset/df_v4.pickle")
slp_df.to_pickle("./dataset/df_v4_finetune.pickle")
# np.save appends ".npy", so the file on disk is "TOKENS_V4.pickle.npy"
np.save("./dataset/TOKENS_V4.pickle",TOKENS)

data has 22193 pieces, 128 unique tokens.
data has 1169 pieces, 128 unique tokens.
data has 593 pieces, 128 unique tokens.


In [8]:
# model spec and trainer function

# make deterministic
import random
import numpy as np
import torch

def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

# Seed every RNG once, before any model is constructed in this cell's session.
set_seed(42)

# set up logging
import logging
# Root logger configuration: timestamp - level - logger name - message, INFO and above.
logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
)

# Module-level logger used by GPT and Trainer below.
logger = logging.getLogger(__name__)

import torch.nn as nn
from torch.nn import functional as F

class GPTConfig:
    """Base GPT configuration holding the parameters common to all GPT versions.

    ``vocab_size`` and ``block_size`` are mandatory; every extra keyword
    argument (e.g. ``n_layer``, ``n_head``, ``n_embd``) becomes an instance
    attribute of the same name.
    """
    # dropout probabilities: embeddings, residual branches, attention weights
    embd_pdrop = resid_pdrop = attn_pdrop = 0.1

    def __init__(self, vocab_size, block_size, **kwargs):
        self.vocab_size, self.block_size = vocab_size, block_size
        for name in kwargs:
            setattr(self, name, kwargs[name])

class GPT1Config(GPTConfig):
    """ GPT-1 like preset (12 layers, 12 heads, 768-dim embeddings), roughly 125M params """
    n_embd = 768
    n_layer = n_head = 12

class PositionalEmbedding(nn.Module):
    """Fixed (non-learned) sinusoidal positional encoding.

    Args:
        d_model: embedding width; even and odd values are both supported.
        max_len: number of positions pre-computed in the table.

    The table is registered as the buffer ``pe`` of shape ``(1, max_len, d_model)``,
    so it follows the module across devices and is stored in the state dict,
    but it is not a parameter and receives no gradient.
    """

    def __init__(self, d_model, max_len=256):
        super().__init__()
        # Compute the positional encodings once in log space.
        # (A plain tensor never requires grad; the former `pe.require_grad = False`
        # was a misspelling of `requires_grad` and had no effect.)
        pe = torch.zeros(max_len, d_model).float()

        position = torch.arange(0, max_len).float().unsqueeze(1)
        div_term = (torch.arange(0, d_model, 2).float() * -(np.log(10000.0) / d_model)).exp()

        # even columns: sine, odd columns: cosine
        pe[:, 0::2] = torch.sin(position * div_term)
        # for an odd d_model there is one odd column less than even columns
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return the encodings of the first ``x.size(1)`` positions, shape ``(1, T, d_model)``."""
        return self.pe[:, :x.size(1)]
    
class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    It is possible to use torch.nn.MultiheadAttention here but I am including an
    explicit implementation here to show that there is nothing too scary here.

    Args:
        config: object providing ``n_embd``, ``n_head``, ``block_size``,
            ``attn_pdrop`` and ``resid_pdrop``; ``n_embd`` must be divisible
            by ``n_head``.

    After every forward pass ``att_weights`` holds the (pre-dropout) attention
    probabilities of shape ``(B, n_head, T, T)``, detached from the autograd graph.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads
        self.key = nn.Linear(config.n_embd, config.n_embd)
        self.query = nn.Linear(config.n_embd, config.n_embd)
        self.value = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_drop = nn.Dropout(config.attn_pdrop)
        self.resid_drop = nn.Dropout(config.resid_pdrop)
        # output projection
        self.proj = nn.Linear(config.n_embd, config.n_embd)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("mask", torch.tril(torch.ones(config.block_size, config.block_size))
                                     .view(1, 1, config.block_size, config.block_size))
        self.n_head = config.n_head

        # for visualization
        self.att_weights = None

    def forward(self, x, layer_past=None):
        """Apply causal multi-head self-attention.

        Args:
            x: tensor of shape ``(B, T, C)`` with ``C == n_embd`` and ``T <= block_size``.
            layer_past: accepted for interface compatibility; not used.

        Returns:
            Tensor of shape ``(B, T, C)``.
        """
        # B is the batch size,
        # T is the sequence length,
        # C is the dimensionality of the embedding (n_embd).
        B, T, C = x.size()
        head_size = C // self.n_head

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        k = self.key(x).view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        q = self.query(x).view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)
        v = self.value(x).view(B, T, self.n_head, head_size).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / np.sqrt(k.size(-1)))
        # positions to the right of the query get -inf, i.e. zero weight after the softmax
        att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        # for plotting: keep a detached view so the cached weights do not hold on to
        # the autograd graph of the last forward pass
        self.att_weights = att.detach()

        y = self.attn_drop(att) @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_drop(self.proj(y))
        return y

class Block(nn.Module):
    """Transformer block: LayerNorm -> causal self-attention and LayerNorm -> MLP,
    each wrapped in a residual connection."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        # sub-modules are created in the same order as before so that parameter
        # initialization consumes the random generator identically
        self.ln1 = nn.LayerNorm(width)
        self.ln2 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.mlp = nn.Sequential(
            nn.Linear(width, hidden),
            nn.GELU(),
            nn.Linear(hidden, width),
            nn.Dropout(config.resid_pdrop),
        )

    def forward(self, x):
        """Map ``x`` through both residual branches and return the result."""
        attended = x + self.attn(self.ln1(x))
        return attended + self.mlp(self.ln2(attended))

class GPT(nn.Module):
    """The full GPT language model, with a context size of ``block_size``.

    Args:
        config: object providing ``vocab_size``, ``block_size``, ``n_embd``,
            ``n_layer``, ``embd_pdrop`` plus whatever ``Block`` reads from it.
            An optional ``padding_idx`` attribute selects the id of the padding
            token; when it is absent the id is looked up in the notebook-level
            ``finetune_dataset`` (legacy behaviour, requires that global to exist).
    """

    def __init__(self, config):
        super().__init__()
        self.block_size = config.block_size

        # Resolve the padding id from the config when given, so that the model does not
        # depend on hidden notebook state; fall back to the global for existing callers.
        pad_idx = getattr(config, 'padding_idx', None)
        if pad_idx is None:
            pad_idx = finetune_dataset.stoi['<pad>']

        # input embedding with padding
        # NOTE: `_init_weights` below re-draws the whole embedding matrix, including
        # the padding row, from the normal distribution.
        self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd,
                                    padding_idx = pad_idx)
        # fixed sinusoidal position table instead of a learned parameter
        self.pos_emb = PositionalEmbedding(config.n_embd, self.block_size)

        self.drop = nn.Dropout(config.embd_pdrop)
        # transformer
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])

        # decoder head
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=True) # bias added for comparison with folkrnn

        self.apply(self._init_weights)

        logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters()))

    def get_block_size(self):
        """Return the maximum sequence length the model can attend over."""
        return self.block_size

    def _init_weights(self, module):
        """Initializer applied to every sub-module: N(0, 0.02) weights for Linear and
        Embedding, zero biases, and identity (weight 1, bias 0) LayerNorm."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def configure_optimizers(self, train_config):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.

        Args:
            train_config: object providing ``weight_decay``, ``learning_rate`` and ``betas``.

        Returns:
            A ``torch.optim.AdamW`` instance with two parameter groups.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, _ in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name

                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        """Run the model on a batch of token ids.

        Args:
            idx: long tensor of shape ``(b, t)`` with ``t <= block_size``.
            targets: optional long tensor of the same shape holding the next-token ids.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape ``(b, t, vocab_size)`` and
            ``loss`` is the mean cross-entropy over all positions (padding positions
            included), or ``None`` when no targets are given.
        """
        b, t = idx.size()
        assert t <= self.block_size, "Cannot forward, model block size is exhausted."

        # forward the GPT model
        token_embeddings = self.tok_emb(idx) # each index maps to a (learnable) vector
        position_embeddings = self.pos_emb(token_embeddings) # fixed table, broadcast over the batch
        x = self.drop(token_embeddings + position_embeddings)

        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

"""
Simple training loop; Boilerplate that could apply to any arbitrary neural network,
so nothing in this file really has anything to do with GPT specifically.
"""

import math
 
import torch.optim as optim
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data.dataloader import DataLoader

# `logging.getLogger` returns the same logger object for the same name, so this
# re-binds `logger` to the instance already created earlier in this cell.
logger = logging.getLogger(__name__)

class TrainerConfig:
    """Training hyper-parameters; any keyword argument overrides the class default
    of the same name (unknown names are simply added as attributes)."""
    # --- optimization ---
    max_epochs = 10
    batch_size = 64
    learning_rate = 3e-4
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    weight_decay = 0.1 # only applied on matmul weights
    # --- learning rate schedule: linear warmup followed by cosine decay to 10% of original ---
    lr_decay = False
    warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere
    final_tokens = 260e9 # (at what point we reach 10% of original LR)
    # --- checkpointing ---
    ckpt_path = None
    # --- data loading ---
    num_workers = 0 # for DataLoader

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

class Trainer:
    """Epoch-based training loop with optional evaluation, TensorBoard logging and
    checkpointing.

    Args:
        model: module whose forward returns ``(logits, loss)`` and which offers
            ``configure_optimizers(config)``.
        train_dataset: dataset yielding ``(x, y)`` pairs for training.
        test_dataset: dataset for evaluation after every epoch, or ``None``.
        config: ``TrainerConfig``-like object with the hyper-parameters.
        writer: optional object with an ``add_scalar(tag, value, step)`` method
            (e.g. a TensorBoard ``SummaryWriter``); ``None`` disables scalar logging.
    """

    def __init__(self, model, train_dataset, test_dataset, config, writer=None):
        self.model = model
        self.train_dataset = train_dataset
        self.test_dataset = test_dataset
        self.config = config
        self.summaryWriter = writer

        # take over whatever gpus are on the system
        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)

    def save_checkpoint(self):
        """Write the state dict of the unwrapped model to ``config.ckpt_path``."""
        # DataParallel wrappers keep raw model object in .module attribute
        raw_model = self.model.module if hasattr(self.model, "module") else self.model
        logger.info("saving %s", self.config.ckpt_path)
        torch.save(raw_model.state_dict(), self.config.ckpt_path)

    def train(self):
        """Train for ``config.max_epochs`` epochs.

        After each epoch the test set (if any) is evaluated. When ``config.ckpt_path``
        is set, a checkpoint is written whenever the test loss improves, or after
        every epoch if there is no test set.
        """
        model, config = self.model, self.config
        raw_model = model.module if hasattr(self.model, "module") else model
        optimizer = raw_model.configure_optimizers(config)

        def run_epoch(split):
            """Run one pass over the 'train' or 'test' data and return its mean loss."""
            is_train = split == 'train'
            model.train(is_train)
            data = self.train_dataset if is_train else self.test_dataset
            loader = DataLoader(data, shuffle=True, pin_memory=True,
                                batch_size=config.batch_size,
                                num_workers=config.num_workers)

            losses = []
            # progress bar only while training
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
            for it, (x, y) in pbar:

                # place data on the correct device
                x = x.to(self.device)
                y = y.to(self.device)

                # forward the model
                with torch.set_grad_enabled(is_train):
                    logits, loss = model(x, y)
                    loss = loss.mean() # collapse all losses if they are scattered on multiple gpus
                    losses.append(loss.item())

                if is_train:
                    # backprop and update the parameters
                    model.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
                    optimizer.step()

                    # decay the learning rate based on our progress
                    if config.lr_decay:
                        self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100)
                        if self.tokens < config.warmup_tokens:
                            # linear warmup
                            lr_mult = float(self.tokens) / float(max(1, config.warmup_tokens))
                        else:
                            # cosine learning rate decay
                            progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))
                            lr_mult = max(0.1, 0.5 * (1.0 + math.cos(math.pi * progress)))
                        lr = config.learning_rate * lr_mult
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr
                    else:
                        lr = config.learning_rate

                    # report progress; the writer is optional, so guard the call
                    # (an unguarded call crashed on the first batch with writer=None)
                    if self.summaryWriter:
                        self.summaryWriter.add_scalar("batchLoss", loss.item(),(len(loader)*epoch)+it)

                    pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}")

            if is_train:
                train_loss = float(np.mean(losses))
                if self.summaryWriter:
                    self.summaryWriter.add_scalar("epochLoss", train_loss, epoch)
                logger.info("epoch train loss: %f", train_loss)
                return train_loss

            if not is_train:
                test_loss = float(np.mean(losses))
                if self.summaryWriter:
                    self.summaryWriter.add_scalar("valLoss", test_loss, epoch)
                logger.info("test loss: %f", test_loss)
                return test_loss

        best_loss = float('inf')
        self.tokens = 0 # counter used for learning rate decay

        # `epoch` is read by run_epoch through the enclosing scope
        for epoch in range(config.max_epochs):
            train_loss = run_epoch('train')

            if self.test_dataset is not None:
                test_loss = run_epoch('test')

            # supports early stopping based on the test loss, or just save always if no test set is provided
            good_model = self.test_dataset is None or test_loss < best_loss
            if self.config.ckpt_path is not None and good_model:
                if self.test_dataset is not None:
                    best_loss = test_loss
                # saving used to sit inside the branch above, so nothing was ever
                # written when training without a test set
                self.save_checkpoint()


In [9]:
import copy

#CKPT = "session_model_ckpt_60" #8l 1h layers
#CKPT_NAME = "session_model_164_ckpt_60" #8l 1h layers

def save_model(CKPT_NAME,model):
    """Save the state dict of ``model`` to the path ``CKPT_NAME``.

    If ``model`` is a wrapper exposing the real network as ``.module``
    (as DataParallel does), the wrapped network's state dict is saved.
    """
    unwrapped = getattr(model, "module", model)
    state = unwrapped.state_dict()
    torch.save(state, CKPT_NAME)
    print('Checkpoint saved!',CKPT_NAME)
    
def load_model(CKPT_NAME,model):
    """Load the state dict stored at ``CKPT_NAME`` into ``model``.

    Args:
        CKPT_NAME: path of a checkpoint written with ``torch.save(state_dict)``.
        model: the network, or a wrapper exposing it as ``.module``.

    Returns:
        ``model`` (moved to the GPU when CUDA is available) on success, or
        ``None`` when the checkpoint cannot be read -- callers use ``None`` as
        the signal to train from scratch.

    Raises:
        RuntimeError: from ``load_state_dict`` if the checkpoint does not match the model.
    """
    ckpt_model = getattr(model, "module", model)
    try:
        # NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
        # map_location='cpu' lets checkpoints saved on a GPU load on CPU-only machines.
        state = torch.load(CKPT_NAME, map_location='cpu')
    except Exception as err:
        # best effort by design, but no longer swallows KeyboardInterrupt/SystemExit
        # and tells the user why the checkpoint was skipped
        print('Checkpoint not loaded:',CKPT_NAME,'-',err)
        return None
    # load_state_dict copies the values into the existing parameters
    ckpt_model.load_state_dict(state)
    if torch.cuda.is_available():
        model.cuda()
    print('Checkpoint loaded!',CKPT_NAME)
    return model

In [10]:
# Artifacts written by the data-preparation cell above.
DF_PATH = './dataset/df_v4.pickle'
DF_FINETUNE_PATH = './dataset/df_v4_finetune.pickle'
# np.save added the ".npy" suffix to the vocabulary file
TOKENS = np.load('./dataset/TOKENS_V4.pickle.npy',allow_pickle=True)

# Rebuild the pre-training datasets from disk.
block_size = 512
train_split, test_split = generateDatasetSplit(df=pd.read_pickle(DF_PATH))

# built in the order train, test
train_dataset, test_dataset = (
    TokenDataset(part, block_size, TOKENS) for part in (train_split, test_split)
)

  0%|          | 0/23362 [00:00<?, ?it/s]

data has 22193 pieces, 128 unique tokens.
data has 1169 pieces, 128 unique tokens.


In [11]:
# 16-layer, 4-head, 128-dim model sized to the dataset's vocabulary and context length
mconf = GPTConfig(
    vocab_size=train_dataset.vocab_size,
    block_size=train_dataset.block_size,
    n_layer=16,
    n_head=4,
    n_embd=128,
)
session_model = GPT(mconf)

11/26/2021 12:13:50 - INFO - __main__ -   number of parameters: 3.205504e+06


In [12]:
from torch.utils.tensorboard import SummaryWriter
import datetime as dt

# Path of the pre-trained ("session") checkpoint; reused if it exists, created otherwise.
CKPT_NAME = "./models/session_model_164_ckpt_60" 
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=16, n_head=4, n_embd=128)
session_model = GPT(mconf)
# load_model returns None when the checkpoint cannot be read
session_model = load_model(CKPT_NAME,session_model)

if session_model is None:
    # no usable checkpoint: build a fresh model, train it and save it under CKPT_NAME
    session_model = GPT(mconf)
    tconf = TrainerConfig(max_epochs=1, batch_size=64,
                          learning_rate=5e-3,
                          lr_decay=False, #warmup_tokens=256*20, final_tokens=2*len(train_dataset)*block_size,
                          num_workers=8, 
                          #ckpt_path = './' + str(dt.datetime.now().strftime("%b%d_%H-%M-%S")) + '_ckpt'
                         )
    # optional TensorBoard logging (disabled):
    #writer = SummaryWriter(log_dir='./runs/slangpolska/'+str(dt.datetime.now().strftime("%b%d_%H-%M-%S")) )
    # NOTE(review): Trainer is given writer=None here -- confirm that its batch-level
    # logging tolerates a missing writer before relying on this branch.
    trainer = Trainer(session_model, train_dataset, test_dataset,tconf, writer=None)
    trainer.train()
    #writer.flush()
    #writer.add_text("text","pretraining slangpolska",0)
    #writer.flush()
    CKPT_NAME = "./models/session_model_164_ckpt_60" #8l 1h layers
    save_model(CKPT_NAME,session_model)

11/26/2021 12:13:51 - INFO - __main__ -   number of parameters: 3.205504e+06


Checkpoint loaded! ./models/session_model_164_ckpt_60


In [53]:
import random
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F

def top_k_logits(logits, k):
    """Return a copy of ``logits`` (shape ``(B, V)``) in which every entry smaller
    than the row's k-th largest value is replaced by ``-inf``."""
    top_values, _ = torch.topk(logits, k)
    # smallest of the k kept values, one per row, shaped (B, 1) for broadcasting
    threshold = top_values[:, -1:]
    return logits.masked_fill(logits < threshold, -float('Inf'))

def top_p_logits(logits, top_p,verbose=False):
    """Nucleus (top-p) filtering.

    Keeps, per row, the most probable tokens up to and including the first one
    whose cumulative probability exceeds ``top_p``; all other entries of the
    returned copy are set to ``-inf``. With ``verbose`` the number of kept
    entries is printed.
    """
    ranked_logits, ranking = torch.sort(logits, descending=True)
    cumulative = torch.cumsum(F.softmax(ranked_logits, dim=-1), dim=-1)
    over_threshold = cumulative > top_p
    # shift right by one so the first token crossing the threshold is kept as well;
    # the most probable token is therefore never removed
    never_drop_first = torch.zeros_like(over_threshold[..., :1])
    drop_ranked = torch.cat([never_drop_first, over_threshold[..., :-1]], dim=-1)
    # translate the decision from rank order back to vocabulary order
    drop = torch.zeros_like(logits, dtype=torch.bool).scatter_(dim=-1, index=ranking, src=drop_ranked)
    out = logits.masked_fill(drop, -float('Inf'))
    if verbose:
        print('|top_p:',(out > -float('Inf')).sum().item(),'\n')
    return out

@torch.no_grad()
def sample(model, x, steps, temperature=1.0, sample=True, top_k=None, top_p=None):
    """Autoregressively extend the token sequence ``x`` by up to ``steps`` tokens.

    Args:
        model: GPT-like module offering ``get_block_size()``, ``tok_emb`` and a
            forward returning ``(logits, loss)``.
        x: long tensor of shape ``(1, t)`` with the conditioning token ids.
        steps: maximum number of tokens to generate.
        temperature: divisor applied to the logits before filtering.
        sample: draw from the distribution if true, otherwise take the arg-max.
        top_k: if > 0, keep only the k largest logits.
        top_p: if > 0, apply nucleus filtering with this probability mass.

    Returns:
        ``(x, l, p)``: the extended sequence, the (filtered) logits and the
        probabilities of every generated step. Row 0 of ``l`` and ``p`` is an
        all-zero placeholder; step ``i`` is stored in row ``i + 1``.

    Generation stops early once '</s>' or '<pad>' (ids taken from the global
    ``train_dataset``) has been produced.
    """
    block_size = model.get_block_size()
    model.eval()
    vocab_size = model.tok_emb.weight.size()[0]
    # history buffers live on the same device as the input instead of a hard-coded 'cuda'
    l = torch.zeros((1, vocab_size), device=x.device)
    p = torch.zeros((1, vocab_size), device=x.device)

    for _ in range(steps):
        x_cond = x if x.size(1) <= block_size else x[:, -block_size:] # crop context if needed

        logits, _ = model(x_cond)
        # pluck the logits at the final step and scale by temperature
        logits = logits[:, -1, :] / temperature

        # optionally crop probabilities to only the top k options
        if top_k is not None and top_k > 0.0:
            logits = top_k_logits(logits, top_k)
        # optionally crop probabilities to only the top p probability mass
        if top_p is not None and top_p > 0.0:
            logits = top_p_logits(logits, top_p)
        # convert the (possibly filtered) logits to probabilities
        probs = F.softmax(logits, dim=-1)

        # sample from the distribution or take the most likely
        if sample:
            ix = torch.multinomial(probs, num_samples=1)
        else:
            _, ix = torch.topk(probs, k=1, dim=-1)

        # append to the sequence and continue
        l = torch.cat((l, logits), dim=0)
        p = torch.cat((p, probs), dim=0)
        x = torch.cat((x, ix), dim=1)
        if ix == train_dataset.stoi["</s>"] or ix == train_dataset.stoi["<pad>"]:
            break

    return x,l, p

@torch.no_grad() 
def recursiveBeam(model,context_tokens, context_logprob, depth, P1=.99, K1=2, T1=1.2, verbose=True,):
    """Enumerate continuations of ``context_tokens`` by depth-limited branching.

    At every level the next-token logits are divided by ``T1`` and nucleus-filtered
    with ``P1``; at most ``K1`` candidates are expanded. With ``depth == D`` each
    returned sequence is ``D + 1`` tokens longer than the context.

    Args:
        model: module whose forward returns ``(logits, loss)``.
        context_tokens: long tensor of shape ``(1, t)``.
        context_logprob: accumulated log-probability of the context.
        depth: number of further recursion levels below this one.
        verbose: print the intermediate candidates.

    Returns:
        ``(RT, RP)``: the candidate sequences stacked along dim 0 and their
        accumulated log-probabilities, row-aligned with ``RT``.
    """
    model.eval()
    if verbose:
        print('\nrecursiveBeam depth:',depth,'\ninput',context_tokens)

    all_logits, _ = model(context_tokens)
    # logits for last step, tempered and nucleus-filtered
    last_logits = top_p_logits(all_logits[:, -1, :] / T1, P1)
    probs = F.softmax(last_logits, dim=-1).detach()

    # every token that survived the filter is a candidate ...
    candidate_tokens = torch.where(probs > 0.0)[1].reshape(1,-1)
    candidate_logprobs = torch.log(probs[0,candidate_tokens])
    n_candidates = candidate_tokens.shape[1]

    # ... unless there are more than K1 of them: then only the K1 most likely remain
    if n_candidates > K1:
        if verbose:
            print('too many',n_candidates)
        candidate_logprobs, candidate_tokens = torch.topk(torch.log(probs), K1)

    if verbose:
        print('tok', candidate_tokens, '\nprobs',candidate_logprobs)
        print("branches:",candidate_tokens.shape[1])

    sequences = []
    logprobs = []
    for i in range(candidate_tokens.shape[1]):
        extended = torch.cat([context_tokens , candidate_tokens[0,i].reshape(1,1)],dim=1)
        score = context_logprob + candidate_logprobs[0,i].reshape(1,1)
        if verbose:
            print(i,'\tt', extended,'\tp',score)
        if depth != 0:
            # expand this branch further; complete sequences come back from the bottom
            extended, score = recursiveBeam(model,extended, score, depth-1,
                                            K1=K1, P1=P1, T1=T1,
                                            verbose=verbose)
        sequences.append(extended)
        logprobs.append(score)

    return torch.cat(sequences,dim=0), torch.cat(logprobs,dim=0)

In [54]:
def simple_sample(model, SEED, verbose=False,print_score=False):
    """Generate one tune with nucleus sampling and return it as an ABC string.

    The prompt is ``<s> L:1/16 M:3/4`` followed by a randomly chosen key
    (``K:Cmaj`` or ``K:Cmin``); ``SEED`` seeds all RNGs and is used as the
    tune's X: and T: fields.

    Returns:
        The ABC text, or ``None`` when no ``</s>`` token was generated.
    """
    if verbose:
        print('SEED:',SEED)
    set_seed(SEED)

    prompt = "<s> L:1/16 M:3/4 " + np.random.choice(['K:Cmaj','K:Cmin'])
    prompt_ids = [train_dataset.stoi[s] for s in prompt.split()]
    x = torch.tensor(prompt_ids, dtype=torch.long)[None,...].to("cuda")
    y,l,p = sample(model, x, 512, temperature=1.0 , top_p=0.99, sample=True)
    if verbose:
        print(y)

    # tokens are concatenated without separators
    completion = ''.join(train_dataset.itos[int(i)] for i in y[0])
    end = completion.find('</s>')
    if verbose:
        print(completion)

    if end == -1:
        if verbose:
            print("not finished")
        return None

    # drop the 3-character '<s>' prefix and everything from '</s>' on
    body = completion[3:end]
    # fixed-width headers of the prompt: 'L:1/16' (6 chars), 'M:3/4' (5), 'K:Cmaj'/'K:Cmin' (6)
    abc = body[:6] + "\n" + body[6:11] + "\n" + body[11:17] + "\n" + body[17:]
    abc = 'X:'+str(SEED)+'\nT:'+str(SEED)+'\nQ:1/4=100\n' + abc

    if verbose:
        print(abc,"\n")
    if print_score:
        #replacement for javascript
        abc_score = abc.replace('\n','\\n')
        abc_score = abc_score.replace(':|' ,':|\\n')
        abc_score = abc_score.replace(':||:',':|\\n|:')
        abcPlayer(abc_score)
    return abc

In [57]:
## generate some sample with beamsearch
def beamsearch_sample(model,SEED,verbose=False,print_score=False):
    """Generate one tune with depth-limited beam sampling and return it as ABC text.

    Starting from the prompt ``<s> L:1/16 M:3/4`` plus a random key token, the
    sequence is extended in steps: `recursiveBeam` returns candidate
    continuations together with their accumulated log-probabilities, the scores
    are divided by a temperature, filtered with `top_p_logits`, soft-maxed, and
    one candidate is drawn with `torch.multinomial`. Generation stops when the
    last token is ``</s>`` or ``<pad>``, or when the sequence gets within D+1
    tokens of 512.

    Args:
        model: model handed to `recursiveBeam` unchanged.
        SEED: seed for `set_seed`; also written into the ``X:`` and ``T:`` fields.
        verbose: show a progress bar and print the token ids and the ABC text.
        print_score: additionally render the tune with `abcPlayer`.

    Returns:
        The ABC string, or None when neither ``</s>`` nor ``<pad>`` was produced.
        The returned text is the same whether or not `print_score` is set.
    """
    if verbose: print('SEED:',SEED)
    set_seed(SEED)

    prompt = "<s> L:1/16 M:3/4 " + np.random.choice(['K:Cmaj','K:Cmin'])
    x = torch.tensor([train_dataset.stoi[s] for s in prompt.split()], dtype=torch.long)[None,...].to('cuda')

    # Search hyper-parameters. K1/P1/T1 are forwarded to recursiveBeam
    # (presumably beam width, top-p mass and temperature -- confirm there);
    # T3/P3 are the temperature and top-p mass applied to the candidate scores here.
    D = 3
    K1 = 2
    P1=.99
    T1 = 1.15
    T3 = 1.15
    P3 = 0.99

    eos_id = train_dataset.stoi['</s>']
    pad_id = train_dataset.stoi['<pad>']

    pbar = tqdm(initial=x.shape[1],total=512) if verbose else None
    try:
        while x.shape[1] < 512-(D+1):
            if (x[0,-1] == eos_id) | (x[0,-1] == pad_id): break
            if pbar is not None: pbar.update(D+1)
            T,P = recursiveBeam( model, x, torch.FloatTensor([0.0]).to("cuda") , K1=K1, P1=P1, T1=T1, depth=D,verbose=False)
            P = P.reshape(1,-1) / T3
            P = top_p_logits(P, P3,verbose=False)
            P = F.softmax(P,dim=1)
            ix2 = torch.multinomial(P,num_samples=1)
            # The drawn candidate already contains the context, so it replaces x.
            x = T[ix2].reshape(1,-1)
    finally:
        # Close the bar even when a step raises, so no stale widget is left behind.
        if pbar is not None: pbar.close()

    y = x.detach().cpu().numpy()
    if verbose: print(y)

    # avoid hanged repetitions: after every occurrence of token 18 except the
    # last one, make sure token 127 follows. Iterating in reverse keeps the
    # earlier indices valid while inserting.
    # NOTE(review): 18 and 127 are raw vocabulary ids -- presumably the
    # repeat-end and line-break tokens; confirm against train_dataset.itos.
    for i in reversed(np.where(y[0]==18)[0][:-1]):
        if y[0,i+1] != 127:
            y = np.insert(y,i+1,127,axis=1)

    try:
        end = 1 + np.where( (y==eos_id) | (y==pad_id) )[1][0]
    except IndexError:
        if verbose: print('no end')
        return None

    #format string: skip the start token, stop before the end token, and put
    # the fixed-width L: (6), M: (5) and K: (6) fields on their own lines
    abc = ''.join([train_dataset.itos[int(i)] for i in y[0][1:end-1]])
    abc = abc[0:6]+'\n' + abc[6:11]+'\n'+abc[11:17]+'\n'+abc[17:]
    abc = 'X:'+str(SEED)+'\nT:'+str(SEED)+'\nC:tradformer-ft-depthbeam\nQ:1/4=100\n'+abc
    if verbose: print(abc)

    if print_score:
        if verbose:
            print(abc,"\n")
        # Line-break back-to-back repeats for display only; the returned text
        # must not depend on the display flag (simple_sample does the same).
        abc_score = abc.replace(':||:',':|\n|:')
        abcPlayer(abc_score,H=450)
    return abc

In [46]:
# Build the fine-tuning dataset from the pickled dataframe at DF_FINETUNE_PATH.
# split=None presumably means "no train/validation split" -- confirm in generateDatasetSplit.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load pickles you produced yourself.
finetune_data = generateDatasetSplit(pd.read_pickle(DF_FINETUNE_PATH),split=None)
# Tokenise with the same block_size and TOKENS used elsewhere in the notebook.
finetune_dataset = TokenDataset(finetune_data, block_size, TOKENS)

  0%|          | 0/593 [00:00<?, ?it/s]

data has 593 pieces, 128 unique tokens.


In [27]:
# Load the pretrained checkpoint that fine-tuning starts from.
# NOTE(review): unlike the later cells this name has no "./models/" prefix --
# confirm that load_model resolves it to the intended file.
CKPT_NAME = "session_model_164_ckpt_60" # 16 layers, 4 heads (see mconf below)
# Architecture: vocabulary and block size come from train_dataset.
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=16, n_head=4, n_embd=128)
slp_model = GPT(mconf)
slp_model = load_model(CKPT_NAME,slp_model)

11/26/2021 14:23:39 - INFO - __main__ -   number of parameters: 3.205504e+06


In [None]:
# Fine-tune slp_model on finetune_dataset: 30 epochs, batch size 64,
# learning rate 5e-4 with lr_decay disabled, and no TensorBoard writer.
tconf = TrainerConfig(max_epochs=30, batch_size=64,
                      learning_rate=5e-4,
                      lr_decay=False, #warmup_tokens=256*20, final_tokens=2*len(train_dataset)*block_size,
                      num_workers=6, 
                      #ckpt_path = './' + str(dt.datetime.now().strftime("%b%d_%H-%M-%S")) + '_ckpt'
                     )
#writer = SummaryWriter(log_dir='./runs/slangpolska/')
# The third positional argument is None -- presumably the evaluation dataset; confirm against Trainer.
trainer = Trainer(slp_model, finetune_dataset, None,tconf, writer=None)
trainer.train()
#writer.flush()

# writer.add_text("text","finetuning slangpolska",0)
# writer.flush()

# Save the fine-tuned weights; the next load cell reads this same path.
CKPT_NAME = "./models/slp_model_164_ckpt_30" # 16 layers, 4 heads; "30" matches max_epochs above
save_model(CKPT_NAME,slp_model)

In [30]:
# Reload the fine-tuned checkpoint so sampling can run without repeating the training cell.
CKPT_NAME = "./models/slp_model_164_ckpt_30" # 16 layers, 4 heads (see mconf below)
# The architecture must match the one the checkpoint was saved with.
mconf = GPTConfig(train_dataset.vocab_size, train_dataset.block_size,
                  n_layer=16, n_head=4, n_embd=128)
slp_model = GPT(mconf)
slp_model = load_model(CKPT_NAME,slp_model)

11/26/2021 14:25:25 - INFO - __main__ -   number of parameters: 3.205504e+06


Checkpoint loaded! ./models/slp_model_164_ckpt_30


In [43]:
# _ = simple_sample(slp_model,123,verbose=True,print_score=False)

In [58]:
# Smoke-test the beam sampler on seed 123; `_` discards the returned ABC string
# because verbose=True already prints it.
_ = beamsearch_sample(slp_model,123,verbose=True,print_score=False)

SEED: 123


  1%|          | 4/512 [00:00<?, ?it/s]

[[ 22  65  70  62 113   6 113  49 113   6 113  49 113   6 115 117 124 119
  117 119 115 117   6 117 113 115   6 115  49 124 113  49 113 117 115 113
   49  47  59   6  57   6 124  55   6  53   6  51   6  50   6  48   6  60
    6 124  51   6  51  50  51   6  51  50  51   6  53  55 124  57  55  57
   53  55   6 117 113 115   6 115  49 124 113  49 113 117 115 113  49  47
   59   6  57   6 124  55   6  53   6  51   6  50   6  51  10  18 127  59
   47  49 113 115 113  49 113 117 113  49 113 124 115 113  49  47  59  47
   49 113 115   6 115  49 124 113  49 113 117 115 113  49  47  59   6  57
    6 124  55   6  53   6  51   6  50   6  48   6  60   6 124  59  47  49
  113 115 113  49 113 117 113  49 113 124 115 113  49  47  59  47  49 113
  115   6 115  49 124 113  49 113 117 115 113  49  47  59   6  57   6 124
   55   6  53   6  51   6  50   6  51  10  18  20  21  21]]
X:123
T:123
C:tradformer-ft-depthbeam
Q:1/4=100
L:1/16
M:3/4
K:Cmaj
c2cBc2cBc2de|fefde2ecd2dB|cBcedcBAG2F2|E2D2C2B,2A,2G,2|C2C

In [33]:
def max_repeating_bars(abc):
    """Return how often the most frequent bar occurs in `abc`.

    Bars are extracted from ``abc[70:]`` (the first 70 characters are skipped,
    presumably the X:/T:/C:/Q:/L:/M:/K: header -- confirm for seeds with a
    different number of digits) and each distinct bar is then counted in the
    whole string.

    Args:
        abc: ABC text of one tune.

    Returns:
        The highest occurrence count, or 0 when no bar is found.
    """
    # The two character classes reduce to: one or more characters other than
    # '|' and ':', followed by a single '|', '(', ':' or ')'.
    bars = re.findall(r'[^||\|:]+[\||(:\|)]', abc[70:])
    # str.count gives the same non-overlapping count as findall on the escaped
    # literal; default=0 avoids a ValueError on tunes without any bar.
    return max((abc.count(bar) for bar in set(bars)), default=0)

# o = beamsearch_sample(1038,verbose=True,print_score=True)
# max_repeating_bars(o)

# One note: optional accidental (^ _ =), a pitch letter and at most one octave
# mark (',' after an upper-case letter, "'" after a lower-case one), or 'z'.
# Raw strings keep the backslashes without invalid-escape warnings.
NOTE_REGEX = r"[\^\_\=]?[ABCDEFG][,]?|[\^\_\=]?[abcdefg][']?|z"
# A note that is immediately followed by a one- or two-digit number; the
# lookahead leaves the digits in the text so they can be summed later.
NOTE_NUMBER_REGEX = r"({NOTE})(?=\d\d?)".format(NOTE=NOTE_REGEX)
#CHORD_NUMBER_REGEX = "[({NOTE})(?=\d\d?)".format(NOTE=NOTE_REGEX)
#print(NOTE_NUMBER_REGEX)
def chord_number_sub(matchobj):
    """Replacement callback for `re.sub`: return the first digit of the match.

    Used by `check_bars` on bracketed groups that contain only whitespace and
    digits, so the whole group collapses to a single digit.

    Args:
        matchobj: the `re.Match` passed in by `re.sub`.

    Returns:
        The first digit character found in the matched text.

    Raises:
        IndexError: if the matched text contains no digit.
    """
    digits = re.findall(r'\d', matchobj.group())
    return digits[0]
    
def couple_repeating_bars(abc):
    """Return how often the most frequent pair of consecutive bars occurs in `abc`.

    Pairs are extracted (non-overlapping, left to right) from ``abc[70:]`` --
    the first 70 characters are skipped, presumably the header; confirm -- and
    each distinct pair is then counted in the whole string.

    Args:
        abc: ABC text of one tune.

    Returns:
        The highest occurrence count, or 0 when the tune has no bar pair.
    """
    # Two runs of characters other than '|' and ':' separated by '|', closed by
    # a single '|', '(', ':' or ')'.
    pairs = re.findall(r'[^||\|:]+\|[^||\|:]+[\||(:\|)]', abc[70:])
    # default=0 keeps very short tunes from raising ValueError on an empty max().
    return max((abc.count(pair) for pair in set(pairs)), default=0)
    
def check_bars(abc,verbose=False,bar_units=12):
    """Check that every bar of a tune adds up to `bar_units` duration units.

    Each bar taken from ``abc[70:]`` (the first 70 characters are skipped,
    presumably the header -- confirm) is reduced to a list of integers: a note
    followed by a number contributes that number, a bare note contributes 1,
    an ``X2>Y2`` pair contributes 4, and a bracketed group contributes its
    first digit. The integers are summed and compared with `bar_units`.

    Args:
        abc: ABC text of one tune.
        verbose: print each bar before and after the reduction.
        bar_units: expected sum per bar; the default 12 is the original
            hard-coded value (twelve 1/16 units in a 3/4 bar).

    Returns:
        1 if every bar sums to `bar_units`, 0 at the first bar that does not,
        2 if a bar contains something that cannot be parsed as an integer.
    """
    # NOTE(review): the first character class also excludes the digit '1', so
    # bar text is cut at every '1' (e.g. inside a duration such as 12) --
    # presumably meant to stop at '|1' ending markers; confirm before changing.
    bars = re.findall(r'[^||\|:|\|1]+[\||(:\|)]',abc[70:])

    # Same alternatives as NOTE_REGEX but without the 'z' branch.
    pitch = r"[\^\_\=]?[ABCDEFG][,]?|[\^\_\=]?[abcdefg][']?"
    broken_pair = "(" + pitch + r")2\>(" + pitch + ")2"

    for b in bars:
        try:
            b = b.strip(':|')
            if verbose: print(b)
            b = re.sub(broken_pair,'4',b)                 # X2>Y2 counts as 4 units in total
            b = re.sub(NOTE_NUMBER_REGEX,' ',b)           # drop the note, keep its number
            b = re.sub(NOTE_REGEX,' 1 ',b)                # bare notes count as 1 unit
            b = re.sub(r'\[[\s\d]+\]',chord_number_sub,b) # a bracketed group counts once
            if verbose: print(b)
            s = sum(int(n) for n in b.split())
            if s != bar_units: return 0
        except ValueError as e:
            # Something other than digits and whitespace survived the reduction.
            print(e)
            return 2
    return 1

# prova = output_df.loc[13].abc
# print(prova)
# check_bars(prova,verbose=True)

In [None]:
# Append new outputs: keep sampling consecutive seeds until LEN tunes have
# passed every quality filter, and add them to the existing output file.
# Mode "a" is used so tunes written by earlier runs are not overwritten.
with open("./outputs/slangpolska_1000_beam_ht.txt","a") as o:
    LEN = 103 # number of accepted tunes to add (filters below: repetitions <= 4)
    c = 1+1324 # starting from this seed
    tot = 0
    pbar = tqdm(initial=0,total=LEN)
    try:
        while tot < LEN:
            # beamsearch_sample takes the model as its first argument.
            abc = beamsearch_sample(slp_model,c)

            if abc is None:
                print(c,"no end")
            # check goodness: no single bar repeated more than 4 times
            elif max_repeating_bars(abc) > 4:
                print(c,"too many repetitions")
            # check goodness: no pair of bars repeated more than 4 times
            elif couple_repeating_bars(abc) > 4:
                print(c,"too many couple repetitions")
            # check goodness: every bar must have the expected length
            elif check_bars(abc) != 1:
                print(c,"uneven bars")
            else:
                o.write(abc+"\n\n")
                pbar.update(1)
                tot += 1
            # Every seed is used exactly once, accepted or not.
            c += 1
    finally:
        # Close the bar even if sampling or a filter raises.
        pbar.close()