In [None]:
# This notebook is adapted from the fastai IMDb example at:
# https://github.com/fastai/fastai/blob/master/courses/dl2/imdb.ipynb
from fastai_old.text import *
import html
import spacy 

# Load spaCy's 'en' model; the returned pipeline object is not kept.
# NOTE(review): presumably a fail-fast check that the model is installed — confirm.
spacy.load('en')

# Initialization
Here is where the learner for training is initialized. The first cell implements the scraping of lyrics from Genius.com and the text pre-processing for use later on. The scraper takes an array of artist names, searches for those artists, and, for each artist that the user confirms, downloads their entire discography. Then pre_proccess() separates and sorts the song components into their respective folders in the "data" directory. After that, all the component files are loaded and further preprocessed into word elements.

Before using this model the 'models.tar.gz' file must be downloaded from:

https://github.com/peterspenler/Modelling-Complex-Lyric-Project/releases/tag/v1.0

and extracted into the 'data' folder in the root project directory. This archive contains the trained models necessary for this notebook

In [None]:
#These are the Genius scraping and pre-processing libraries we created
from scraper import get_lyrics
from concatenate_data import pre_proccess

# Scrape the lyrics for the listed artists, then split every song into its components.
artists = ["Kanye", "Jay-Z", "2pac"] #This is an array of artists to scrape
get_lyrics(artists) #This gets the lyrics and saves them in "test_data"
pre_proccess() #This splits the components from each song

In [None]:
# Special tokens prepended to every document before tokenization.
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

# Root folder for the lyric files and the models; created on first run.
DATA_PATH = Path('data/')
DATA_PATH.mkdir(exist_ok=True)

# Second name for the same folder, used by the cells below.
PATH = Path('data/')

In [None]:
# Folder that holds the fine-tuned language model and its intermediate files.
LM_PATH=Path('data/model_lm/')
# parents=True also creates 'data/' when it is missing, so this cell does not
# fail with FileNotFoundError on a fresh checkout.
LM_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# get_texts(path) below loops once per entry of CLASSES and stops reading
# files once it has collected maxn texts.
CLASSES = ['unsup']
maxn = 60000
component = 'verse' #This is the component of the song to train

def get_texts(path):
    """Read the lyric files stored directly under ``path``.

    Every entry matching ``*.*`` is read as UTF-8 text, once per entry of
    ``CLASSES``, until ``maxn`` texts have been collected.

    Args:
        path: ``Path`` of the folder to read.

    Returns:
        A NumPy array with one string per file read.
    """
    texts = []
    for _ in CLASSES:
        # Sorting makes the file order (and so which files survive the maxn
        # cut-off) the same on every run; glob order is otherwise unspecified.
        for fname in sorted(path.glob('*.*')):
            if len(texts) >= maxn:
                break
            # read_text opens and closes the file; the bare open().read()
            # left every handle for the garbage collector to close.
            texts.append(fname.read_text(encoding='utf-8'))
    return np.array(texts)

# Read every file in the folder of the selected song component and show how many were found.
all_texts = get_texts(PATH/component)
len(all_texts)

In [None]:
# Display the first text that was read as a sanity check.
all_texts[0]

In [None]:
# Hold out 10% of the texts for validation. The fixed random_state makes the
# split, and therefore every downstream result, the same on each run.
trn_texts,val_texts = sklearn.model_selection.train_test_split(
    all_texts, test_size=0.1, random_state=42)

len(trn_texts), len(val_texts)

In [None]:
# Wrap each split in a single-column DataFrame for the tokenizing step.
col_names = ['text']
df_trn, df_val = (
    pd.DataFrame({'text': split}, columns=col_names)
    for split in (trn_texts, val_texts)
)

# Display the first training text.
df_trn['text'][0]

In [None]:
# Matches any run of two or more spaces.
re1 = re.compile(r'  +')

def fixup(x):
    """Decode HTML entities in ``x`` and squeeze runs of spaces to one space."""
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)

def get_texts(df):
    """Tokenize the ``'text'`` column of ``df``.

    Each text is prefixed with the BOS/FLD marker, cleaned with ``fixup`` and
    passed to the fastai ``Tokenizer``; the tokenizer's result is returned.

    Note: this definition replaces the earlier ``get_texts(path)`` from here on.
    """
    prefix = f'\n{BOS} {FLD} 1 '
    cleaned = (prefix + df['text'].astype(str)).apply(fixup)
    return Tokenizer().proc_all_mp(partition_by_cores(list(cleaned.values)))

In [None]:
# Tokenize the training split first, then the validation split.
tok_trn, tok_val = get_texts(df_trn), get_texts(df_val)

In [None]:
# Scratch folder for the tokenized texts and the saved vocabulary.
LM_PATH_TMP=Path('data/model_lm/tmp/')
# Create the tmp folder itself; the original call targeted LM_PATH, so this
# folder never existed and the np.save calls below failed.
LM_PATH_TMP.mkdir(parents=True, exist_ok=True)

In [None]:
# Cache the tokenized splits on disk so tokenization does not have to be repeated.
np.save(LM_PATH_TMP.joinpath('tok_trn.npy'), tok_trn)
np.save(LM_PATH_TMP.joinpath('tok_val.npy'), tok_val)

In [None]:
# Reload the cached token lists. They are stored as pickled object arrays
# (one Python list per text), which np.load refuses to read unless
# allow_pickle=True is passed (the default became False in NumPy 1.16.3).
# The files are written by this notebook, so unpickling them is safe.
tok_trn = np.load(LM_PATH/'tmp'/'tok_trn.npy', allow_pickle=True)
tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy', allow_pickle=True)

In [None]:
# Count how often each token occurs across all training texts and show the 15 most common.
freq = Counter(token for doc in tok_trn for token in doc)
freq.most_common(15)

In [None]:
# Vocabulary limits: at most max_vocab tokens are considered, and a token is
# kept only if it occurs more than min_freq times.
max_vocab = 60000
min_freq = 2

Here is where an integer mapping is made for each word unit. These integers are what the model is then trained on. A string mapping is also made to convert the outputted integers back into their respective words.

In [None]:
# Keep the max_vocab most frequent tokens that occur strictly more than
# min_freq times, so obscure words that can't be trained well are left out.
# '_unk_' takes index 0 and '_pad_' index 1.
itos = ['_unk_', '_pad_'] + [tok for tok, n in freq.most_common(max_vocab) if n > min_freq]

In [None]:
# Reverse mapping token -> index; unknown tokens map to 0.
stoi = collections.defaultdict(lambda: 0, {word: idx for idx, word in enumerate(itos)})
len(itos)

In [None]:
# Replace every token by its vocabulary index, one list of ids per text.
trn_lm = np.array([[stoi[tok] for tok in doc] for doc in tok_trn])
val_lm = np.array([[stoi[tok] for tok in doc] for doc in tok_val])

In [None]:
# vs is the vocabulary size; shown together with the number of training texts.
vs=len(itos)
vs,len(trn_lm)

At this point the pre-trained wt103 model is loaded to act as the base for the new model we're going to train

In [None]:
from pathlib import PosixPath
# em_sz is the embedding width used for the new weight matrix below; all three
# values are passed to md.get_model.
# NOTE(review): nh and nl are presumably hidden size and layer count — confirm.
em_sz,nh,nl = 400,1150,3

# Path (already used for the other folders) instead of PosixPath, which cannot
# be instantiated on Windows.
PRE_PATH = Path('data/models')
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

# map_location returns the storage unchanged, i.e. the tensors stay on the CPU.
# NOTE: torch.load unpickles the file; only load weights from a trusted download.
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [None]:
# Pretrained embedding matrix, and its mean row as a stand-in for unseen words.
enc_wgts = to_np(wgts['0.encoder.weight'])
row_m = enc_wgts.mean(0)

# Vocabulary of the pretrained model. The with-block closes the file, which
# the bare .open('rb') never did.
# NOTE: pickle.load runs arbitrary code from the file; only use a trusted download.
with (PRE_PATH/'itos_wt103.pkl').open('rb') as f:
    itos2 = pickle.load(f)
# Words missing from the pretrained vocabulary map to -1.
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

len(itos2)

In [None]:
# Tokens of the lyrics vocabulary that do not occur in the pretrained vocabulary;
# shows how many there are and a sample of ten.
oov = set(itos) - set(itos2).intersection(itos)
len(oov), list(oov)[0:10]

At this point we are loading in the pre-trained weights for the words in our lyric corpus, which also appear in the wt103 corpus

In [None]:
# Build the embedding matrix for the lyrics vocabulary, one row per token.
new_w = np.zeros((vs, em_sz), dtype=np.float32)
for row, word in enumerate(itos):
    pre_idx = stoi2[word]  # index in the pretrained vocabulary, -1 when absent
    if pre_idx >= 0:
        new_w[row] = enc_wgts[pre_idx]  # reuse the pretrained embedding
    else:
        new_w[row] = row_m              # unseen word: use the mean embedding

In [None]:
# Put the remapped matrix into the three entries of the state dict that are
# replaced here; the second and third get their own copies of the array.
wgts['0.encoder.weight'] = T(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w))

Finally we actually create the learner we will be training with

In [None]:
# Training settings: wd is passed as wds= to learner.fit, bptt and bs go to the
# data loaders, and opt_fn is handed to md.get_model.
wd=1e-7
bptt=70
bs=32
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [None]:
# np.concatenate joins the per-text id lists into one long stream for each loader.
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
# NOTE(review): the literal 1 is presumably the padding index ('_pad_' is index 1 in itos) — confirm.
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [None]:
# Five dropout rates, all scaled by 0.7; they are passed to md.get_model in the next cell.
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7

In [None]:
# Build the learner with the five dropout rates from `drops`.
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

# Start from the pretrained weights with the remapped embedding matrix.
learner.model.load_state_dict(wgts)

Here we have two functions for generating text to test the progress of our model. generate_text2() always picks the next word from a distribution, but generate_text3() sometimes picks the top result to try and increase the uniformity of the output.

In [None]:
def generate_text2(m, s, l=20):
    """Print the prompt ``s`` followed by ``l`` tokens sampled from model ``m``.

    Every step draws the next token from the model's output distribution.

    Args:
        m: the language model, e.g. ``learner.model``.
        s: prompt text; it is lower-cased, split on whitespace and each word
            is looked up in ``stoi``.
        l: number of tokens to generate.
    """
    m[0].bs=1  # Set batch size to 1
    m.eval()  # Turn off dropout
    m.reset()  # Reset hidden state
    m[0].bs=bs  # Put the batch size back to what it was

    ss = s.lower().split()
    si = [stoi[w] for w in ss]
    # fastai's RNN encoder reads its input as (sequence length, batch size), so
    # the prompt must be a column: one sequence of len(si) tokens. The previous
    # (1, len(si)) layout fed each word as a separate one-token sequence, so
    # the prompt gave the model no context.
    t = torch.autograd.Variable(torch.cuda.LongTensor(np.array(si).reshape(-1, 1)))

    res,*_ = m(t)

    print(s, end=' ')
    for i in range(l):
        n = torch.multinomial(res[-1].exp(), 3)  # drawing from probability distribution
        n = n[1] if n.data[0]==0 else n[0]  # index 0 is '_unk_': use the second draw instead
        # Decode with itos, the lyrics vocabulary the model was remapped to;
        # itos2 is the pretrained vocabulary and its indices do not match.
        print(itos[int(n)], end=' ')
        res,*_ = m(n.unsqueeze(0).unsqueeze(0))  # sometimes need an extra .unsqueeze(0)
    print('...')
    
def generate_text3(m, s, l=20):
    """Print the prompt ``s`` and generated tokens up to a line break.

    About 5% of the steps take a token from the five highest-scoring ones;
    the other steps sample from the output distribution. Generation stops at
    the first ``'\\n'`` token produced after more than ``l`` steps.

    Args:
        m: the language model, e.g. ``learner.model``.
        s: prompt text; it is lower-cased, split on whitespace and each word
            is looked up in ``stoi``.
        l: minimum number of steps before a line break ends the output.
    """
    m[0].bs=1  # Set batch size to 1
    m.eval()  # Turn off dropout
    m.reset()  # Reset hidden state
    m[0].bs=bs  # Put the batch size back to what it was

    ss = s.lower().split()
    si = [stoi[w] for w in ss]
    # fastai's RNN encoder reads its input as (sequence length, batch size), so
    # the prompt must be a column: one sequence of len(si) tokens. The previous
    # (1, len(si)) layout fed each word as a separate one-token sequence, so
    # the prompt gave the model no context.
    t = torch.autograd.Variable(torch.cuda.LongTensor(np.array(si).reshape(-1, 1)))

    res,*_ = m(t)

    print(s, end=' ')
    count = 0
    # NOTE(review): the loop only ends on a '\n' token, so it runs for as long
    # as the model does not produce one.
    while True:
        if np.random.choice([0,1], p=[0.05,0.95]) == 0:
            n = res[-1].topk(5)[1]  # top word
        else:
            n = torch.multinomial(res[-1].exp(), 10)  # drawing from probability distribution
        n = n[1] if n.data[0]==0 else n[0]  # index 0 is '_unk_': use the second candidate instead
        if itos[int(n)] == '\n' and count > l:
            print('')
            break
        # The BOS/FLD marker tokens are fed back to the model but not printed.
        if not any (x in itos[int(n)] for x in ['xbos', 'xfld']):
            print(itos[int(n)], end=' ')
        res,*_ = m(n.unsqueeze(0).unsqueeze(0))  # sometimes need an extra .unsqueeze(0)
        count += 1

In [None]:
# Shorthand for the model held by the learner; the generate_text* helpers take it as `m`.
m=learner.model

In [None]:
# Sample 20 tokens before any fine-tuning, as a baseline.
generate_text2(m, "The")

# Training
Here we first start with a few training epochs to see a preliminary result of training our lyric data onto the pretrained model. We use learner.lr_find() to try and find an ideal learning rate, and learner.fit() to actually train the model.

In [None]:
# Report accuracy during training.
learner.metrics = [accuracy]
# NOTE(review): freeze_to(-1) presumably leaves only the last layer group trainable — confirm against fastai.
learner.freeze_to(-1)

In [None]:
# Base learning rate; lrs is the value handed to learner.fit and learner.lr_find below.
lr=1e-3
lrs = lr

In [None]:
# First training run, at half the base learning rate, one cycle with cycle_len=5.
learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=5)

In [None]:
# Save the lyrics vocabulary next to the pretrained files, then checkpoint the learner.
itos_song = itos
with open(PRE_PATH/'itos_song.pkl', 'wb') as f:
    pickle.dump(itos_song, f)
learner.save('lm_last_ft')

In [None]:
# Reload the checkpoint saved in the previous cell.
learner.load('lm_last_ft')

In [None]:
# Refresh the shorthand for the learner's model.
m=learner.model

In [None]:
# Sample 50 tokens to see the effect of the first training run.
generate_text2(m, "the", l=50)

Now we'll train for another 10 epochs to see how much the model can improve

In [None]:
# Unfreeze the model, then run the learning-rate finder linearly from lrs/10 to lrs*10.
learner.unfreeze()
learner.lr_find(start_lr=lrs/10, end_lr=lrs*10, linear=True)

In [None]:
# Plot the result of the learning-rate finder run above.
learner.sched.plot()

In [None]:
# Second training run at the base learning rate, one cycle with cycle_len=10.
learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=10)

In [None]:
# Checkpoint the model and sample 150 tokens from it.
# NOTE(review): the checkpoint is named 'lm_3epochs' although the preceding fit call uses cycle_len=10.
learner.save('lm_3epochs')
m=learner.model
generate_text2(m, "the way it is", l=150)

Now that we've seen a bit of improvement we're going to go for the real deal and train the model for 30 epochs. This will give us a final model that should produce lyrics that are clearly within the style of our genre. Performing this step with 30 epochs can start to overfit the model if the training set is not large enough so num_epochs should be adjusted depending on the training set size.

In [None]:
# Final training run; lower num_epochs for a small training set.
num_epochs = 30
learner.lr_find(start_lr=lrs/10, end_lr=lrs*10, linear=True)
learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=num_epochs)
# NOTE(review): the checkpoint name hard-codes '30epochs', 'verse' and 'country';
# it does not follow num_epochs or `component` — confirm this is intended.
learner.save('lm_30epochs-verse-country')

In [None]:
# Sample from the final model with generate_text3, which stops at a line break after more than 150 steps.
m=learner.model
generate_text3(m, "this day they play", l=150)

Finally we ensure that the model and word mapping are saved so that we can use them for generation later

In [None]:
# Save the final weights and the matching vocabulary for later generation.
learner.save('lm_30epochs-new-model')
# The with-block closes the file, so the pickle is fully flushed to disk;
# the bare open() handle was never closed.
with open(LM_PATH/'tmp'/'itos-new-model.pkl', 'wb') as f:
    pickle.dump(itos, f)