# Configuration

In [1]:
from transformers.pipelines.text_generation import TextGenerationPipeline
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import pandas as pd
import numpy as np
from transformers import LongformerTokenizer, TFLongformerForMaskedLM, pipeline
from transformers.file_utils import PushToHubMixin
import json
from collections import Counter

In [2]:
# Check that we're using a GPU
print(len(tf.config.list_physical_devices("GPU")))

1


# Loading Data and Models

In [3]:
data_raw = pd.read_json('cleanedRapLyrics.json', encoding='utf-8')
data_raw['Lyrics'] = data_raw['Lyrics'].astype('unicode').astype('str')

In [6]:
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = TFLongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")

All model checkpoint layers were used when initializing TFLongformerForMaskedLM.

All the layers of TFLongformerForMaskedLM were initialized from the model checkpoint at allenai/longformer-base-4096.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerForMaskedLM for predictions without further training.


# EDA/Cleaning

In [5]:
data_raw.shape

(28066, 5)

In [16]:
data_raw.head()

Unnamed: 0,SongId,Artist,Album,Song,Lyrics
0,1,Nate Dogg ft. Kurupt,G-Funk Classics Vol. 1,First We Pray,"Black people dont have no, no where to go \nyo..."
1,2,Nate Dogg ft. Daz,G-Funk Classics Vol. 1/Gang Related soundtrack,These Days,"(Chorus) x2 \nThese days, you gotta be strappe..."
2,3,Nate Dogg,G-Funk Classics Vol. 1 & 2,G-Funk,Chorus: \n\nG is for the gang of money I make ...
3,4,Nate Dogg,G-Funk Classics Vol. 1 (Ghetto Preacher),"Crazy, Dangerous",[Nate Dogg] (Nancy Fletcher) \nNow life for me...
4,5,Nate Dogg,G-Funk Classics Vol. 1,The Hardest Man in Town,"Will the hardest man please stand, the homie s..."


In [7]:
data_raw['Lyrics'] = data_raw['Lyrics'].astype('unicode').astype('object').astype('string')
data_raw = data_raw.drop(columns = ['SongId'])

Due to memory constraints, we can only train on a subset of the dataset. Let's pick some well-known rappers with good lyrics.

In [8]:
eminem = data_raw[data_raw['Artist'] == 'Eminem']
print(eminem.shape)
eminem.head(2)

(150, 4)


Unnamed: 0,Artist,Album,Song,Lyrics
845,Eminem,Infinite,Infinite,"Oh yeah, this is Eminem baby, back up in that ..."
850,Eminem,Infinite,Tonight,"[Women singing] Tonight, Tonight, Tonight, To..."


In [9]:
pac2 = data_raw[data_raw['Artist'].str.contains('2Pac')]
pac2 = pd.concat([pac2, data_raw[data_raw['Artist'].str.contains('Tupac Shakur')]])
print(pac2.shape)
pac2.head(2)

(145, 4)


Unnamed: 0,Artist,Album,Song,Lyrics
9,Nate Dogg ft. 2Pac,G Funk Classics Vol. 1,Me and My Homies,(Chorus) Me and my homies tho you know we ki...
91,"2Pac ft. Nate Dogg, YGD Tha Top Dawg",Greatest Hits,All About U,You probably crooked as the last trick Wanna ...


In [10]:
kanye = data_raw[data_raw['Artist'].str.contains('Kanye')]
print(kanye.shape)
kanye.head(2)

(98, 4)


Unnamed: 0,Artist,Album,Song,Lyrics
759,"A$AP Rocky ft. Joe Fox, Kanye West",At.Long.Last.A$AP,Jukebox Joints,"[Chorus: Joe Fox] And I'm a man of my word, t..."
1741,"Chris Brown ft. Andr_ 3000, Drake, Fabolous, K...","Deuces (Remix) 12""",Deuces (Remix),[Verse One: Drake] What you mean I ain't call...


In [11]:
icecube = data_raw[data_raw['Artist'].str.contains('N.W.A.')]
icecube = pd.concat([icecube, data_raw[data_raw['Artist'].str.contains('Ice Cube')]])
print(icecube.shape)
icecube.head(2)

(94, 4)


Unnamed: 0,Artist,Album,Song,Lyrics
10791,N.W.A. (Eazy-E),N.W.A. & The Posse,Boyz-N-The-Hood,[Eazy-E] Cruising down the street in my 6-4 ...
10792,N.W.A.,N.W.A. and The Posse,8 Ball,(Flavor Flav) Kick that shit! Kick that shit...


In [12]:
jayz = data_raw[data_raw['Artist'].str.contains('Jay-Z')]
print(jayz.shape)
jayz.head(2)

(55, 4)


Unnamed: 0,Artist,Album,Song,Lyrics
192,"Snoop Dogg ft. Jay-Z, Nate Dogg, Soopafly",Paid Tha Cost to be Da Bo$$,Lollipop,"Just Blaze! [Snoop] Ehehe, oh really? You ..."
1173,"Eminem ft. 50 Cent, Ca$his, Dr. Dre, Jay-Z, St...","Syllables 12""",Syllables,[Intro] It is not about lyrics anymore! It's...


In [13]:
good_rappers = pd.concat([pac2, kanye, icecube, jayz]).reset_index() 
# the model started overfitting to Eminem so we got rid of him
good_rappers.shape

(392, 5)

In [14]:
IDS = good_rappers

# Small vocabulary/Transformer approach (unfinished)
This will only tokenize the ```small_vocab_size``` most common words in the dataset and ignore everything else. We're also throwing away most punctuation and capitalization. (All we really care about is the words.)

In [259]:
small_vocab_size = 5000

In [272]:
words = []
bads = ''.join([chr(char) for char in range(1, 32)])
bads += '(' + ')' + '[' + ']' + '{' + '}' + '+' + '\"' + r'"' + 'w/' + '.' + ',' + '!' + '?' + '-' + ':' + '~' + '_' + '=' + '|' + '>' + '<' + ';'
for song in IDS.Lyrics:
    song = song.translate(str.maketrans('','',bads)).lower()
    words += song.split(' ') + [' ']

In [273]:
word_counter = Counter(words)
common_vocab = word_counter.most_common(n=small_vocab_size)

In [276]:
fout = open('./common_vocab.txt', 'w')
for word in common_vocab:
    fout.write(word[0])
    fout.write('\n')

The following transformer code comes from https://keras.io/examples/generative/text_generation_with_miniature_gpt/.

In [280]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Mask the upper half of the dot product matrix in self attention.
    This prevents flow of information from future tokens to current token.
    1's in the lower triangle, counting from the lower right corner.
    """
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    m = i >= j - n_src + n_dest
    mask = tf.cast(m, dtype)
    mask = tf.reshape(mask, [1, n_dest, n_src])
    mult = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(mask, mult)


class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size = input_shape[0]
        seq_len = input_shape[1]
        causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, tf.bool)
        attention_output = self.att(inputs, inputs, attention_mask=causal_mask)
        attention_output = self.dropout1(attention_output)
        out1 = self.layernorm1(inputs + attention_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

In [281]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [283]:
vocab_size = 6000
maxlen = 64  # Max sequence size
embed_dim = 256  # Embedding size for each token
num_heads = 2  # Number of attention heads
feed_forward_dim = 256  # Hidden layer size in feed forward network inside transformer


def build_transformer():
    inputs = layers.Input(shape=(maxlen,), dtype=tf.int32)
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim)
    x = transformer_block(x)
    outputs = layers.Dense(vocab_size)(x)
    model = keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        "adam", loss=[loss_fn, None],
    )  # No loss and optimization based on word embeddings from transformer block
    return model

In [None]:
batch_size = 128

# Create a dataset from text files
text_ds = tf.data.TextLineDataset(filenames)
text_ds = text_ds.shuffle(buffer_size=256)
text_ds = text_ds.batch(batch_size)

# Create a vectorization layer and adapt it to the text
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size - 1,
    output_mode="int",
    output_sequence_length=maxlen + 1,
)
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices


def prepare_lm_inputs_labels(text):
    """
    Shift word sequences by 1 position so that the target for position (i) is
    word at position (i+1). The model will use all words up till position (i)
    to predict the next word.
    """
    text = tf.expand_dims(text, -1)
    tokenized_sentences = vectorize_layer(text)
    x = tokenized_sentences[:, :-1]
    y = tokenized_sentences[:, 1:]
    return x, y


text_ds = text_ds.map(prepare_lm_inputs_labels)
text_ds = text_ds.prefetch(tf.data.AUTOTUNE)

# Build Longformer and Models

In [33]:
# Params
MAX_LEN = 64
LR = 1e-4
LYRICS_INPUT_LEN = 64
VOCAB_SIZE = 50265 # Do not change!
EPOCHS = 100
BATCH_SIZE = 1 # EPOCHS * BATCH_SIZE gives number of songs used in training
LRS = [2.5e-4]*EPOCHS 

In [16]:
def build_model():
    
    tokens = tf.keras.layers.Input(shape=(MAX_LEN,), name = 'tokens', dtype=tf.int32)
    attention = tf.keras.layers.Input(shape=(MAX_LEN,), name = 'attention', dtype=tf.int32)
    # Create a model using tokens and attention as layers
    x = model(tokens, attention_mask=attention)
    # Create a model using tokens and attention layers as input and 
    # previously built model as output
    model2 = tf.keras.Model(inputs=[tokens,attention], outputs=x)
    model2.compile(optimizer = tf.keras.optimizers.Adam(lr = LR),
                  loss = [tf.keras.losses.CategoricalCrossentropy()],
                  metrics = [tf.keras.metrics.CategoricalAccuracy()])
    
    return model2

In [17]:
def build_better_model():
    # Same as build_model but without attention
    tokens = tf.keras.layers.Input(shape=(MAX_LEN,), name = 'tokens', dtype=tf.int32)
    # Create a model using tokens
    x = model(tokens)
    model2 = tf.keras.Model(inputs=[tokens], outputs=x)
    model2.compile(optimizer = tf.keras.optimizers.Adam(),
                  loss = [tf.keras.losses.CategoricalCrossentropy()],
                  metrics = [tf.keras.metrics.CategoricalAccuracy()])
    
    return model2

In [18]:
# number of rows * max length of lyrics taken from each row
train_tokens = np.zeros((len(IDS),MAX_LEN), dtype='int32')
train_attention = np.zeros((len(IDS),MAX_LEN), dtype='int32')
targets = np.zeros((len(IDS),MAX_LEN, VOCAB_SIZE), dtype='int32')

In [19]:
# Create an array of token arrays from lyrics in data
def createTrainingArray(data):
  for _index, row in data.iterrows():
    tokens = tokenizer(row['Lyrics'], return_tensors="tf", 
                     is_split_into_words = True, add_prefix_space = True, 
                     max_length = MAX_LEN, padding="max_length")
    train_tokens[_index,] = tokens['input_ids'][0][0:MAX_LEN]
    train_attention[_index,] = [1]*MAX_LEN
    #tokens['attention_mask'][0][0:MAX_LEN]
    # Set targets to one-hot encoded array
    for i in range(0, MAX_LEN):
      temp = np.zeros(VOCAB_SIZE, dtype=np.int8)
      if i+1 < len(tokens['input_ids'][0][0:MAX_LEN]):
        temp[tokens['input_ids'][0][i+1]] = 1
      targets[_index][i] = temp      
    # targets[_index, ] = np.zeros((MAX_LEN, VOCAB_SIZE), dtype='int32')  

In [20]:
createTrainingArray(good_rappers)

In [21]:
def lrfn(epoch):
    return LRS[epoch]
lr_callback = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)

In [22]:
def checkpoint(epoch):
    if epoch % 10 == 0:
        model3.save_weights(f'./checkpoints/{epoch}epochs')
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = './checkpoints',
    save_weights_only = True,
)

In [23]:
np.random.seed(42)
# Split into train and validation
train_idx = np.random.choice(np.arange(len(IDS)),int(0.9*len(IDS)),replace=False)
valid_idx = np.setdiff1d(np.arange(len(IDS)),train_idx)
np.random.seed(None)
print('Train size',len(train_idx),', Valid size',len(valid_idx))

Train size 352 , Valid size 40


# Load Models

In [24]:
model3 = build_model()
model3.load_weights('./checkpoints/450epochs')

model4 = build_better_model()
model4.load_weights('./checkpoints/200epochsm4')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1dd0a9c7ee0>

# Train Models

In [25]:
model3.fit(x = [train_tokens[train_idx,], train_attention[train_idx,]],
          y = targets[train_idx,],
          validation_data = ([train_tokens[valid_idx,], train_attention[valid_idx,]],
                             targets[valid_idx,]),
          callbacks = [lr_callback, checkpoint_callback],
          epochs = EPOCHS,
          batch_size = BATCH_SIZE,
          verbose = 2)


Epoch 00001: LearningRateScheduler reducing learning rate to 0.00025.
Epoch 1/100




ResourceExhaustedError:  OOM when allocating tensor with shape[24,513,512] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node functional_1/tf_longformer_for_masked_lm_1/longformer/encoder/layer_._4/attention/self/Pad (defined at C:\Users\Michael\anaconda3\envs\tf-gpu\lib\site-packages\transformers\models\longformer\modeling_tf_longformer.py:1125) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_969603]

Errors may have originated from an input operation.
Input Source operations connected to node functional_1/tf_longformer_for_masked_lm_1/longformer/encoder/layer_._4/attention/self/Pad:
 functional_1/tf_longformer_for_masked_lm_1/longformer/encoder/layer_._4/attention/self/Const (defined at C:\Users\Michael\anaconda3\envs\tf-gpu\lib\site-packages\transformers\models\longformer\modeling_tf_longformer.py:940)	
 functional_1/tf_longformer_for_masked_lm_1/longformer/encoder/layer_._4/attention/self/einsum/Einsum (defined at C:\Users\Michael\anaconda3\envs\tf-gpu\lib\site-packages\transformers\models\longformer\modeling_tf_longformer.py:937)

Function call stack:
train_function


In [None]:
model4.fit(x = [train_tokens[train_idx,]],
          y = targets[train_idx,],
          validation_data = ([train_tokens[valid_idx,]],
                             targets[valid_idx,]),
          callbacks = [checkpoint_callback],
          epochs = EPOCHS,
          batch_size = BATCH_SIZE,
          verbose = 2)

Epoch 1/100




# Generate lyrics!

In [174]:
# DO NOT READ!!! Common curse words
expletives = ["fuck", "shit", "ass", "bitch", "whore", "slut", "nig"]
# These are tokens that we know are bad
badTokens = [1437, 50118, 22886, 35625, 524, 1185, 1009, 14783, 4771, 2409, 27741, 216, 7586]
bannedTokens = badTokens

In [175]:
def getOutputWord(logits, _tokenizer):
  # Get the words formed by the highest logit tokenizer outputs
    words = ""
    for i in range(0, len(logits)):
        words += str(_tokenizer.decode(logits[i], skip_special_tokens=True))
    
    wordList = words.split(' ')
    for i in range(0, len(wordList)):
        for expletive in expletives:
            if expletive in wordList[i].lower():
                wordList[i] = r"[EXPLETIVE]"
                break
    words = ''
    for word in wordList:
        words += word + " "
    return words

In [176]:
def generateLogitsModel3(prompt):
    p = []
    pure = tokenizer.encode(prompt)[1:-1]
    for i in range(len(prompt), 64):
        temp = []
        for j in range(0, 64):
            if(j < len(pure)):
                temp.append(pure[j])
            else:
                temp.append(50624)
        p = model3.predict([np.array([temp]), np.array([[1]*64])], batch_size=16, verbose=2).logits
        tempe = 0
        tsum = 0
        sca = 0.03
        k = 0
        max_check = -9999
        while k < 50265:
            if p[0][i][k] > max_check and k not in bannedTokens:
                max_check = p[0][i][k]
            k += 1
        k = 0
        while k < 50265:
            # print(p[0][i][k])
            if k not in bannedTokens:
                tsum += np.exp(sca*(p[0][i][k]-max_check))
            k += 1
        cutoff = np.random.rand()*tsum
        result = -1
        k = 0
        print(i)
        # print(max_check)
        # print(tsum)
        while k < 50265:
            # print(p[0][i][k])
            if k not in bannedTokens:
                tempe += np.exp(sca*(p[0][i][k]-max_check))
                if tempe >= cutoff:
                    result = k
                    break
            k += 1
        # print(tempe)
        pure.append(result)
    return pure

In [177]:
def generateLogitsModel4(prompt):
    p = []
    pure = tokenizer.encode(prompt)[1:-1]
    for i in range(len(prompt), 64):
        temp = []
        for j in range(0, 64):
            if(j < len(pure)):
                temp.append(pure[j])
            else:
                temp.append(50624)
        p = model4.predict([np.array([temp])], batch_size=16, verbose=2).logits
        tempe = 0
        tsum = 0
        sca = 0.03
        k = 0
        max_check = -9999
        while k < 50265:
            if p[0][i][k] > max_check and k not in bannedTokens:
                max_check = p[0][i][k]
            k += 1
        k = 0
        while k < 50265:
            # print(p[0][i][k])
            if k not in bannedTokens:
                tsum += np.exp(sca*(p[0][i][k]-max_check))
            k += 1
        cutoff = np.random.rand()*tsum
        result = -1
        k = 0
        print(i)
        # print(max_check)
        # print(tsum)
        while k < 50265:
            # print(p[0][i][k])
            if k not in bannedTokens:
                tempe += np.exp(sca*(p[0][i][k]-max_check))
                if tempe >= cutoff:
                    result = k
                    break
            k += 1
        # print(tempe)
        pure.append(result)
    return pure

In [178]:
m3logits = generateLogitsModel3("Ok ok ok")

1/1 - 0s
8
1/1 - 0s
9
1/1 - 0s
10
1/1 - 0s
11
1/1 - 0s
12
1/1 - 0s
13
1/1 - 0s
14
1/1 - 0s
15
1/1 - 0s
16
1/1 - 0s
17
1/1 - 0s
18
1/1 - 0s
19
1/1 - 0s
20
1/1 - 0s
21
1/1 - 0s
22
1/1 - 0s
23
1/1 - 0s
24
1/1 - 0s
25
1/1 - 0s
26
1/1 - 0s
27
1/1 - 0s
28
1/1 - 0s
29
1/1 - 0s
30
1/1 - 0s
31
1/1 - 0s
32
1/1 - 0s
33
1/1 - 0s
34
1/1 - 0s
35
1/1 - 0s
36
1/1 - 0s
37
1/1 - 0s
38
1/1 - 0s
39
1/1 - 0s
40
1/1 - 0s
41
1/1 - 0s
42
1/1 - 0s
43
1/1 - 0s
44
1/1 - 0s
45
1/1 - 0s
46
1/1 - 0s
47
1/1 - 0s
48
1/1 - 0s
49
1/1 - 0s
50
1/1 - 0s
51
1/1 - 0s
52
1/1 - 0s
53
1/1 - 0s
54
1/1 - 0s
55
1/1 - 0s
56
1/1 - 0s
57
1/1 - 0s
58
1/1 - 0s
59
1/1 - 0s
60
1/1 - 0s
61
1/1 - 0s
62
1/1 - 0s
63


In [179]:
m4logits = generateLogitsModel4("Ok")

1/1 - 0s
2
1/1 - 0s
3
1/1 - 0s
4
1/1 - 0s
5
1/1 - 0s
6
1/1 - 0s
7
1/1 - 0s
8
1/1 - 0s
9
1/1 - 0s
10
1/1 - 0s
11
1/1 - 0s
12
1/1 - 0s
13
1/1 - 0s
14
1/1 - 0s
15
1/1 - 0s
16
1/1 - 0s
17
1/1 - 0s
18
1/1 - 0s
19
1/1 - 0s
20
1/1 - 0s
21
1/1 - 0s
22
1/1 - 0s
23
1/1 - 0s
24
1/1 - 0s
25
1/1 - 0s
26
1/1 - 0s
27
1/1 - 0s
28
1/1 - 0s
29
1/1 - 0s
30
1/1 - 0s
31
1/1 - 0s
32
1/1 - 0s
33
1/1 - 0s
34
1/1 - 0s
35
1/1 - 0s
36
1/1 - 0s
37
1/1 - 0s
38
1/1 - 0s
39
1/1 - 0s
40
1/1 - 0s
41
1/1 - 0s
42
1/1 - 0s
43
1/1 - 0s
44
1/1 - 0s
45
1/1 - 0s
46
1/1 - 0s
47
1/1 - 0s
48
1/1 - 0s
49
1/1 - 0s
50
1/1 - 0s
51
1/1 - 0s
52
1/1 - 0s
53
1/1 - 0s
54
1/1 - 0s
55
1/1 - 0s
56
1/1 - 0s
57
1/1 - 0s
58
1/1 - 0s
59
1/1 - 0s
60
1/1 - 0s
61
1/1 - 0s
62
1/1 - 0s
63


In [183]:
print(getOutputWord(m3logits, tokenizer))

Ok ok ok [EXPLETIVE] like can canThe wanna like mother Eminem man get do	 got do get say got like like Eminem got can go like man like like get like got like Eminem like	 like like	 doVer get got like startKick get like never like{ like [EXPLETIVE] got got 


In [182]:
print(getOutputWord(m4logits, tokenizer))

Ok [EXPLETIVE] can get like mother can like	 can like got yeah like Eminem got can never Eminem unimaginable	 right doemate	Ver mother do love can canVer Lose go likeH can never say feel like [EXPLETIVE] can Bristol got get	 [EXPLETIVE] like can get canThat can got		Ver get like don Eminem 
