In [82]:
import os
import numpy as np
import tensorflow as tf

# Paths of the two text files that together form the training corpus.
file = "merged_lyrics_metalcore.txt"
file_augmented = "merged_lyrics_metalcore_augmented.txt"

# Read both files inside a context manager so the handles are closed
# deterministically instead of being left open until garbage collection.
with open(file, "r") as lyrics_file, open(file_augmented, "r") as augmented_file:
    lyrics = lyrics_file.read() + augmented_file.read()

print('Length of text: {} characters'.format(len(lyrics)))

Length of text: 2033922 characters


In [83]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(lyrics))
print('{} unique characters'.format(len(chars)))

# Lookup tables in both directions: character -> id and id -> character.
char2idx = {char: index for index, char in enumerate(chars)}
idx2char = np.array(chars)
print(char2idx)

50 unique characters
{'\n': 0, ' ': 1, '!': 2, '"': 3, '&': 4, "'": 5, '*': 6, ',': 7, '-': 8, '.': 9, '/': 10, '0': 11, '1': 12, '2': 13, '3': 14, '4': 15, '5': 16, '6': 17, '7': 18, '8': 19, '9': 20, ':': 21, ';': 22, '?': 23, 'a': 24, 'b': 25, 'c': 26, 'd': 27, 'e': 28, 'f': 29, 'g': 30, 'h': 31, 'i': 32, 'j': 33, 'k': 34, 'l': 35, 'm': 36, 'n': 37, 'o': 38, 'p': 39, 'q': 40, 'r': 41, 's': 42, 't': 43, 'u': 44, 'v': 45, 'w': 46, 'x': 47, 'y': 48, 'z': 49}


In [84]:
# Encode the whole corpus as an array of vocabulary ids.
lyrics_as_int = np.array([char2idx[symbol] for symbol in lyrics])

# Print the mapping one entry per line; repr() keeps whitespace characters
# such as '\n' visible.
for char, idx in char2idx.items():
    print('{}: {}'.format(repr(char), idx))

'\n': 0
' ': 1
'!': 2
'"': 3
'&': 4
"'": 5
'*': 6
',': 7
'-': 8
'.': 9
'/': 10
'0': 11
'1': 12
'2': 13
'3': 14
'4': 15
'5': 16
'6': 17
'7': 18
'8': 19
'9': 20
':': 21
';': 22
'?': 23
'a': 24
'b': 25
'c': 26
'd': 27
'e': 28
'f': 29
'g': 30
'h': 31
'i': 32
'j': 33
'k': 34
'l': 35
'm': 36
'n': 37
'o': 38
'p': 39
'q': 40
'r': 41
's': 42
't': 43
'u': 44
'v': 45
'w': 46
'x': 47
'y': 48
'z': 49


In [85]:
# Number of characters in each training input.
seq_length = 120
# Each chunk below holds seq_length + 1 characters (the input plus the
# one-step-shifted target), so that is the divisor that gives the number of
# whole chunks available in the corpus.
examples_per_epoch = len(lyrics) // (seq_length + 1)

# Stream of single character ids, regrouped into fixed-size chunks; a trailing
# chunk shorter than seq_length + 1 is dropped.
char_dataset = tf.data.Dataset.from_tensor_slices(lyrics_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

In [86]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair using split_input_target.
dataset = sequences.map(split_input_target)

In [87]:
# Sanity check: decode the first (input, target) pair back to text to confirm
# that the target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    decoded_input = ''.join(idx2char[input_example.numpy()])
    decoded_target = ''.join(idx2char[target_example.numpy()])
    print('Input data: ', repr(decoded_input))
    print('Target data:', repr(decoded_target))

Input data:  "bottle up the pain\nlet it rain and wash away\nlet it carry you to a better place\ndon't forget my name\nmy mind often wonde"
Target data: "ottle up the pain\nlet it rain and wash away\nlet it carry you to a better place\ndon't forget my name\nmy mind often wonder"


In [88]:
# Number of (input, target) sequences per training batch.
BATCH_SIZE = 128

# Size of the shuffle buffer. tf.data is designed to work with possibly
# infinite sequences, so it does not shuffle the whole dataset in memory;
# it shuffles elements within a buffer of this size instead.
BUFFER_SIZE = 10000

# Shuffle first, then group into full batches (a trailing partial batch is
# dropped). NOTE: `dataset` is reassigned from itself here, so re-running this
# cell without re-running the previous ones would batch the batches again.
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset

<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

In [89]:
# Model hyperparameters, passed to build_model below.
vocab_size = len(chars)  # number of distinct characters; also the size of the output layer
embedding_dim = 256  # width of the character embedding vectors
rnn_units = 1024  # units in each GRU layer

def build_model(vocab_size, embedding_dim, rnn_units, batch_size, stateful=True):
    """Assemble the character-level network: embedding -> GRU -> GRU -> dense.

    Args:
        vocab_size: number of distinct characters; used as the embedding input
            size and as the number of units of the final dense layer.
        embedding_dim: width of the embedding vectors.
        rnn_units: number of units in each of the two GRU layers.
        batch_size: fixed batch dimension of the model input; the sequence
            dimension is left unspecified.
        stateful: forwarded to both GRU layers.

    Returns:
        A tf.keras.Sequential model whose last layer is a Dense layer with
        vocab_size units and no activation.
    """
    def make_gru():
        # Both recurrent layers share exactly the same configuration.
        return tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            stateful=stateful,
            recurrent_activation="sigmoid",
            recurrent_initializer='glorot_uniform',
        )

    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    first_gru = make_gru()
    second_gru = make_gru()
    char_scores = tf.keras.layers.Dense(vocab_size)

    # The model keeps the name "LSTM" although its recurrent layers are GRUs.
    stack = [embedding, first_gru, second_gru, char_scores]
    return tf.keras.Sequential(stack, name="LSTM")

In [90]:
# Training model: fixed batch dimension of BATCH_SIZE and stateful=False for
# the GRU layers (the shuffle step applied to `dataset` earlier means
# consecutive batches are not consecutive text).
model = build_model(
  vocab_size=vocab_size,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE,
  stateful=False,
)

# Print the layer summary, then display the dataset so its batch shapes can be
# compared with the model's input shape.
model.summary()
dataset

Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (128, None, 256)          12800     
_________________________________________________________________
gru_18 (GRU)                 (128, None, 1024)         3938304   
_________________________________________________________________
gru_19 (GRU)                 (128, None, 1024)         6297600   
_________________________________________________________________
dense_9 (Dense)              (128, None, 50)           51250     
Total params: 10,299,954
Trainable params: 10,299,954
Non-trainable params: 0
_________________________________________________________________


<BatchDataset shapes: ((128, 120), (128, 120)), types: (tf.int64, tf.int64)>

In [91]:
# Comment out this cell to skip training (the next cell still needs an existing 'best_weight.h5')

def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    from_logits=True is passed because the model's final Dense layer has no
    activation.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )

# Train with the Adam optimizer and the custom loss defined above.
model.compile(optimizer="adam", loss=loss)
checkpoint_callbacks = [
  # Write weights only to 'best_weight.h5', and only when the monitored
  # "loss" improves.
  tf.keras.callbacks.ModelCheckpoint(
    filepath='best_weight.h5',
    save_weights_only=True,
    save_best_only=True,
    monitor="loss",
  ),
  # NOTE(review): no validation data is passed to fit(), so both callbacks
  # monitor the training loss; with patience=0 training ends at the first
  # epoch whose loss does not improve. Confirm this is the intended criterion.
  tf.keras.callbacks.EarlyStopping(
    monitor='loss', min_delta=0, patience=0, verbose=0, mode='auto',
  ),
]


# Upper bound on the number of epochs; early stopping may end training sooner.
EPOCHS = 100

history = model.fit(dataset, epochs=EPOCHS, callbacks=checkpoint_callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100


In [92]:
# Rebuild the same architecture for generation: a batch dimension of 1 and
# stateful GRU layers, then restore the weights saved during training.
model_predict = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=1,
    stateful=True,
)
model_predict.load_weights('best_weight.h5')
model_predict.build(tf.TensorShape([1, None]))
model_predict.summary()

Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (1, None, 256)            12800     
_________________________________________________________________
gru_20 (GRU)                 (1, None, 1024)           3938304   
_________________________________________________________________
gru_21 (GRU)                 (1, None, 1024)           6297600   
_________________________________________________________________
dense_10 (Dense)             (1, None, 50)             51250     
Total params: 10,299,954
Trainable params: 10,299,954
Non-trainable params: 0
_________________________________________________________________


In [93]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Sample text from ``model`` one character at a time.

    The model is primed with ``start_string``; after that each sampled
    character is fed back as the next input, relying on the model's stateful
    recurrent layers to carry the context forward.

    Args:
        model: model built with a batch dimension of 1 that exposes
            ``reset_states()`` and returns per-position scores over the
            vocabulary when called.
        start_string: non-empty seed text; every character must be a key of
            the module-level ``char2idx`` mapping.
        num_generate: number of characters to generate (default 1000).
        temperature: divisor applied to the scores before sampling. Low values
            give more predictable text, high values more surprising text
            (default 1.0).

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: if ``start_string`` is empty or ``temperature`` is not
            strictly positive.
        KeyError: if ``start_string`` contains a character missing from
            ``char2idx``.
    """
    # Validate up front: an empty seed yields no prediction to sample from and
    # a non-positive temperature makes the scaling below meaningless.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be strictly positive")

    # Converting our start string to numbers (vectorizing)
    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Generated characters, joined once at the end
    text_generated = []

    # Here batch size == 1
    model.reset_states()
    for _ in range(num_generate):
        # Call the model directly instead of model.predict(): predict() adds
        # per-call overhead that is wasted on a single one-batch input, and
        # this loop runs once per generated character.
        predictions = model(input_eval, training=False)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # Sample from the categorical distribution defined by the scaled
        # scores; only the sample for the last position is kept.
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [94]:
# Sample lyrics from the inference model, seeded with "i'm sorry". Sampling is
# random, so the text differs from run to run.
generate_text(model_predict, "i'm sorry")

"i'm sorry that i can't get out of bed\ni'm sorry that my head's always a mess\ni'll smoke until i fall in shade, a falling in a bo-blay of intent\ni am certain of this\nit will never be the same\nno, never, ah\nyou will never be the same\nno, never, ah\nyou will never be the same\nno, never, ah\nyou will never be the same\nno, never, ah\nyou will never be the guy\nand ableat to so time are too little\nto be the one remembers?\ni won't let you take my name.\n\na suffering religion, judgement as you stand up at shame?\n\ni thought i wanted legacy, i thought i wanna be\ni'm still am a messengies that everybody gets high, everybody gets low\nlife can be such overdose\n\nlease ever seet your name\ni'm the heart of your lie, we're play away\ntheir greed in a place and put our time here to reptifut when they try to have made the night my home\nso can you feel the ebbrace burn and start again?\ncan we start again?\ncan we start again?\ncan we start again?\ncan we start again?\ncan we start ag