In [16]:
import tensorflow as tf
from bs4 import BeautifulSoup
import requests
import os
import time
import re
import pandas as pd
import random


In [17]:
# req = requests.get('https://raw.githubusercontent.com/cyrilou242/RapLyrics-Back/master/datasets/rapus_generalist_lowered.txt')

# with open('lyrics/vindicator.txt', 'w') as f:
#     f.write(req.text)


In [59]:
text = open('lyrics/eminem.txt', 'rb').read().decode(encoding='utf-8')
#text = re.sub('\n', ' \n ', text)
text = re.sub('\n', ' ', text)
text = re.sub('\[.*\]', '', text)
words = [word if word == '\n' else re.sub(r'[^a-zA-Z0-9]', '', word) for word in text.split(' ')]
#words = [re.sub(r'[^a-zA-Z0-9]', '', word) for word in text.split(' ')]
#print(print(re.findall('\[.*\]', text)))
words = [word for word in words if word != '']
print(f'Length of text: {len(words)} words')

vocab = sorted(set(pd.Series(words)))
print(f'{len(vocab)} unique words')
print(words)

Length of text: 2706 words
1045 unique words
['Look', 'I', 'was', 'gonna', 'go', 'easy', 'on', 'you', 'not', 'to', 'hurt', 'your', 'feelings', 'But', 'Im', 'only', 'going', 'to', 'get', 'this', 'one', 'chance', 'Six', 'minutes', 'six', 'minutes', 'Somethings', 'wrong', 'I', 'can', 'feel', 'it', 'Six', 'minutes', 'six', 'minutes', 'Slim', 'Shady', 'youre', 'on', 'Just', 'a', 'feeling', 'Ive', 'got', 'Like', 'somethings', 'about', 'to', 'happen', 'But', 'I', 'dont', 'know', 'what', 'If', 'that', 'means', 'what', 'I', 'think', 'it', 'means', 'were', 'in', 'trouble', 'Big', 'trouble', 'And', 'if', 'he', 'is', 'as', 'bananas', 'as', 'you', 'say', 'Im', 'not', 'taking', 'any', 'chances', 'You', 'are', 'just', 'what', 'the', 'doc', 'ordered', 'Im', 'beginning', 'to', 'feel', 'like', 'a', 'Rap', 'God', 'Rap', 'God', 'All', 'my', 'people', 'from', 'the', 'front', 'to', 'the', 'back', 'nod', 'back', 'nod', 'Now', 'who', 'thinks', 'their', 'arms', 'are', 'long', 'enough', 'to', 'slap', 'box', 'sl

In [19]:
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)
    
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

def text_from_ids(ids):
    l = chars_from_ids(ids).numpy().tolist()
    l = [word.decode('utf-8') for word in l]
    return ' '.join(l)

In [20]:
all_ids = ids_from_chars(words)
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

seq_length = 25
examples_per_epoch = len(text)//(seq_length+1)
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

In [21]:
for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy())

b'Look'
b'I'
b'was'
b'gonna'
b'go'
b'easy'
b'on'
b'you'
b'not'
b'to'


In [22]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [23]:
dataset = sequences.map(split_input_target)

print(dataset)

for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example))
    print("Target:", text_from_ids(target_example))

<MapDataset shapes: ((25,), (25,)), types: (tf.int64, tf.int64)>
Input : Look I was gonna go easy on you not to hurt your feelings 
 But Im only going to get this one chance 
 Six
Target: I was gonna go easy on you not to hurt your feelings 
 But Im only going to get this one chance 
 Six minutes


In [24]:
BATCH_SIZE = 128
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset shapes: ((128, 25), (128, 25)), types: (tf.int64, tf.int64)>

In [25]:
vocab_size = len(vocab)
embedding_dim = 128
rnn_units = 1024

In [26]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [27]:
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

model.summary()

(128, 25, 15896) # (batch_size, sequence_length, vocab_size)
Model: "my_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  2034688   
_________________________________________________________________
gru_1 (GRU)                  multiple                  3545088   
_________________________________________________________________
dense_1 (Dense)              multiple                  16293400  
Total params: 21,873,176
Trainable params: 21,873,176
Non-trainable params: 0
_________________________________________________________________


In [28]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1)

In [29]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss)

checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [30]:
input_example_batch[0]

<tf.Tensor: shape=(25,), dtype=int64, numpy=
array([ 3778,  7933,  7933,  8788,  5475, 13153, 15858,  5666,     1,
           1,  4434,  8816, 12720, 14595,  9432, 15143, 15474, 10017,
        9809,  7527,     1,  3778, 14974, 14546, 15349], dtype=int64)>

In [31]:
print("Input:\n", text_from_ids(input_example_batch[0]))
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices))

Input:
 So everybody everybody go berzerk shake your body 
 
 Were gonna rock this house until we knock it down 
 So turn the volume

Next Char Predictions:
 Animal 19th Wasnt doublin Afeni argue leftThe Amityville spot felch Biggie Whos fix believe mount lou amendment snoopin expert lyin misconducts petes Jumbotron bastards glove


In [32]:
EPOCHS = 30

In [33]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/30
 7/60 [==>...........................] - ETA: 1:34 - loss: 9.5231

KeyboardInterrupt: 

In [None]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [None]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [None]:
start = time.time()
states = None
next_char = tf.constant(["Hi, my name is."])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

end = time.time()


In [None]:
list = [word.numpy().tolist()[0].decode('utf-8') for word in result]
for word in list:
    if word =='\n': print()
    else: print(word, end=' ')

Hi, my name is. 
mother im in say touch in me 
rick cause 
cause off 
im jewels 
if cause 
cause 
do a club he is 
he is off 

me is on 
soft 
imma liquor thats 
in get cant feeling 
cause is 
he is off 
cause is 
he is but 
but imma line 
he thats 
thats but 
no line 
but cant bill is 
but he is 
cause mind 
he set 
cant is to two 
thats 
we is but 
but he thats 
and man thats 
we is 
but when me 
but thats 
but thats 
thats 
we is 
no spray no 
thats thats 
thats no 
thats 
but thats thats 
yeah thats 
youre is 
raw thats 
thats thats 
lookin d 
thats thats 
thats thats 
but rick thats 
thats thats 
get get had 
lets thats 
thats thats 
thats thats 
me is come 
thats thats 
thats thats 
it hard 
thats thats 
thats 
thats thats 
thats thats 
thats thats 
thats thats 
g thats no 
thats 
thats thats 
thats thats 
thats thats 
thats thats 
thats thats 
thats thats 
it not thats 
the in speak 
thats thats 
thats thats 
thats thats 
thats thats 
get dont 
hard dont 
yeah set 
thats thats 


In [None]:
start = time.time()
states = None
next_char = tf.constant(['ROMEO:', 'ROMEO:', 'ROMEO:', 'ROMEO:', 'ROMEO:'])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result, '\n\n' + '_'*80)
print('\nRun time:', end - start)

In [None]:
tf.saved_model.save(one_step_model, 'one_step')
one_step_reloaded = tf.saved_model.load('one_step')

In [None]:
states = None
next_char = tf.constant(['ROMEO:'])
result = [next_char]

for n in range(100):
  next_char, states = one_step_reloaded.generate_one_step(next_char, states=states)
  result.append(next_char)

print(tf.strings.join(result)[0].numpy().decode("utf-8"))