# Generate Lyrics

Train a word-level LSTM language model on song lyrics (TensorFlow 1.x) as the basis for generating new lyrics.


Dataset (`songdata.csv`):
https://www.kaggle.com/mousehead/songlyrics

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

# tf.__version__ is the stable version attribute; tf.VERSION is a deprecated alias.
print("Tensorflow Version: ",tf.__version__)

Tensorflow Version:  1.10.0


In [2]:
# Path to songdata.csv, downloaded from the Kaggle dataset
Filename = "songdata.csv"
# Load only the first 10 rows to preview the columns before reading the whole file
songdata = pd.read_csv(Filename,nrows=10)
songdata.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
# Only the 'text' column is needed. Select it by name (not by position) so the
# load matches the later songdata['text'] accesses even if the column order changes.
songdata = pd.read_csv(Filename,usecols=['text'])

In [4]:
# Check that only the lyrics column was loaded
songdata.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [182]:
# Print the first song in full to see the raw formatting (line breaks, punctuation)
print(songdata['text'][0])

Look at her face, it's a wonderful face  
And it means something special to me  
Look at the way that she smiles when she sees me  
How lucky can one fellow be?  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?  
  
And when we go for a walk in the park  
And she holds me and squeezes my hand  
We'll go on walking for hours and talking  
About all the things that we plan  
  
She's just my kind of girl, she makes me feel fine  
Who could ever believe that she could be mine?  
She's just my kind of girl, without her I'm blue  
And if she ever leaves me what could I do, what could I do?




## Remove all non-letter characters

In [191]:
import re
from collections import Counter

def clean(text):
    """
    Strip everything except ASCII letters and spaces from `text`.

    Whitespace characters other than the space (newlines, tabs, ...) are first
    converted to spaces, so words on adjacent lines stay separated instead of
    being glued together when the line break is removed.

    :param text: raw lyrics string
    :return: string containing only a-z, A-Z and spaces
    """
    # Line breaks are word separators; keep them as spaces before filtering.
    text = re.sub(r'\s', ' ', text)
    return re.sub(r'[^a-zA-Z ]', '', text)
    
# Count every cleaned, lower-cased word across all songs.
# Streaming the words into the Counter avoids materialising a dense
# (songs x longest-song) frame as str.split(expand=True).stack() would.
wordCounts = Counter(
    word
    for lyrics in songdata['text']
    for word in clean(lyrics.lower()).split()
)
print(len(wordCounts))

101826


In [194]:
# The ten most frequent words after cleaning
wordCounts.most_common(10)

[('the', 497233),
 ('i', 426534),
 ('you', 425002),
 ('to', 296485),
 ('and', 294139),
 ('a', 255365),
 ('me', 201836),
 ('my', 170529),
 ('in', 167443),
 ('it', 149029)]

## Building the lookup dictionaries

In [193]:

# Special tokens; they contain '|' and upper-case letters, so they cannot
# collide with cleaned, lower-cased words.
TKN_SONG_END = "||SONG_END||"
TKN_MISSING = "||MISSING||"
tokens = [
    TKN_SONG_END,
    TKN_MISSING
]
# Keep the 30000 most frequent words plus the special tokens.
wordSet = set(dict(wordCounts.most_common(30000)).keys()).union(tokens)

# Assign ids in sorted order. Iterating a set of strings directly yields an
# order that can differ between interpreter runs (string hash randomisation),
# which would make the ids incompatible with a previously saved model.
vocabWords = sorted(wordSet)
vocab_to_int = {word: idx for idx, word in enumerate(vocabWords)}
int_to_vocab = dict(enumerate(vocabWords))
print("Vocab size: {}".format(len(int_to_vocab)))


Vocab size: 30002


## Prepare the data

In [205]:
%%time
        
# Encode all songs as one flat list of vocabulary ids.
# Words outside the vocabulary map to the MISSING token, and every song is
# terminated by the SONG_END token.
missing_id = vocab_to_int[TKN_MISSING]
song_end_id = vocab_to_int[TKN_SONG_END]

DataList = []
# Iterate the column directly; iterrows() would build a Series per row.
for lyrics in songdata['text']:
    DataList.extend(vocab_to_int.get(word, missing_id) for word in clean(lyrics.lower()).split())
    DataList.append(song_end_id)


CPU times: user 8.52 s, sys: 93.5 ms, total: 8.62 s
Wall time: 8.62 s


## Build the RNN

In [196]:
def get_inputs():
    """
    Create the placeholders that are fed at run time.

    :return: tuple (inputs, targets, learning_rate) where inputs and targets
             are int32 placeholders of shape [None, None] named "input" and
             "targets", and learning_rate is a float32 placeholder named
             "learning_rate"
    """
    word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name="input")
    next_word_ids = tf.placeholder(dtype=tf.int32, shape=[None, None], name="targets")
    step_size = tf.placeholder(dtype=tf.float32, name="learning_rate")
    return word_ids, next_word_ids, step_size

In [197]:
def get_init_cell(batch_size, rnn_size, lstm_units=512):
    """
    Build a stacked LSTM cell and its zero initial state.

    :param batch_size: batch size (int or scalar tensor) used to size the zero state
    :param rnn_size: number of LSTM layers to stack
    :param lstm_units: number of units in each LSTM layer (default 512)
    :return: tuple (cell, initial_state); initial_state is the zero state
             wrapped in tf.identity so it is addressable as "initial_state"
    """
    # Create a separate cell object per layer. Repeating one instance with
    # `[lstm] * rnn_size` makes every layer reuse the same cell and therefore
    # the same weights.
    layers = [tf.contrib.rnn.BasicLSTMCell(lstm_units) for _ in range(rnn_size)]
    cell = tf.contrib.rnn.MultiRNNCell(layers)
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), "initial_state")
    return cell, initial_state

In [198]:
def get_embed(input_data, vocab_size, embed_dim):
    """
    Look up a trainable embedding vector for every word id in `input_data`.

    :param input_data: tensor of word ids
    :param vocab_size: number of rows in the embedding matrix
    :param embed_dim: length of each embedding vector
    :return: tensor of embedded inputs
    """
    # Embedding matrix initialised uniformly in [-1, 1).
    initial_values = tf.random_uniform((vocab_size, embed_dim), minval=-1, maxval=1)
    embedding_matrix = tf.Variable(initial_values)
    return tf.nn.embedding_lookup(embedding_matrix, input_data)

In [199]:
def build_rnn(cell, inputs, initial_state=None):
    """
    Unroll `cell` over `inputs` with tf.nn.dynamic_rnn.

    :param cell: RNN cell to unroll
    :param inputs: float tensor of embedded inputs
    :param initial_state: optional state to start from, structured like the
                          cell's own state; when None (the default) the RNN
                          starts from its zero state
    :return: tuple (outputs, final_state); final_state is wrapped in
             tf.identity so it is addressable as "final_state"
    """
    outputs, final_state = tf.nn.dynamic_rnn(
        cell, inputs, initial_state=initial_state, dtype=tf.float32)
    return outputs, tf.identity(final_state, "final_state")

In [200]:
def build_nn(cell, rnn_size, input_data, vocab_size, embed_dim):
    """
    Assemble the network: embedding -> RNN -> linear projection to the vocabulary.

    :param cell: RNN cell to unroll over the embedded inputs
    :param rnn_size: not used inside this function
    :param input_data: tensor of word ids
    :param vocab_size: vocabulary size (number of output logits per position)
    :param embed_dim: length of each embedding vector
    :return: tuple (logits, final_state)
    """
    embedded = get_embed(input_data, vocab_size, embed_dim)
    rnn_outputs, last_state = build_rnn(cell, embedded)

    # No activation: the raw logits are returned.
    logits = tf.contrib.layers.fully_connected(
        rnn_outputs,
        vocab_size,
        activation_fn=None,
        weights_initializer=tf.truncated_normal_initializer(stddev=0.05),
        biases_initializer=tf.zeros_initializer(),
    )
    return logits, last_state

In [201]:

def get_batches(int_text, batch_size, seq_length):
    """
    Split a sequence of word ids into (input, target) training batches.

    Words that do not fill a complete batch are dropped from the end. Each
    target is the word following its input; the very last target wraps around
    to the first input word. Row i of consecutive batches continues the same
    stretch of text.

    :param int_text: sequence of word ids
    :param batch_size: number of sequences per batch
    :param seq_length: number of words per sequence
    :return: numpy array of shape (n_batches, 2, batch_size, seq_length),
             where index 0 on the second axis holds inputs and index 1 targets
    :raises ValueError: if int_text is too short for a single full batch
    """
    words_per_batch = batch_size * seq_length
    n_batches = len(int_text) // words_per_batch
    if n_batches == 0:
        raise ValueError(
            "int_text has {} words, fewer than one full batch of {}".format(
                len(int_text), words_per_batch))

    # Keep only the words that fill complete batches.
    inputs = np.asarray(int_text[:n_batches * words_per_batch])
    # Shift by one for the targets. np.roll wraps the last target to the first
    # input, which also keeps the last row full length when len(int_text) is an
    # exact multiple of words_per_batch.
    targets = np.roll(inputs, -1)

    def to_batches(flat):
        # (batch_size, n_batches, seq_length) -> (n_batches, batch_size, seq_length)
        return flat.reshape(batch_size, n_batches, seq_length).transpose(1, 0, 2)

    return np.stack([to_batches(inputs), to_batches(targets)], axis=1)

In [None]:
# Number of passes over the full set of batches
num_epochs = 50
# Number of sequences per batch
batch_size = 32
# Passed to get_init_cell, which uses it as the number of stacked LSTM layers
rnn_size = 2
# Length of each word embedding vector
embed_dim = 512
# Number of words per training sequence
seq_length = 64
# Learning rate fed to the Adam optimizer
learning_rate = 0.01
# Show stats for every n number of batches (1 = print after every batch)
show_every_n_batches = 1

# Path passed to saver.save when the trained model is written
save_dir = './save'

In [203]:
from tensorflow.contrib import seq2seq

# Build the whole model inside a dedicated graph.
train_graph = tf.Graph()
with train_graph.as_default():
    vocab_size = len(int_to_vocab)

    # Placeholders for the word ids, the target word ids and the learning rate
    input_text, targets, lr = get_inputs()
    # Dynamic shape of the fed input; element 0 is used as the batch size below
    input_data_shape = tf.shape(input_text)
    cell, initial_state = get_init_cell(input_data_shape[0], rnn_size)
    logits, final_state = build_nn(cell, rnn_size, input_text, vocab_size, embed_dim)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function: sequence loss with a weight of 1 for every position of the input
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping: clip every gradient element to [-1, 1]; variables without a gradient are skipped
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

cell <tensorflow.python.ops.rnn_cell_impl.MultiRNNCell object at 0x138354d68>
inputs Tensor("embedding_lookup:0", shape=(?, ?, 512), dtype=float32)
outputs Tensor("rnn/transpose_1:0", shape=(?, ?, 512), dtype=float32)
final_state (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(?, 512) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_5:0' shape=(?, 512) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_6:0' shape=(?, 512) dtype=float32>))


In [None]:
"""
DON'T MODIFY ANYTHING IN THIS CELL
"""
# Pre-compute all (input, target) batches from the encoded lyrics.
batches = get_batches(DataList, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        # Evaluate the initial state, sized from the first input batch, at the start of every epoch.
        state = sess.run(initial_state, {input_text: batches[0][0]})

        # NOTE(review): `state` is fed back into `initial_state` on every step, but the
        # graph built via build_nn does not pass `initial_state` into tf.nn.dynamic_rnn,
        # so this feed does not appear to influence the RNN - confirm.
        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            # One optimisation step; also fetch the loss and the RNN's final state.
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model: written once, only after all epochs have finished
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/1547   train_loss = 10.310
