In [1]:
import numpy as np
import tensorflow as tf
import glob
import codecs
import pickle
import time

In [3]:
# Load Corpus
# Get songs
songs = sorted(glob.glob("data_album/*.txt"))

print("Found {} songs".format(len(songs)))

Found 1 songs


In [19]:
# Read Songs
text = open('data_album/none_shall_pass_album.txt').read()

print('Text Length in number of characters: {}'. format(len(text)))

print('=====')
print('Head of the text:')
print('=====')
print(text[:1000])

Text Length in number of characters: 46920
=====
Head of the text:
=====
Hunters with their dogs and deer rifles
Thousands of them line the pavement
Like patient pupae waiting to become worms

The people are dead, but the money keeps talking keep-keeps-keep-keeps talking
The people are dead, but the money keeps talking, keep
Keeps-keep-keeps talking

(Another dark night) Teething I'm marking a beast sheep
Like I walk in front of 39 thieves in a "v"
Smores over warm helvetica brown proper
For the odd God or monster, propped to teleprompter
Wild blue yonder, blue in the face, angel
Blew into the bugles in lieu of the euthanasia
Usually the shooter community chew the corpse
But I see the wolves have already gotten to you and yours
Day of the dead, play the ledge closely
Train a barrel of monkeys to aim at the lowest bogey
Dope the gonzo I've always felt choked socially
Stole the golden fleece with the culture of total nobodies
Earth rised, divided by fighting tribes
All we do is watch 'em

In [21]:
# Characters

chars = sorted(list(set(text)))
char_size = len(chars)
print('number of characters:', char_size)
print(chars)

number of characters: 80
['\n', ' ', '!', '"', "'", '(', ')', '+', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ʼ', '‘', '’']


In [4]:
# Combine songs into a single string
corpus_raw = ""
for song in songs:
    with codecs.open(song, 'r', 'utf-8') as song_file:
        corpus_raw += song_file.read()
        
print('Corpus is {} characters long'.format(len(corpus_raw)))

Corpus is 46920 characters long


In [5]:
### Process Corpus

# create look_up_tables

def create_lookup_tables(text):
    
    vocab = set(text)
    int_to_vocab = {key: word for key, word in enumerate(vocab)}
    vocab_to_int = {word: key for key, word in enumerate(vocab)}
    return vocab_to_int, int_to_vocab

In [6]:
# tokenize punctuation

def token_lookup():
    
    return {
        '.' : '||period||',
        ',' : '||comma||',
        '"' : '||quotes||',
        ';' : '||semicolon||',
        '!' : '||exclamation-mark||',
        '?' : '||question-mark||',
        '(' : '||left-parentheses||',
        ')' : '||right-parentheses||',
        '--' : '||emm-dash||',
        '\n' : '||return||',
        '[' : '||left-square-bracket||',
        ']' : '||right-square-bracket||'
    }

In [7]:
# process and save data

token_dict = token_lookup()

for token, replacement in token_dict.items():
    corpus_raw = corpus_raw.replace(token, ' {} '.format(replacement))
    

corpus_raw = corpus_raw.lower()

corpus_raw = corpus_raw.split()

vocab_to_int, int_to_vocab = create_lookup_tables(corpus_raw)

corpus_int = [vocab_to_int[word] for word in corpus_raw]

pickle.dump((corpus_int, vocab_to_int, int_to_vocab, token_dict), open('preprocess.p', 'wb'))

In [12]:
# Batch the data

def get_batches(int_text, batch_size, seq_length):
    """
    Return batches of input and target data
    :param int_text: text with words replaced by their ids
    :param batch_size: the size that each batch of data should be
    :param seq_length: the length of each sequence
    :return: batches of data as a numpy array
    """
    words_per_batch = batch_size * seq_length
    num_batches = len(int_text)//words_per_batch
    int_text = int_text[:num_batches*words_per_batch]
    y = np.array(int_text[1:] + [int_text[0]])
    x = np.array(int_text)
    
    x_batches = np.split(x.reshape(batch_size, -1), num_batches, axis=1)
    y_batches = np.split(y.reshape(batch_size, -1), num_batches, axis=1)
    
    batch_data = list(zip(x_batches, y_batches))
    
    return np.array(batch_data)


In [13]:
# Hyperparameters
num_epochs = 10000
batch_size = 512
rnn_size = 512
num_layers = 3
keep_prob = 0.7
embed_dim = 512
seq_length = 30
learning_rate = 0.001
save_dir = './save'

In [14]:
# Build the Graph
train_graph = tf.Graph()
with train_graph.as_default():    
    
    # Initialize input placeholders
    input_text = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    
    # Calculate text attributes
    vocab_size = len(int_to_vocab)
    input_text_shape = tf.shape(input_text)
    
    # Build the RNN cell
    lstm = tf.contrib.rnn.BasicLSTMCell(num_units=rnn_size)
    drop_cell = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop_cell] * num_layers)
    
    # Set the initial state
    initial_state = cell.zero_state(input_text_shape[0], tf.float32)
    initial_state = tf.identity(initial_state, name='initial_state')
    
    # Create word embedding as input to RNN
    embed = tf.contrib.layers.embed_sequence(input_text, vocab_size, embed_dim)
    
    # Build RNN
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype=tf.float32)
    final_state = tf.identity(final_state, name='final_state')
    
    # Take RNN output and make logits
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn=None)
    
    # Calculate the probability of generating each word
    probs = tf.nn.softmax(logits, name='probs')
    
    # Define loss function
    cost = tf.contrib.seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_text_shape[0], input_text_shape[1]])
    )
    
    # Learning rate optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    
    # Gradient clipping to avoid exploding gradients
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

## Train the Network

In [15]:
pickle.dump((seq_length, save_dir), open('params.p', 'wb'))
batches = get_batches(corpus_int, batch_size, seq_length)
num_batches = len(batches)
start_time = time.time()

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})
        
        for batch_index, (x, y) in enumerate(batches):
            feed_dict = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate
            }
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed_dict)
            
        time_elapsed = time.time() - start_time
        print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}   time_elapsed = {:.3f}   time_remaining = {:.0f}'.format(
            epoch + 1,
            batch_index + 1,
            len(batches),
            train_loss,
            time_elapsed,
            ((num_batches * num_epochs)/((epoch + 1) * (batch_index + 1))) * time_elapsed - time_elapsed))

        # save model every 10 epochs
        if epoch % 10 == 0:
            saver = tf.train.Saver()
            saver.save(sess, save_dir)
            print('Model Trained and Saved')

IndexError: list index out of range