## Language model for Wilhelm Meister

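This model takes fixed-length inputs: each example is a window of 15 characters, encoded as 67-dimensional vectors, and the model predicts the character that follows. It is trained with mini-batch gradient descent (batches of 15,000 examples) rather than updating on one example at a time.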

In [2]:
import numpy as np
import tensorflow as tf
import time
np.random.seed(10)

import warnings
warnings.filterwarnings('ignore')

### Loading Data

In [4]:
# Load the pre-built training arrays: X holds windows of 15 steps x 67 features,
# y holds the matching 67-wide target vectors.
X_data = np.load("../../../data/goethe/X_arr_traditional.npy")
y_data = np.load("../../../data/goethe/y_arr_traditional.npy")
# reshape(-1, ...) lets NumPy infer the number of examples instead of
# hard-coding it, so the cell keeps working if the data files are regenerated.
X_data = X_data.reshape(-1, 15, 67)
y_data = y_data.reshape(-1, 67)
assert X_data.shape[0] == y_data.shape[0], "X and y must hold the same number of examples"
print(X_data.shape)
print(y_data.shape)

(1202437, 15, 67)
(1202437, 67)


In [5]:
# Pair every input window with its target so both can be shuffled together.
# Inserting a length-1 axis after the example axis makes each element a
# (1,15,67) / (1,67) array, ready to be concatenated back into batches.
all_data = list(zip(X_data[:, np.newaxis], y_data[:, np.newaxis]))

### Model Functions

In [6]:
def get_placeholders():
    """Create the graph inputs for the model.

    Returns:
        A pair (X, y) of float32 placeholders: X has shape [None, 15, 67]
        and is named 'X'; y has shape [None, 67] and is named 'Y'. The
        leading None leaves the batch size open.
    """
    inputs = tf.placeholder(dtype=tf.float32, shape=[None, 15, 67], name='X')
    targets = tf.placeholder(dtype=tf.float32, shape=[None, 67], name='Y')
    return inputs, targets

In [7]:
# Cross entropy loss to compare predicted char with actual char from novel
def cost_function(logits,y):
    """Return the mean softmax cross-entropy between `logits` and labels `y`.

    Args:
        logits: unnormalised scores produced by the network.
        y: label distribution with the same shape as `logits`.

    Returns:
        Scalar tensor: the cross-entropy averaged over all examples.
    """
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits)
    return tf.reduce_mean(per_example_loss)

In [8]:
# Unidirectional rnn
def rnn_cell(the_input):
    """Run a single 256-unit LSTM over the input and project the last step to 67 logits.

    Args:
        the_input: float tensor fed to tf.nn.dynamic_rnn; indexed below as
            [example, step, feature].

    Returns:
        Tensor of 67 unnormalised scores per example (no activation applied).
    """
    lstm = tf.nn.rnn_cell.LSTMCell(
        num_units=256,
        activation=tf.nn.tanh,
        reuse=tf.AUTO_REUSE,
        name="lstm0",
    )
    # The final LSTM state is not needed here, only the per-step outputs.
    step_outputs, _ = tf.nn.dynamic_rnn(lstm, inputs=the_input, dtype=tf.float32)
    last_step = step_outputs[:, -1, :]  # output at the final step, for every example
    return tf.layers.dense(
        last_step,
        67,
        activation=None,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=0),
    )

In [9]:
# Build a two-example batch from the first two (x, y) pairs to smoke-test the graph
first_pair, second_pair = all_data[0], all_data[1]
ax = np.concatenate((first_pair[0], second_pair[0]))
print(ax.shape)
ay = np.concatenate((first_pair[1], second_pair[1]))
print(ay.shape)

(2, 15, 67)
(2, 67)


In [10]:
# Smoke test: run the forward pass and the cost on the two-example batch
X,y = get_placeholders()
pred = rnn_cell(X)
cost = cost_function(pred,y)

init = tf.global_variables_initializer()
feed = {X: ax, y: ay}
with tf.Session() as sess:
    sess.run(init)
    # Fetch logits and cost in a single call; both come from the same forward pass.
    out, acost = sess.run([pred, cost], feed_dict=feed)

print(acost)
print(out.shape)

4.191888
(2, 67)


In [11]:
# takes in a list of x,y pairs and returns a list of numpy vector pairs consisting of cut_off observations each
def get_input_data(all_data, cut_off=15000):
    """Group single-observation (x, y) pairs into fixed-size batches.

    Args:
        all_data: list of (x, y) tuples whose arrays can be joined along
            axis 0 with np.concatenate (in this notebook each x is (1,15,67)
            and each y is (1,67)).
        cut_off: number of observations per batch. Defaults to 15000.

    Returns:
        A list of (x_batch, y_batch) tuples, each holding exactly `cut_off`
        observations. Trailing observations that do not fill a complete
        batch are dropped; a final batch that is exactly full is kept.

    Raises:
        ValueError: if `cut_off` is not a positive number.
    """
    if cut_off <= 0:
        raise ValueError("cut_off must be a positive integer")

    ret_data = []  # holds the list of return tuple pairs
    num_full_batches = len(all_data) // cut_off
    for batch_no in range(num_full_batches):
        start = batch_no * cut_off
        chunk = all_data[start:start + cut_off]
        x_batch = np.concatenate([pair[0] for pair in chunk])
        y_batch = np.concatenate([pair[1] for pair in chunk])
        ret_data.append((x_batch, y_batch))

    return ret_data

In [12]:
# Sanity check: how many full batches does the data set yield?
test_out = get_input_data(all_data)
num_batches = len(test_out)
print(num_batches)

80


In [13]:
def model(all_data,lr=0.001,num_epochs=1,retrain=True,print_cost=False,
          ckpt_path="../../../data/goethe/model2/language_model.ckpt"):
    """Train the LSTM language model and checkpoint it after every epoch.

    Args:
        all_data: list of (x, y) observation pairs as accepted by
            get_input_data. The list is not modified; a shallow copy is
            shuffled each epoch.
        lr: learning rate for the Adam optimizer.
        num_epochs: number of passes over the data.
        retrain: if True, restore the weights stored at `ckpt_path` and
            continue training; if False, start from freshly initialised
            variables.
        print_cost: if True, print the cost added by every batch.
        ckpt_path: checkpoint prefix used both for restoring and for saving.

    Returns:
        A list with one entry per epoch: the sum of the batch costs.
    """
    tf.reset_default_graph() # resetting graph
    tf.set_random_seed(1)
    costs = []

    X,y = get_placeholders()
    pred = rnn_cell(X)
    cost = cost_function(pred,y)

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    grads, variables = zip(*optimizer.compute_gradients(cost))
    grads, _ = tf.clip_by_global_norm(grads, 5.0) # gradient clipping
    train_op = optimizer.apply_gradients(zip(grads, variables))

    init = tf.global_variables_initializer()
    saver = tf.train.Saver() # to save/load model

    # Shuffle a shallow copy so the caller's list keeps its original order.
    epoch_data = list(all_data)

    with tf.Session() as sess:
        if retrain:
            # Restore straight into the variables built above. Importing the
            # saved meta graph here would add a second copy of the graph to
            # the default graph on top of the one this function just built.
            saver.restore(sess, ckpt_path)
        else:
            sess.run(init) # only run init for new model

        for epoch in range(num_epochs):
            start = time.time()
            running_cost = 0
            old_running_cost = 0 # used to show total cost change per batch step
            np.random.shuffle(epoch_data) # shuffling the data each epoch
            input_data = get_input_data(epoch_data) # get list of numpy vector inputs to the model
            for ax, ay in input_data:
                _,temp_cost = sess.run([train_op,cost], feed_dict={X:ax,y:ay})
                running_cost += temp_cost

                if print_cost:
                    cost_change = running_cost - old_running_cost
                    old_running_cost = running_cost
                    print("Cost change:",cost_change)

            costs.append(running_cost)
            print("Cost at epoch {}: {}, took: {}".format(epoch+1,running_cost,time.time()-start))
            saver.save(sess, ckpt_path) # save model on each epoch

    return costs

### Sampling using the novel

The model was trained on a cloud machine; the cells below only load the saved checkpoint and sample text from it.

In [14]:
# Vocabulary: every character that appears in the novel
keys = ['B', 'o', 'k', ' ', 'I', 'C', 'h', 'a', 'p', 't', 'e', 'r', '\n', 
        'T', 'H', 'E', 'P', 'L', 'A', 'Y', 'w', 's', 'l', 'i', 'n', 'b', 
        'g', 'u', ':', 'd', 'm', 'c', ',', 'f', '.', 'S', 'M', 'y', '’', 
        'N', 'v', ';', '-', 'x', 'O', 'q', '!', '“', 'W', '?', '”', 'j', 
        'z', 'V', 'J', 'G', 'D', 'F', '‘', 'K', '—', 'U', 'Q', 'R', 'X', 'Z']

# Characters are numbered from 1 in list order.
# encoder maps character -> int, decoder maps int -> character.
encoder = {symbol: code for code, symbol in enumerate(keys, start=1)}
decoder = dict(enumerate(keys, start=1))

In [15]:
# returns one hot encoding for a particular character
def get_one_hot_encoding(char):
    """Return the one-hot vector for `char` as a (1,1,67) float array.

    The position set to 1 is `encoder[char]`. Codes start at 1, so 67 slots
    are needed for the 66-character vocabulary and slot 0 stays unused.

    Raises:
        KeyError: if `char` is not in `encoder`.
    """
    position = encoder[char]
    one_hot = np.zeros((1, 1, 67))
    one_hot[0, 0, position] = 1
    return one_hot

In [16]:
# This sample cell samples the model and takes in a previous state of the lstm
def sample_cell(the_input,a,c):
    """Build the sampling graph: one LSTM pass from a given state, then pick the top character.

    Args:
        the_input: float tensor fed to tf.nn.dynamic_rnn; indexed below as
            [example, step, feature].
        a: tensor passed as the first field of the LSTM state tuple.
        c: tensor passed as the second field of the LSTM state tuple.
            NOTE(review): LSTMStateTuple is documented with fields (c, h), so
            `a` appears to fill the cell-state slot and `c` the hidden-state
            slot. Callers feed the returned state back positionally, which
            stays consistent either way; confirm before renaming anything.

    Returns:
        (achar, curr_state): the index of the highest-scoring character as a
        scalar tensor, and the LSTM state after consuming the input. The
        scalar reshape only succeeds when there is a single example.
    """
    initial_state = tf.contrib.rnn.LSTMStateTuple(a, c)
    lstm = tf.nn.rnn_cell.LSTMCell(
        num_units=256,
        activation=tf.nn.tanh,
        reuse=tf.AUTO_REUSE,
        name="lstm0",
    )
    step_outputs, next_state = tf.nn.dynamic_rnn(
        lstm,
        inputs=the_input,
        initial_state=initial_state,
        dtype=tf.float32,
    )
    last_step = step_outputs[:, -1, :]
    logits = tf.layers.dense(
        last_step,
        67,
        activation=None,
        kernel_initializer=tf.contrib.layers.xavier_initializer(seed=0),
    )
    probabilities = tf.nn.softmax(logits) # softmax on the logits
    # character in int representation, collapsed to a scalar
    char_id = tf.reshape(tf.argmax(probabilities, axis=-1), ())
    return char_id, next_state # returns the curr state as well as pred

In [19]:
# Seed text for sampling, one-hot encoded into a (1, len(start_seq), 67) array
start_seq = "Mariane was "
ax = np.zeros((1, len(start_seq), 67))
for position, symbol in enumerate(start_seq):
    ax[0, position, :] = get_one_hot_encoding(symbol)

In [22]:
# Rebuild a sampling graph that accepts any number of steps and an explicit LSTM state
tf.reset_default_graph()
x = tf.placeholder(tf.float32, shape=[1, None, 67])
a = tf.placeholder(tf.float32, shape=[1, 256])
c = tf.placeholder(tf.float32, shape=[1, 256])
out = sample_cell(x,a,c)

ckpt_path = "../../../data/goethe/model2/language_model.ckpt"
num_samples = 5000 # number of characters to generate

saver = tf.train.Saver()

generated = [start_seq] # pieces of the output text, joined once at the end
# Work on local names so `ax` (the encoded seed) is left untouched and this
# cell gives the same result when it is re-run.
step_input = ax
state_a = np.zeros((1,256))
state_c = np.zeros((1,256))

# The context manager closes the session once sampling is finished.
with tf.Session() as sess:
    # Restore straight into the variables built above. Importing the saved
    # meta graph here would add a second copy of the training graph.
    saver.restore(sess, ckpt_path)

    for _ in range(num_samples): # sampling for n iterations
        (achar,past_state) = sess.run(out,feed_dict={x:step_input,a:state_a,c:state_c})
        next_char = decoder[achar]
        generated.append(next_char) # adding to the string
        # Feed the predicted character and the new state into the next step.
        state_a = past_state[0]
        state_c = past_state[1]
        step_input = get_one_hot_encoding(next_char)

all_chars = "".join(generated)
        

INFO:tensorflow:Restoring parameters from ../../../data/goethe/model2/language_model.ckpt


In [23]:
# Show the seed text followed by everything the model generated
all_chars

'Mariane was to terrof, he master ton!\n\nMuMperted, might vivaring—O”\n“Yesselted as allose time efteremazes love fiddy althligggg‘‘Y‘UFU‘UR““Qa?”, doinfuYed, who arrowes not by into allightedmems, And be-fOrTUQg‘‘UUUX“V\nO those iners, of the brow rogue was life. Chapt notX’ C—RR’Dey, which cheerflan,”—r—YPU‘Q‘CUVYBORO’ I staye it sufferent bleacle.\nAt rechts\n\nQuagubYoF, only there castings from the lapszar-off-kxcte”\n\nan accide we dearning honding let beact. Young may natt of one batmQakQ-whXU.\nCCOR\nJExTNI”\nTHE TUuTTuY?\nOUR from beint!” said Wilh, by matter? Tolorits,—wast, Conor’s cassely we did not aptochmonatincion wandly that, on groun-dis of the unhackse to him, were broid BrisQ-GRdy: “What dow brothe grenattreKKKUUGUV‘R‘UR““‘DEnome, realined at tan. It contids of her prosp-hesh: he “Ax, Howaving particrital, had all; and\nerignes, with routh haldeally forced hungul manse, “rost Fried! ‘In every Germany,” said!\nMY\nJor, not asseet, as enf!’stH I mishel, I weary to mea

In [24]:
# Save the generated text. The encoding is set explicitly because the text
# contains non-ASCII punctuation (curly quotes, em dashes) that the platform
# default encoding may not be able to represent.
with open("text.txt", "w", encoding="utf-8") as out_file:
    out_file.write(all_chars)