# The Author

In [33]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [34]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

filename = maybe_download('./drive/MyDrive/Dataset/text8.zip', 31344016)

Found and verified ./drive/MyDrive/Dataset/text8.zip


In [35]:
def clean_character(char):
    if char in string.ascii_lowercase:
        return char
    elif char in string.ascii_uppercase:
        return char.lower()
    elif char == ' ':
        return char
    return ''

def read_data(filename):
    if os.path.splitext(filename)[1] == '.zip':
        with zipfile.ZipFile(filename) as f:
            name = f.namelist()[0]
            data = tf.compat.as_str(f.read(name))
    elif os.path.splitext(filename)[1] == '.txt':
        with open(filename, 'r') as f:
            data = f.readlines()
            data = ''.join(data)
    return data
  
text8 = read_data(filename)
text8 = ''.join([clean_character(char) for char in text8])

hp_path = "./drive/MyDrive/Dataset/HarryPotter/"
harry_potter = ''.join([read_data(filename) 
                        for filename in [os.path.join(hp_path, path) 
                                         for path in os.listdir(hp_path)]])
harry_potter = ''.join([clean_character(char) for char in harry_potter])
print('Data size Text8 %d' % len(text8))
print('Data size Harry Potter %d' % len(harry_potter))

Data size Text8 100000000
Data size Harry Potter 5893451


In [36]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
    if char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    elif char == ' ':
        return 0
    else:
        print('Unexpected character: %s' % char)
        return 0
    
def id2char(dictid):
    if dictid > 0:
        return chr(dictid + first_letter - 1)
    else:
        return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


In [37]:
valid_size = 1000
train_text = text8
tune_text = harry_potter[:valid_size]
valid_text = harry_potter[valid_size:]
train_size = len(train_text)
tune_size = len(tune_text)
print(train_size + tune_size, train_text[:64])
print(valid_size, valid_text[:64])

100001000  anarchism originated as a term of abuse first used against earl
1000 nd the sorcerers stone  by jk rowlingp cmsummary rescued from th


In [38]:
batch_size = 64
num_unrollings = 10

class BatchGenerator(object):
    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        segment = self._text_size // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        self._last_batch = self._next_batch()
        
    def _next_batch(self):
        """Generate a single batch from the current cursor position in the data."""
        batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
        for b in range(self._batch_size):
            batch[b, char2id(self._text[self._cursor[b]])] = 1.0
            self._cursor[b] = (self._cursor[b] + 1) % self._text_size
        return batch
    
    def next(self):
        """
            Generate the next array of batches from the data. The array consists of
            the last batch of the previous array, followed by num_unrollings new ones.
        """
        batches = [self._last_batch]
        for step in range(self._num_unrollings):
            batches.append(self._next_batch())
        self._last_batch = batches[-1]
        return batches
    
def characters(probabilities):
    """
        Turn a 1-hot encoding or a probability distribution over the possible characters
        back into its (most likely) character representation.
    """
    return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
    """
        Convert a sequence of batches back into their (most likely) string representation
    """
    s = [''] * batches[0].shape[0]
    for b in batches:
        s = [''.join(x) for x in zip(s, characters(b))]
    return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
tune_batches = BatchGenerator(tune_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(tune_batches.next()))
print(batches2string(tune_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

[' anarchism ', 'ion from eu', 'esident lyn', 'r professio', ' minting af', 'concentrati', 'poken today', 'ber of hote', ' the bell a', 'overty with', 'anufacturer', ' britannica', 'd so david ', 'ea of worke', ' informatio', ' one hand a', 'hannel isla', 'ater reprin', 'r an invest', ' infinite a', 's explained', 'on of plant', ' be solved ', 'n nine resp', 'ation vital', 'self contai', 'ountless se', 'ce arag leg', 'are often c', 'ions eml th', 'ne such as ', 's livin thi', 'ecommunicat', ' one nine n', 'e six two z', ' computer g', 'ing to quan', 'erbicide to', ' special re', ' exotic per', 'rm indicati', 'a secular r', 'the frequen', ' synthesize', 'structural ', ' the media ', 'ail of tear', 'stems able ', 'anded by si', 'rophy assoc', ' union s pi', 'ired differ', 'hough not i', 'e minister ', 'y the king ', 'eudo histor', 'nounced at ', 'tics gullbe', 'sance compa', ' colleges w', 'et and sunr', 'e american ', 'he mass app', 'made such d']
[' originated', 'urope aided', 'ndon jo

In [39]:
def logprob(predictions, labels):
    """
        Log-probability of the true labels in a predicted batch.
    """
    predictions[predictions < 1e-10] = 1e-10
    return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
    """
        Sample one element from a distribution assumed to be an array of normalized
        probabilities.
    """
    r = random.uniform(0, 1)
    s = 0
    for i in range(len(distribution)):
        s += distribution[i]
        if s >= r:
            return i
    return len(distribution) - 1

def sample(prediction):
    """
        Turn a (column) prediction into 1-hot encoded samples
    """
    p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
    p[0, sample_distribution(prediction[0])] = 1.0
    return p

def random_distribution():
    """
        Generate a random column of probabilities
    """
    b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
    return b / np.sum(b, 1)[:, None]

In [40]:
num_nodes = 64

graph = tf.Graph()

with graph.as_default():
    # Parameters:
    # Input gate: input, previous output, bias
    ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ib = tf.Variable(tf.zeros([1, num_nodes]))
    
    # Forget gate: input, previous output, bias
    fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    fb = tf.Variable(tf.zeros([1, num_nodes]))
    
    # Memory cell: input, previous output, bias
    cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    cb = tf.Variable(tf.zeros([1, num_nodes]))
    
    # Output gate: input, previous output, bias
    ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
    om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
    ob = tf.Variable(tf.zeros([1, num_nodes]))
    
    # Variables saving state across unrollings.
    saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
    
    # Classifier weights and biases
    w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
    b = tf.Variable(tf.zeros([vocabulary_size]))
    
    # Define cell computation
    def lstm_cell(i, o, state):
        """
            Create LSTM cell.
        """
        input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
        forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
        update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
        state = forget_gate * state + input_gate * tf.tanh(update)
        output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
        return output_gate * tf.tanh(state), state
    
    # Input data
    train_data = list()
    for _ in range(num_unrollings + 1):
        train_data.append(tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))
    train_inputs = train_data[:num_unrollings]
    train_labels = train_data[1:]
    
    # Unrolled LSTM loop
    outputs = list()
    output = saved_output
    state = saved_state
    for i in train_inputs:
        output, state = lstm_cell(i, output, state)
        outputs.append(output)
        
    # State saving across unrollings
    with tf.control_dependencies([saved_output.assign(output), saved_state.assign(state)]):
        # Classifier
        logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = tf.concat(train_labels, 0),
                                                                      logits = logits))
    
    # Optimizer
    global_step = tf.Variable(0)
    learning_rate = tf.train.exponential_decay(10.0, global_step, 5000, 0.1, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    gradients, v = zip(*optimizer.compute_gradients(loss))
    gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
    optimizer = optimizer.apply_gradients(zip(gradients, v), global_step=global_step)
    
    # Predictions
    train_prediction = tf.nn.softmax(logits)
    
    # Sampling and validation eval: batch 1, no unrolling
    sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
    saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
    saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
    reset_sample_state = tf.group(
        saved_sample_output.assign(tf.zeros([1, num_nodes])),
        saved_sample_state.assign(tf.zeros([1, num_nodes]))
    )
    sample_output, sample_state = lstm_cell(sample_input, 
                                            saved_sample_output, 
                                            saved_sample_state)
    
    with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                  saved_sample_state.assign(sample_state)]):
        sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [41]:
num_steps = 20000
summary_frequency = 100

with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    
    print("Training weights on Text8")
    # Train on Text8
    mean_loss = 0
    
    for step in range(num_steps):
        batches = train_batches.next()
        feed_dict = dict()
        
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
            
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict = feed_dict)
        
        mean_loss += l
        
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
                
            print("Average loss at step %d: %f learning rate: %f" % (step, mean_loss, lr))
            
            mean_loss = 0
            
            labels = np.concatenate(list(batches)[1:])
            print("Minibatch perplexity: %.2f" % float(
                np.exp(logprob(predictions, labels))
            ))
    
    
    
    print("Tunining weights on Harry Potter")
    # Tune on Harry Potter
    
    mean_loss = 0
    
    for step in range(num_steps):
        batches = tune_batches.next()
        feed_dict = dict()
        
        for i in range(num_unrollings + 1):
            feed_dict[train_data[i]] = batches[i]
            
        _, l, predictions, lr = session.run(
            [optimizer, loss, train_prediction, learning_rate], feed_dict = feed_dict)
        
        mean_loss += l
        
        if step % summary_frequency == 0:
            if step > 0:
                mean_loss = mean_loss / summary_frequency
                
            print("Average loss at step %d: %f learning rate: %f" % (step, mean_loss, lr))
            
            mean_loss = 0
            
            labels = np.concatenate(list(batches)[1:])
            print("Minibatch perplexity: %.2f" % float(
                np.exp(logprob(predictions, labels))
            ))
            
            if step % (summary_frequency * 10) == 0:
                # Generate some samples
                print('=' * 80)
                for _ in range(5):
                    feed = sample(random_distribution())
                    sentence = characters(feed)[0]
                    reset_sample_state.run()
                    for _ in range(79):
                        prediction = sample_prediction.eval({sample_input: feed})
                        feed = sample(prediction)
                        sentence += characters(feed)[0]
                    print(sentence)
                print('=' * 80)
            # Mesaure validation set perplexity
            reset_sample_state.run()
            valid_logprob = 0
            for _ in range(valid_size):
                b = valid_batches.next()
                predictions = sample_prediction.eval({sample_input: b[0]})
                valid_logprob = valid_logprob + logprob(predictions, b[1])
            print("Validation set perplexity: %.2f" % float(np.exp(valid_logprob / valid_size)))

Initialized
Training weights on Text8
Average loss at step 0: 3.294022 learning rate: 10.000000
Minibatch perplexity: 26.95
Average loss at step 100: 2.591139 learning rate: 10.000000
Minibatch perplexity: 11.54
Average loss at step 200: 2.261024 learning rate: 10.000000
Minibatch perplexity: 9.23
Average loss at step 300: 2.103871 learning rate: 10.000000
Minibatch perplexity: 6.86
Average loss at step 400: 2.014105 learning rate: 10.000000
Minibatch perplexity: 6.68
Average loss at step 500: 1.945172 learning rate: 10.000000
Minibatch perplexity: 7.31
Average loss at step 600: 1.911439 learning rate: 10.000000
Minibatch perplexity: 6.87
Average loss at step 700: 1.869281 learning rate: 10.000000
Minibatch perplexity: 6.25
Average loss at step 800: 1.837627 learning rate: 10.000000
Minibatch perplexity: 6.33
Average loss at step 900: 1.811218 learning rate: 10.000000
Minibatch perplexity: 5.81
Average loss at step 1000: 1.830704 learning rate: 10.000000
Minibatch perplexity: 6.50
Aver