In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
import os 
os.chdir("/content/drive/My Drive/Colab Notebooks/")

In [4]:
import argparse
import pickle
from math import ceil
from time import localtime
import numpy as np

import tensorflow as tf


def files_to_tfrecord_fixedlen(*files, out_path, seq_len=200, overlap=0):
    """Process a number of text files into TFRecords data file.

    All files are conjoined into one big string. For simplicity, we split this
    string into equal-length sequences of seq_len-1 characters each.
    Furthermore, a special "beginning-of-sequence" character is prepended to
    each sequence, and the characters are mapped to integer indices
    representing one-hot vectors. We store the processed sequences into a
    TFrecords file; we also store the character-index mapping (vocabulary).

    Parameters:
        files: Paths to the text files to use for the corpus.
        out_path: Path to store the processed corpus, *without* file extension!
        seq_len: Requested sequence length.
        overlap: Float between 0 and 1. How much overlap there should be
                 between sequences. E.g. with a seq_len of 200 and an overlap
                 of 0.1, we only advance 180 characters between successive
                 sequences. Rounded down.
    """
    if not 0 <= overlap < 1:
        raise ValueError("Invalid overlap specified: {}. Please use a number "
                         "between 0 (inclusive) and 1 (exclusive).")

    full_text = "\n".join(open(file).read() for file in files)
    # we create a mapping from characters to integers, including a special
    # "beginning of sequence" character
    chars = set(full_text)
    ch_to_ind = dict(zip(chars, range(1, len(chars)+1)))
    ch_to_ind["<S>"] = 0

    seqs = text_to_seqs(full_text, seq_len, ch_to_ind, overlap)
    print("Split input into {} sequences...".format(len(seqs)))

    with tf.io.TFRecordWriter(out_path + ".tfrecords") as writer:
        for ind, seq in enumerate(seqs):
            tfex = tf.train.Example(features=tf.train.Features(feature={
                "seq": tf.train.Feature(int64_list=tf.train.Int64List(value=seq))
            }))
            writer.write(tfex.SerializeToString())
            if (ind + 1) % 100 == 0:
                print("Serialized {} sequences...".format(ind+1))
    pickle.dump(ch_to_ind, open(out_path + "_vocab", mode="wb"))


def text_to_seqs(text, seq_len, mapping, overlap):
    """Convert a string to a list of lists of equal length.

    Each character is mapped to its index as given by the mapping parameter.
    Right now this will actually use sequences *one character shorter* than
    requested, but prepend a "beginning of sequence" character.

    Parameters:
        text: String, the corpus.
        seq_len: Requested sequence length. See note above.
        mapping: Dict mapping characters to indices.
        overlap: Float between 0 and 1. How much overlap there should be
                 between sequences. E.g. with a seq_len of 200 and an overlap
                 of 0.1, we only advance 180 characters between successive
                 sequences. Rounded up.

    Returns:
        List of split character-index sequences.
    """
    use_bos = True
    if use_bos:
        seq_len -= 1

    steps_to_advance = seq_len - int(ceil(overlap * seq_len))

    seqs = [[mapping["<S>"]] + chs_to_inds(text[ind:(ind+seq_len)], mapping)
            for ind in range(0, len(text), steps_to_advance)]
    # we throw away any sequences that ended up shorter (usually at the very end)
    return [seq for seq in seqs if len(seq) == len(seqs[0])]


def chs_to_inds(char_list, mapping):
    """Helper to convert a list of characters to a list of corresponding indices.

    Parameters:
        char_list: List of characters (or string).
        mapping: Dict mapping characters to indices.

    Returns:
        List of character indices.
    """
    return [mapping[ch] for ch in char_list]


def parse_seq(example_proto, seq_len):
    """
    Needed to read the stored .tfrecords data -- import this in your
    training script.

    Parameters:
        example_proto: Protocol buffer of single example.
        seq_len: The sequence length corresponding to the example.

    Returns:
        Tensor containing the parsed sequence.
    """
    features = {"seq": tf.io.FixedLenFeature((seq_len,), tf.int64)}
    parsed_features = tf.io.parse_single_example(example_proto, features)
    return tf.cast(parsed_features["seq"], tf.int32)


files_to_tfrecord_fixedlen("input.txt", out_path="output",
                            seq_len=200, overlap=0)

Split input into 5604 sequences...
Serialized 100 sequences...
Serialized 200 sequences...
Serialized 300 sequences...
Serialized 400 sequences...
Serialized 500 sequences...
Serialized 600 sequences...
Serialized 700 sequences...
Serialized 800 sequences...
Serialized 900 sequences...
Serialized 1000 sequences...
Serialized 1100 sequences...
Serialized 1200 sequences...
Serialized 1300 sequences...
Serialized 1400 sequences...
Serialized 1500 sequences...
Serialized 1600 sequences...
Serialized 1700 sequences...
Serialized 1800 sequences...
Serialized 1900 sequences...
Serialized 2000 sequences...
Serialized 2100 sequences...
Serialized 2200 sequences...
Serialized 2300 sequences...
Serialized 2400 sequences...
Serialized 2500 sequences...
Serialized 2600 sequences...
Serialized 2700 sequences...
Serialized 2800 sequences...
Serialized 2900 sequences...
Serialized 3000 sequences...
Serialized 3100 sequences...
Serialized 3200 sequences...
Serialized 3300 sequences...
Serialized 3400 s

In [28]:
def train(num_steps, input_file, vocab_size, seq_length=200):
    dataset = tf.data.TFRecordDataset(input_file)
    dataset = dataset.map(lambda x: parse_seq(x, seq_length))
    dataset = dataset.batch(128)

    hidden_size = 512  # size of hidden layer of neurons
    learning_rate = 0.01

    # model parameters
    Wxh = tf.Variable(np.random.randn(vocab_size, hidden_size).astype('float32') * 0.01)  # input to hidden
    Whh = tf.Variable(np.random.randn(hidden_size, hidden_size).astype('float32') * 0.01)  # hidden to hidden
    Why = tf.Variable(np.random.randn(hidden_size, vocab_size).astype('float32') * 0.01)  # hidden to output
    bh = tf.Variable(np.zeros((1, hidden_size)).astype('float32'))  # hidden bias
    by = tf.Variable(np.zeros((1, vocab_size)).astype('float32'))  # output bias

    for  step in range(num_steps):
        total_loss = tf.zeros(1)
        for raw_record in dataset.take(1):
            x_seq = raw_record
            itr = 0
            for i in range(1, seq_length):
                t = i - 1
                x_seq_hot = tf.one_hot(x_seq, depth=vocab_size, axis=-1)

                y = x_seq[:, t + 1]
                y_hot = tf.one_hot(y, vocab_size, axis=-1)

                if itr == 0:
                    h_init = tf.zeros((1, hidden_size))

                x_chat_t = x_seq_hot[:, t, :]


                with tf.GradientTape() as tape:
                    a = tf.matmul(x_chat_t, Wxh) + tf.matmul(h_init, Whh) + bh
                    h_init = tf.nn.relu(a)
                    out = tf.matmul(h_init, Why) + by
                    loss = tf.nn.softmax_cross_entropy_with_logits(labels=y_hot, logits=out)
                    mean_loss = tf.reduce_mean(loss)
                    total_loss = total_loss + mean_loss


                grads = tape.gradient(mean_loss, [Wxh, Whh, bh, Why, by])
                Wxh.assign_sub(learning_rate * grads[0])
                Whh.assign_sub(learning_rate * grads[1])
                bh.assign_sub(learning_rate * grads[2])
                Why.assign_sub(learning_rate * grads[3])
                by.assign_sub(learning_rate * grads[4])

                itr = itr + 1
                print(mean_loss)

    all_weights = np.array([Wxh, Whh,bh,Why, by])
    np.save('rnn_weights', all_weights)

file = 'output.tfrecords'
vocab = 'output_vocab'
train_model = True
apply_model = False

with open(vocab, "rb") as f:
    vocab = pickle.load(f)

if train_model:
    input_file = file
    train(100, input_file, len(vocab))





tf.Tensor(4.1892123, shape=(), dtype=float32)
tf.Tensor(4.188915, shape=(), dtype=float32)
tf.Tensor(4.188575, shape=(), dtype=float32)
tf.Tensor(4.1880274, shape=(), dtype=float32)
tf.Tensor(4.1877317, shape=(), dtype=float32)
tf.Tensor(4.1869974, shape=(), dtype=float32)
tf.Tensor(4.186994, shape=(), dtype=float32)
tf.Tensor(4.186244, shape=(), dtype=float32)
tf.Tensor(4.1866436, shape=(), dtype=float32)
tf.Tensor(4.1859922, shape=(), dtype=float32)
tf.Tensor(4.1853027, shape=(), dtype=float32)
tf.Tensor(4.185504, shape=(), dtype=float32)
tf.Tensor(4.1843657, shape=(), dtype=float32)
tf.Tensor(4.1837597, shape=(), dtype=float32)
tf.Tensor(4.182946, shape=(), dtype=float32)
tf.Tensor(4.183378, shape=(), dtype=float32)
tf.Tensor(4.181553, shape=(), dtype=float32)
tf.Tensor(4.1807933, shape=(), dtype=float32)
tf.Tensor(4.18349, shape=(), dtype=float32)
tf.Tensor(4.1812553, shape=(), dtype=float32)
tf.Tensor(4.1804433, shape=(), dtype=float32)
tf.Tensor(4.1792116, shape=(), dtype=float32

In [41]:
def gen_data(input_file, vocab, seq_length=200):
    vocab_size = len(vocab)
    reverse_vocab = dict((y, x) for x, y in vocab.items())
    dataset = tf.data.TFRecordDataset(input_file)
    dataset = dataset.map(lambda x: parse_seq(x, seq_length))
    dataset = dataset.batch(1)
    hidden_size = 512

    all_weights = np.load('rnn_weights.npy', allow_pickle=True)

    [Wxh, Whh,bh,Why, by] = all_weights

    for raw_record in dataset.take(1):
        x_seq = raw_record
        itr = 0
        for i in range(1, seq_length):
            t = i - 1
            x_seq_hot = tf.one_hot(x_seq, depth=vocab_size, axis=-1)

            y = x_seq[:, t + 1]
            y_hot = tf.one_hot(y, vocab_size, axis=-1)

            if itr == 0:
              h_init = tf.zeros((1, hidden_size))

            x_chat_t = x_seq_hot[:, t, :]

            a = tf.matmul(x_chat_t, Wxh) + tf.matmul(h_init, Whh) + bh
            h_init = tf.nn.relu(a)
            out = tf.matmul(h_init, Why) + by

            itr = itr + 1

            y_t = tf.argmax(out, axis=1)
            char = reverse_vocab[y_t.numpy()[0]]
            print(char, end='')


gen_data('output.tfrecords', vocab, seq_length=200)

  n   aot ne  

e  n  to toeue   tnd tote e e te t to tee n 

Aoe  
e  n  th  n 

Aon   aot ne  

er tne tneete  nee  tet e ethutet then thutoten e 
Aoe  
h  n    
ho  n e  

Aen   aot ne  

one   to