<a href="https://colab.research.google.com/github/mvenouziou/text_generator/blob/main/Mo_Text_Generator_char_level.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Generation RNN

This program constructs an unsupervised character-level sequence model that can generate text according to a distribution learned from the dataset.

In [17]:
#### PACKAGE IMPORTS ####
# ML design
import tensorflow as tf
from tensorflow import keras
!pip install -q tensorflow-text
import tensorflow_text as text

# data handling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import random

# other
import urllib.request
import os
import json
import pickle

##### GLOBAL VARIABLES
File directories and hyperparameters

In [18]:
# GLOBAL VARIABLES
# Configuration shared by the whole notebook. The functions defined below read
# these names as default argument values, so this cell must run before them.

# hyperparameters
BATCH_SIZE = 32  # default batch size for dataset batching and for the model input
EMBEDDING_DIM = 256  # default embedding width used by get_model

# name of the model; used as the sub-folder under which all artifacts are stored
AUTHOR = 'assorted'

# file directory structure in Google Drive
GDRIVE_DIR = '/content/gdrive/'
FILEPATH = GDRIVE_DIR + 'MyDrive/Colab_Notebooks/models/' + AUTHOR
CHECKPOINT_DIR = FILEPATH + '/checkpoints/'
CACHE_DIR = FILEPATH + '/cache/'
MODEL_DIR = FILEPATH + '/prediction_model/'
# base location of the dataset files (a URL here; load_parse also accepts a local directory)
DATASETS_DIR = 'https://raw.githubusercontent.com/mvenouziou/text_generator/main/'

# dataset files (.txt and .csv allowed)
# The primary dataset comes first: load_parse truncates every file to at most
# 4x the length of the first file in the list.
DATASET_PRIMARY = 'call_of_the_wild_jack_london.txt'
DATASETS = [DATASET_PRIMARY] + \
            ['sherlock_holmes.txt',
            'mark_twain_speeches.txt',
             'robert_frost_assorted.txt',
             'dorian_gray_oscar_wilde.txt',
            ]

# shuffle dataset preference
# must be False to use stateful RNN structure
SHUFFLE = True   
# stateful RNN works best with a single continuous work
# (so a request for unshuffled data is overridden when several datasets are listed)
if SHUFFLE is False and len(DATASETS) > 1:
    SHUFFLE = True
                

# mount google drive:
# (Colab-only; this is where checkpoints and saved models are written)
from google.colab import drive
drive.mount(GDRIVE_DIR)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


#### Load and inspect the dataset

In [19]:
# Function: loader for .csv files
def prepare_csv(filename, datasets_dir=DATASETS_DIR):
    """
    Load a .csv file of titled texts and flatten it into one string.

    Parameters:
    filename - name of the .csv file; it must provide a 'Name ' (or 'Name')
               column and a 'Content' column
    datasets_dir - prefix (directory or URL, including its trailing '/')
                   that is put directly in front of filename

    Returns a single string made of 'Name: Content' entries separated by
    single spaces.
    """
    # load data
    raw_frame = pd.read_csv(datasets_dir + filename)

    # cleanup specific to the robert frost set:
    # normalise the header name and drop the first data row
    rows = raw_frame.rename(columns={'Name ': 'Name'}).iloc[1:]

    # Empty cells are read as NaN (a float), which cannot be concatenated with
    # text. Treat them as empty strings instead of raising a TypeError.
    # Building new Series (rather than assigning into the sliced frame) also
    # avoids pandas' SettingWithCopy warning.
    names = rows['Name'].fillna('').astype(str)
    contents = rows['Content'].fillna('').astype(str)

    # merge desired text columns and export as a single string
    merged = names + ': ' + contents

    return ' '.join(merged.tolist())


# Function: Load and standardize data files
def load_parse(num_words_per_chunk=30, display_samples=True, 
               shuffle=SHUFFLE, datasets_dir=DATASETS_DIR, datasets=DATASETS):
    """
    Load every dataset file and cut the combined text into word chunks.

    Parameters:
    num_words_per_chunk - number of consecutive words joined into one chunk
    display_samples - if True, print a few random chunks and the totals
    shuffle - if True, shuffle the chunks in place before returning them
              (NOTE: set shuffle=False if using a stateful model)
    datasets_dir - URL (starting with 'http') or local directory holding
                   the dataset files
    datasets - list of file names (.txt and .csv allowed)

    Returns a list of strings; each string holds num_words_per_chunk words
    separated by single spaces (the last chunk may be shorter).
    """
    # normalise the base location so the joined path never gets a doubled '/'
    base_dir = datasets_dir.rstrip('/')

    # every file is cut to at most this multiple of the first file's length
    multiple = 4
    max_length = None  # fixed once the first file has been read

    text_parts = []
    for filename in datasets:

        # for loading .csv files
        _, file_extension = os.path.splitext(filename)
        if file_extension == '.csv':
            # forward the caller's location instead of silently using the default
            new_text = prepare_csv(filename, datasets_dir=base_dir + '/')

        # for loading .txt files
        else:
            location = base_dir + '/' + filename
            if datasets_dir.startswith('http'):
                with urllib.request.urlopen(location) as response:
                    new_text = response.read().decode('utf-8')
            else:
                with open(location, 'r', encoding='utf-8') as handle:
                    new_text = handle.read()

        if max_length is None:
            max_length = multiple * len(new_text)

        text_parts.append(new_text[:max_length])

    # merge data sources
    text_data = ' '.join(text_parts)

    # Create a list of chunks of text
    # remove paragraph / line marks and split up words
    tokenizer = text.WhitespaceTokenizer()

    # tokenize data (outputs bytestrings)
    words_byte = tokenizer.tokenize(text_data).numpy().tolist()

    # convert data to string format
    words = [byte.decode() for byte in words_byte]

    # combine into lists of consecutive words
    text_chunks = [' '.join(words[i : i + num_words_per_chunk])
                   for i in range(0, len(words), num_words_per_chunk)]

    # shuffle data
    if shuffle:
        random.shuffle(text_chunks)

    # Display some text samples
    if display_samples:
        # never ask for more samples than there are chunks
        num_samples = min(5, len(text_chunks))
        for chunk in random.sample(text_chunks, num_samples):
            print(chunk)
            print()

        print('len(words):', len(words))
        print('len(text_chunks):', len(text_chunks))

    return text_chunks


# Smoke test: load the corpus once and print a few sample chunks.
# `temp` is not referenced again in the visible notebook.
temp = load_parse(num_words_per_chunk=30, shuffle=True, display_samples=True)

the Quakers William Robinson, Marmaduke Stevenson, et al. Your tribe chased them out of the country for their religion's sake; promised them death if they came back; for your ancestors

was snapping like a demon. Once, his teeth closed on the fore leg of a husky, and he crunched down through the bone. Pike, the malingerer, leaped upon the crippled

United States.” I thank you all out of my heart for this fraternal welcome, and it seems almost too fine, almost too magnificent, for a humble Missourian such as I

No, I don't mean that. It was the first time I ever withdrew a watermelon. It was the first time I ever extracted a watermelon. That is exactly the word

in the wheel-pit?" "They say sometime was wasted on the belt—- Old streak of leather—doesn't love me much Because I made him spit fire at my knuckles, The way Ben

len(words): 327797
len(text_chunks): 10927


#### Encode data for model

In [20]:
# Function: Create and fit tokenizer object
def create_character_tokenizer(list_of_strings):
    """
    Build a character-level Keras Tokenizer and fit it on the given texts.

    Parameter:
    list_of_strings - list of text strings used to build the vocabulary

    Returns the fitted tokenizer. The vocabulary size is not capped, no
    characters are filtered out and capitalization is kept.
    """
    tokenizer_options = dict(
        num_words=None,   # number of tokens is not limited
        filters=None,     # no characters filtered
        lower=False,      # original capitalization retained
        char_level=True,  # tokens created at character level
    )

    char_tokenizer = tf.keras.preprocessing.text.Tokenizer(**tokenizer_options)
    char_tokenizer.fit_on_texts(list_of_strings)

    return char_tokenizer


# Function: apply tokenizer
def strings_to_sequences(tokenizer, list_of_strings):
    """
    Encode each string as a list of integer tokens.

    Parameters:
    tokenizer - fitted tokenizer exposing texts_to_sequences
    list_of_strings - list of text strings to encode

    Returns whatever tokenizer.texts_to_sequences returns for the input.
    """
    encoded = tokenizer.texts_to_sequences(list_of_strings)
    return encoded

#### Create input and target Datasets for stateful RNN

In [21]:
# Function: Apply padding for uniform length
def make_padded_dataset(sequence_chunks, max_len=500):
    """
    Pad (or truncate) tokenized sequences to one common length.

    Parameters:
    sequence_chunks - list of lists of integer tokens
    max_len - length of every row in the result

    Returns a 2D int32 numpy array. Shorter sequences are padded with zeros
    at their start; longer sequences are truncated at their start.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

    padded = pad_sequences(sequences=sequence_chunks,
                           maxlen=max_len,
                           dtype='int32',
                           padding='pre',
                           truncating='pre',
                           value=0.0)
    return padded

In [22]:
def create_inputs_and_targets(array_of_sequences):
    """
    Split token sequences into next-token training pairs.

    Parameter:
    array_of_sequences - 2D numpy array of token sequences (one per row)

    Returns the tuple (inputs, targets). inputs holds every row without its
    last token and targets holds every row without its first token, so
    targets[:, t] is the token that follows inputs[:, t].
    """
    all_but_last = slice(None, -1)
    all_but_first = slice(1, None)

    return (array_of_sequences[:, all_but_last],
            array_of_sequences[:, all_but_first])
    

Note: Our text chunks are not independent random samples: content from earlier in a text can inform later predictions. A 'stateful' model captures some of this dependence by carrying its hidden state from one batch to the next. This only works when the data is a single continuous (unshuffled) text; do not use stateful batching with shuffled data or with several unrelated texts.



In [23]:
# Function: data prep to create stateful RNN batches
def preprocess_stateful(input, target, batch_size=BATCH_SIZE):
    """
    Reorder examples so consecutive batches continue each other row by row.

    After reordering, row j of batch i+1 is the example that directly follows
    row j of batch i in the original order, which is the layout a stateful
    RNN needs to carry its state from one batch to the next.

    Parameters:
    input - array of input sequences, one example per row, in reading order
    target - array of target sequences aligned with input
    batch_size - number of examples per batch

    Returns the tuple (input, target) in the new order. Trailing examples
    that do not fill a complete batch are dropped.
    """
    # Prepare input and output arrays for training the stateful RNN
    num_examples = input.shape[0]

    # we'll reduce the sample size so all batches have the same dimension
    num_processed_examples = num_examples - (num_examples % batch_size)

    # restrict data to new sample size
    input_cropped = input[:num_processed_examples]
    target_cropped = target[:num_processed_examples]

    # num steps (batches) needed per epoch to process all the examples.
    # This must use batch_size: a hard-coded 32 produced a wrong ordering
    # for every other batch size.
    steps = num_processed_examples // batch_size

    # Order in which samples are processed: batch i holds examples
    # i, i + steps, i + 2*steps, ...  Laying the indices out as a
    # (batch_size, steps) grid and reading it column by column gives exactly
    # that sequence without repeated concatenation.
    inx = (np.arange(num_processed_examples)
             .reshape(batch_size, steps)
             .T
             .ravel())

    # reorder the data
    return input_cropped[inx], target_cropped[inx]

In [24]:
# Function: Split into training and validation Dataset objects
def make_Dataset(input, target, validation_split=.2,
                 batch_size=BATCH_SIZE):
    """
    Split the arrays into training and validation tf.data Datasets.

    Parameters:
    input - array of input sequences
    target - array of target sequences aligned with input
    validation_split - fraction of the examples, taken from the start of the
                       arrays, that goes to the validation set
    batch_size - number of examples per batch (incomplete batches are dropped)

    Returns the tuple (train_dataset, valid_dataset).
    """
    # convert a pair of arrays to batched, prefetched tensorflow Dataset
    def convert_to_Dataset(input_array, target_array):
        pairs = tf.data.Dataset.from_tensor_slices((input_array, target_array))
        batched = pairs.batch(batch_size, drop_remainder=True)
        return batched.prefetch(tf.data.experimental.AUTOTUNE)

    # the first `validation_size` examples are held out for validation
    validation_size = int(validation_split * input.shape[0])

    train_dataset = convert_to_Dataset(input[validation_size:],
                                       target[validation_size:])
    valid_dataset = convert_to_Dataset(input[:validation_size],
                                       target[:validation_size])

    return train_dataset, valid_dataset

#### Define RNN model

In [25]:
# Function: Model Definition
def get_model(vocab_size, batch_size=BATCH_SIZE, embedding_dim=EMBEDDING_DIM):
    """ Defines and compiles a Sequential RNN

    Parameters:
    vocab_size - number of distinct token ids (including the padding id 0)
    batch_size - fixed batch size of the model input; the GRU layers are
                 stateful, so the batch size is part of the model definition
    embedding_dim - width of the embedding and of the intermediate Dense layer

    Returns the compiled model. Its output holds unnormalized logits over the
    vocabulary for every timestep.
    """

    # Import the layers from tensorflow.keras, the same package that provides
    # `keras.Sequential` below, instead of the standalone `keras` package.
    from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout

    model = keras.Sequential([
                Embedding(input_dim=vocab_size, 
                          output_dim=embedding_dim,
                          mask_zero=True,  # token 0 is padding
                          batch_input_shape=(batch_size, None)
                          ),
                GRU(units=1024, 
                    stateful=True, 
                    return_sequences=True,
                    ),
                # named so the prediction code can look this layer up
                GRU(units=1024, 
                    stateful=True, 
                    return_sequences=True,
                    name='GRU_1',
                    ),
                Dense(embedding_dim
                      ),
                Dropout(rate=.05
                         ),
                # no activation: the loss below is configured with from_logits=True
                Dense(units=vocab_size, 
                      activation=None,
                      )          
    ])

    model.compile(optimizer='adam', 
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(
                                                        from_logits=True),
                  metrics=['sparse_categorical_accuracy', 
                           'sparse_categorical_crossentropy']
                  )

    return model

In [26]:
# Function: Train model
def train_model(model, train_data, validation_data, epochs, 
                batch_size=BATCH_SIZE, filepath=FILEPATH, 
                checkpoint_dir=CHECKPOINT_DIR):
    """
    Fit the model, writing weight checkpoints and stopping early if needed.

    Parameters:
    model - compiled Keras model
    train_data - batched tf.data Dataset of (input, target) pairs
    validation_data - batched tf.data Dataset used for validation
    epochs - number of Keras epochs; each one iterates over 100 repetitions
             of train_data
    batch_size - only used to derive the checkpoint frequency; the datasets
                 are already batched
    filepath - unused; kept so existing callers keep working
    checkpoint_dir - directory prefix for the checkpoint files

    Returns the tuple (model, history).
    """

    from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

    # define checkpoint file naming format
    checkpoint_filename = checkpoint_dir + \
            'ckpt_{epoch:02d}_{sparse_categorical_accuracy:04f}.ckpt'

    # set checkpoint options
    # (weights are written every `save_freq` batches; derived from the
    # batch_size argument rather than the global constant)
    save_freq = 4 * batch_size
    checkpoint = ModelCheckpoint(
                        filepath=checkpoint_filename,
                        monitor='sparse_categorical_accuracy',
                        save_weights_only=True,
                        save_best_only=False,
                        save_freq=save_freq,
                        verbose=1)

    # define early stopping criteria
    stopping = EarlyStopping(
                    monitor="val_loss",
                    min_delta=1e-2,
                    patience=2,
                    verbose=1,
                )

    # Validate on the dataset that was passed in (previously the global
    # `valid_data` was read, ignoring the argument). `batch_size` is not
    # forwarded to fit() because the inputs are already-batched Datasets.
    history = model.fit(train_data.repeat(100), 
                        validation_data=validation_data.repeat(100), 
                        epochs=epochs, 
                        validation_steps=save_freq,  # validation batches per run
                        callbacks=[checkpoint, stopping],
                        )

    return model, history

### Create Text Generation Function
Adapt trained model

In [27]:
# Final Prediction Function #######

# Use the model to generate a token sequence
def make_prediction(init_string, num_generation_steps, precision_reduction=0,
                    model_name=AUTHOR, print_result=False):
    """
    Generate text one character at a time, starting from init_string.

    Uses the module-level `prediction_model` (batch size 1) and `tokenizer`.

    Parameters:
    init_string - seed text; it must contain at least one character known
                  to the tokenizer
    num_generation_steps - number of characters to generate
    precision_reduction - forwarded to sample_token
    model_name - unused; kept so existing callers keep working
    print_result - if True, print the generated text

    Returns the seed text followed by the generated characters.
    Raises ValueError if init_string encodes to an empty token sequence.
    """
    our_model = prediction_model
    our_tokenizer = tokenizer

    GRU_layer = our_model.get_layer('GRU_1')

    token_sequence = our_tokenizer.texts_to_sequences([init_string])
    if not token_sequence[0]:
        raise ValueError('init_string must contain at least one character '
                         'known to the tokenizer')

    # Every stateful layer keeps its state between calls. Only 'GRU_1' is
    # reset by get_logits, so clear all stateful layers here; otherwise the
    # previous generation leaks into this one through the other layers.
    for layer in our_model.layers:
        if getattr(layer, 'stateful', False):
            layer.reset_states()

    initial_state = None
    input_sequence = token_sequence

    for _ in range(num_generation_steps):
        logits = get_logits(our_model, input_sequence, initial_state=initial_state)
        sampled_token = sample_token(logits, precision_reduction)
        token_sequence[0].append(sampled_token)
        input_sequence = [[sampled_token]]  # use only last letter because previous model state is carried forward
        initial_state = GRU_layer.states[0].numpy()

    # sequences_to_texts separates the single-character tokens with spaces;
    # keeping every second character removes those separators again
    predicted_text = our_tokenizer.sequences_to_texts(token_sequence)[0][::2]

    if print_result:
        print(predicted_text)

    return predicted_text

*************************************
*************************************

### Implement Functions to Create Models

*************************************
*************************************

##### Preprocess Data

In [28]:
def get_tokenizer():
    """
    Load the corpus and fit a character-level tokenizer on it.

    Returns the tuple (tokenizer, text_chunks), where text_chunks are the
    25-word chunks the tokenizer was fitted on.
    """
    # Load Data (prints a few samples; shuffling follows the SHUFFLE setting)
    corpus_chunks = load_parse(num_words_per_chunk=25,
                               display_samples=True,
                               shuffle=SHUFFLE)

    # Encoding Map: fitted tokenizer
    return create_character_tokenizer(corpus_chunks), corpus_chunks


# Fit the tokenizer on the freshly loaded corpus. Both results are module-level
# globals: later cells pass them to preprocess_data and the model builders.
tokenizer, text_chunks = get_tokenizer()

is Lestrade! Good-afternoon, Lestrade! You will find an extra tumbler upon the sideboard, and there are cigars in the box.” The official detective was attired

natural, and affectionate then. You were the most unspoiled creature in the whole world. Now, I don't know what has come over you. You talk

Each day the sun rose earlier and set later. It was dawn by three in the morning, and twilight lingered till nine at night. The

several times and waited. No: everything was still. It was merely the sound of his own footsteps. When he reached the library, he saw the

will leave the country, sir. Then the charge against him will break down.” “Hum! We will talk about that. And now let us hear a

len(words): 327797
len(text_chunks): 13112


In [29]:
def preprocess_data(tokenizer, text_chunks, cache_dir=CACHE_DIR):
    """
    Turn text chunks into batched training and validation Datasets.

    Parameters:
    tokenizer - fitted character tokenizer
    text_chunks - list of text strings
    cache_dir - currently unused (dataset caching is disabled)

    Returns the tuple (train_data, valid_data). Also prints the training
    Dataset description.
    """
    # encode text -> pad to uniform length -> split into inputs and targets
    encoded_chunks = strings_to_sequences(tokenizer, text_chunks)
    padded = make_padded_dataset(encoded_chunks)
    inputs, targets = create_inputs_and_targets(padded)

    # rearrange data into batches for use in stateful model
    # (cannot use if data is shuffled)
    use_stateful_layout = (SHUFFLE == False)
    if use_stateful_layout:
        inputs, targets = preprocess_stateful(inputs, targets)

    # Create the training and validation Datasets
    train_data, valid_data = make_Dataset(inputs, targets, validation_split=.2)

    print(train_data)

    return train_data, valid_data


# Build the batched training / validation Datasets used by the training cells below.
train_data, valid_data = \
    preprocess_data(tokenizer, text_chunks)

<PrefetchDataset shapes: ((32, 499), (32, 499)), types: (tf.int32, tf.int32)>


##### Create / Train Model

In [30]:
def build_init_model(train_data, valid_data, tokenizer,
                     filepath=FILEPATH, show_summary=False):
    """
    Create a fresh model, train it for one epoch and save it to disk.

    Parameters:
    train_data - training Dataset
    valid_data - validation Dataset
    tokenizer - fitted tokenizer; its vocabulary determines the model size
    filepath - location the trained model is saved to
    show_summary - if True, print the model summary before training

    Returns the tuple (model, history).
    """
    # one slot per known character plus index 0, which is used for padding
    vocab_size = len(tokenizer.word_index) + 1
    fresh_model = get_model(vocab_size)

    if show_summary:
        fresh_model.summary()

    # train the model for one epoch
    trained_model, history = train_model(fresh_model, train_data,
                                         valid_data, epochs=1)

    # Save model
    trained_model.save(filepath)

    return trained_model, history

In [None]:
# Load existing model (if it exists). 
# Otherwise, create a new model

# get model weights
# (latest_checkpoint returns None when no checkpoint has been saved yet)
checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)

if checkpoint is not None:
    # recreate model and restore the trained weights.
    # Errors while loading are no longer swallowed by a bare `except:`,
    # so a broken checkpoint is reported instead of silently retraining.
    init_model = get_model(len(tokenizer.word_index) + 1)
    init_model.load_weights(checkpoint)
    init_model.summary()

else:
    # no checkpoint available: build and train a new model
    init_model, init_history = \
        build_init_model(train_data, valid_data, tokenizer, show_summary=True)
        


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (32, None, 256)           24576     
_________________________________________________________________
gru_2 (GRU)                  (32, None, 1024)          3938304   
_________________________________________________________________
GRU_1 (GRU)                  (32, None, 1024)          6297600   
_________________________________________________________________
dense_4 (Dense)              (32, None, 256)           262400    
_________________________________________________________________
dropout_2 (Dropout)          (32, None, 256)           0         
_________________________________________________________________
dense_5 (Dense)              (32, None, 96)            24672     
Total params: 10,547,552
Trainable params: 10,547,552
Non-trainable params: 0
__________________________________________

In [None]:
# add additional training epochs
train_more = True  # set to False to skip the extra training
additional_epochs = 1

if train_more:  
    init_model, init_history = train_model(init_model, 
                                           train_data, 
                                           valid_data, 
                                           epochs=additional_epochs)

    # save model 
    # note: model weights also saved via checkpoint callback
    # (FILEPATH is the module-level constant; the lower-case name `filepath`
    # is not defined at this level and raised a NameError)
    init_model.save(FILEPATH)

********************************************************
********************************************************

##### Adapt model to accept single line inputs (batch size = 1).

In [None]:
# Function: produce modified model for making text predictions
def get_prediction_model(tokenizer, author=AUTHOR, 
                         checkpoint_dir=CHECKPOINT_DIR):
    """
    Rebuild the model with batch size 1 and load the latest trained weights.

    Parameters:
    tokenizer - fitted tokenizer; its vocabulary determines the model size
    author - unused; kept so existing callers keep working
    checkpoint_dir - directory searched for the latest checkpoint

    Returns the model with the weights loaded and `trainable` set to False.
    Raises FileNotFoundError if checkpoint_dir holds no checkpoint.
    """
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    if checkpoint_file is None:
        # fail with a clear message instead of passing None to load_weights
        raise FileNotFoundError('No checkpoint found in ' + str(checkpoint_dir)
                                + '; train the model first.')

    # get model, setting batch size = 1
    user_model = get_model(len(tokenizer.word_index) + 1, batch_size=1)

    # load weights
    user_model.load_weights(checkpoint_file)
    user_model.trainable = False

    return user_model


# Store trained model separate from checkpoints
def save_model_update(model_dir=MODEL_DIR):
    """
    Save the prediction model and the tokenizer to model_dir.

    Builds the batch-size-1 model from the module-level `tokenizer` and the
    latest checkpoint, saves it, and pickles the tokenizer next to it as
    'tokenizer.pickle'.

    Returns None.
    """
    # save model
    get_prediction_model(tokenizer).save(model_dir)

    # save tokenizer
    tokenizer_path = model_dir + '/tokenizer.pickle'
    with open(tokenizer_path, 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return None

# Generate text from the model

In [None]:
# load prediction model
def load_model_update(model_dir=MODEL_DIR):
    """
    Load the saved prediction model and its tokenizer from model_dir.

    Returns the tuple (prediction_model, tokenizer).
    """
    restored_model = tf.keras.models.load_model(model_dir)

    # load tokenizer
    # NOTE: pickle.load can execute arbitrary code; only load a
    # tokenizer.pickle file that you saved yourself.
    tokenizer_path = model_dir + '/tokenizer.pickle'
    with open(tokenizer_path, 'rb') as handle:
        restored_tokenizer = pickle.load(handle)

    return restored_model, restored_tokenizer


# Load the saved batch-size-1 model; this also replaces the global `tokenizer`
# with the pickled copy. make_prediction reads both globals.
# NOTE(review): no visible cell calls save_model_update() before this point —
# presumably it was run by hand once; confirm before relying on Run All.
prediction_model, tokenizer = load_model_update() 
prediction_model.summary()

In [None]:
# Function: Outputs weighted predictions for next token 
# (as logits / log-odds ratio)
def get_logits(model, token_sequence, initial_state=None):
    """
    Parameters:
    model - our prediction model set with batch size = 1
    token_sequence - nested list of integer tokens (a single sequence)
    initial_state - state loaded into the 'GRU_1' layer before the forward
                    pass; None resets that layer's state

    Returns a numpy array holding the logits predicted at the last position
    of the sequence.
    """
    # carry forward previous state from GRU layer
    model.get_layer('GRU_1').reset_states(initial_state)

    # Get the model's next token prediction (as logits)
    token_tensor = tf.constant(token_sequence)
    logits_per_position = model(token_tensor)

    return logits_per_position[:, -1, :].numpy()


# Function: selects a value from logits prediction
def sample_token(logits, precision_reduction=0):
    """
    Draw one token id at random from a logits distribution.

    Parameters:
    logits - array of logits for a single position
    precision_reduction - strength of the random rescaling applied to the
                          logits before sampling; 0 leaves them unchanged

    Returns the sampled token id.
    """
    # fuzz factor: random multipliers centred on 1
    fuzz = tf.random.normal(shape=logits.shape, mean=1, stddev=.2)

    # NOTE(review): for precision_reduction > 0 the logits are multiplied by
    # roughly (1 + precision_reduction), which sharpens the distribution in
    # addition to adding noise — confirm this is the intended effect.
    fuzzed_logits = logits * (1 + precision_reduction * fuzz)

    # choose a value from the (rescaled) logits distribution
    draw = tf.random.categorical(logits=fuzzed_logits, num_samples=1)

    # convert to integer
    return draw[0, 0].numpy()

In [None]:
# test model
# Smoke test: generate and print text from the loaded prediction model.
init_string = 'EMMY:'  # starting point for prediction
num_generation_steps = 300  # max number characters to produce


# precision_reduction=5 is forwarded to sample_token, which rescales the logits before sampling
make_prediction(init_string, num_generation_steps, 
                precision_reduction=5, print_result=True)


In [None]:
def generate_text(starting_text, precision_reduction=0, author='buy_local'):
    """
    Generate 350 characters that continue starting_text.

    Parameters:
    starting_text - seed text for the generation
    precision_reduction - forwarded to make_prediction
    author - forwarded to make_prediction as model_name

    Returns the seed text plus the generated characters, followed by '...'.
    """
    max_new_characters = 350  # max number characters to produce

    generated = make_prediction(starting_text,
                                max_new_characters,
                                precision_reduction=precision_reduction,
                                model_name=author,
                                print_result=False)

    return generated + '...'


# Example call: generate a continuation of the seed text 'hendrix'.
# NOTE(review): `author` is forwarded as model_name, which make_prediction
# never reads, so 'buy_local' has no effect on the result here.
generate_text(starting_text='hendrix', author='buy_local')