In [1]:
# from textgenrnn.gradient_noise import add_gradient_noise
from textgenrnn import textgenrnn
from datetime import datetime
import os

Using TensorFlow backend.


## Check GPU

In [2]:
import tensorflow as tf

# Ask TensorFlow which GPU devices it can see; an empty list means none is visible.
visible_gpus = tf.config.list_physical_devices('GPU')
visible_gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Training

Set the textgenrnn model configuration in the cell below. The default parameters give good results for most workflows (see the [demo notebook](https://github.com/minimaxir/textgenrnn/blob/master/docs/textgenrnn-demo.ipynb) for more information about these parameters).

If you are using an input file where documents are line-delimited, make sure to set `line_delimited` to `True`.

In [6]:
# Architecture settings handed to textgenrnn when the model is built.
model_cfg = dict(
    word_level=True,           # True trains a word-level model (requires more data and a smaller max_length)
    rnn_size=128,              # LSTM cells in each layer (128/256 recommended)
    rnn_layers=3,              # how many LSTM layers to stack (>=2 recommended)
    rnn_bidirectional=False,   # read the text both forwards and backwards; can give a training boost
    max_length=30,             # tokens of context used to predict the next one (20-40 for characters, 5-10 for words recommended)
    max_words=10000,           # maximum number of words to model; the rest are ignored (word-level model only)
)

# Settings for the training run itself.
train_cfg = dict(
    line_delimited=True,   # True when each text sits on its own line in the source file
    num_epochs=10,         # raise to train the model for longer
    gen_epochs=1,          # sample text is generated after this many epochs
    train_size=0.8,        # proportion of the input used for training; < 1.0 keeps the model from learning it perfectly
    dropout=0.0,           # proportion of source tokens randomly ignored each epoch, to help the model generalize
    validation=True,       # if train_size < 1.0, test on the holdout data; makes overall training slower
    is_csv=False,          # True if the file is a CSV exported from Excel/BigQuery/pandas
    multi_gpu=False,       # NOTE(review): presumably toggles multi-GPU training -- confirm against the textgenrnn fork
    dp=True,               # NOTE(review): presumably switches on the differential-privacy training path -- confirm
    noise_eta=0.2,         # NOTE(review): presumably the eta of the gradient-noise schedule -- confirm
    noise_gamma=0.55,      # NOTE(review): presumably the gamma (decay) of the gradient-noise schedule -- confirm
    clipnorm=1.0,          # NOTE(review): presumably the optimizer's gradient-norm clipping threshold -- confirm
)

In the Colaboratory Notebook sidebar on the left of the screen, select *Files*. From there you can upload files:

![alt text](https://i.imgur.com/TGcZT4h.png)

Upload **any text file** and update the file name in the cell below, then run the cell.

In [8]:
file_name = "data/ptb.train.txt"   # path of the text file to train on
model_name = 'models/eta_02/eta_0.2_gamma_0.55_clip_1'   # change to set file name of resulting trained models/texts

# model_name carries a directory prefix and is used as the file-name stem of the
# saved model artifacts, so make sure that directory exists before training
# tries to write into it. A bare name (no directory part) needs nothing created.
model_dir = os.path.dirname(model_name)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

The next cell will start the actual training. And thanks to the power of Keras's CuDNN layers, training is super-fast when compared to CPU training on a local machine!

Ideally, you want a training loss less than `1.0` in order for the model to create sensible text consistently.

In [10]:
# Fresh textgenrnn instance; model_name is the name under which the trained model is saved.
textgen = textgenrnn(name=model_name)

# Line-delimited corpora are trained text-by-text; otherwise the file is treated as one large text.
train_function = textgen.train_from_file if train_cfg['line_delimited'] else textgen.train_from_largetext_file

train_function(
    file_path=file_name,
    new_model=True,   # train from scratch rather than continuing from existing weights
    num_epochs=train_cfg['num_epochs'],
    gen_epochs=train_cfg['gen_epochs'],
    batch_size=1024,
    train_size=train_cfg['train_size'],
    dropout=train_cfg['dropout'],
    validation=train_cfg['validation'],
    is_csv=train_cfg['is_csv'],
    # Previously defined in train_cfg but never forwarded, so editing it had no effect.
    multi_gpu=train_cfg['multi_gpu'],
    rnn_layers=model_cfg['rnn_layers'],
    rnn_size=model_cfg['rnn_size'],
    rnn_bidirectional=model_cfg['rnn_bidirectional'],
    max_length=model_cfg['max_length'],
    # Previously defined in model_cfg but never forwarded, so the vocabulary cap
    # configured above was silently ignored.
    max_words=model_cfg['max_words'],
    dim_embeddings=100,
    word_level=model_cfg['word_level'],
    # Options specific to this textgenrnn fork, forwarded unchanged from train_cfg.
    dp=train_cfg['dp'],
    noise_eta=train_cfg['noise_eta'],
    noise_gamma=train_cfg['noise_gamma'],
    clipnorm=train_cfg['clipnorm'],
)

42,067 texts collected.
Training new model w/ 3-layer, 128-cell LSTMs
Implement differential privacy
noise_eta: 0.2
noise_gamma: 0.55
clipnorm: 1.0
Training on 854,826 word sequences.
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 834 steps, validate for 209 steps
Epoch 1/10
Temperature: 0.2
####################
the company said it was n'be sold the n'

the company said it expects to $n million or $n a share a $n million

the company said it expects to $n million or $n a share a share

####################
Temperature: 0.5
####################
the patent who was the n's a criminal number of the n's

the new york city's resignation bush was a lot of of the political and of the country

the company said it expects to $n million of $n million of $n million

####################
Temperature: 1.0
####################
predictions is expected to sell the bank of three jobs

three days n minutes to mesa named of individuals and are being the in case of the foundation that world ju

volume totaled n shares a minute

the equity is expected to close class in a year and mortgage offer and $n a share down

the chance apparently pointed their income

Epoch 8/10
Temperature: 0.2
####################
the company said it will registration to buy back as many as n million shares of honeywell stock said

the company said it will registration with the sale of its fleet of n to n n and n n of the u s market share

the company said it expects to use the proceeds to reduce the debt market for the n n of the company's n million common shares outstanding

####################
Temperature: 0.5
####################
volokh is a good job of making a project that would like a to defend himself from the convicted of her brother's wife

the company has offered to $n million through the notes and then handles it it has n'fully complete

but the company's earnings decline reflected a slower growth and competitive position

####################
Temperature: 1.0
####################
at the 

In [11]:
# Display the configuration of the optimizer attached to the trained model.
trained_optimizer = textgen.model.optimizer
trained_optimizer.get_config()

{'name': 'Adam',
 'clipnorm': 1.0,
 'learning_rate': 0.0004,
 'decay': 0.0,
 'beta_1': 0.9,
 'beta_2': 0.999,
 'epsilon': 1e-07,
 'amsgrad': False,
 'noise_eta': 0.20000000298023224,
 'noise_gamma': 0.550000011920929}

## Load and generate texts

In [None]:
# Rebuild a textgenrnn instance from saved artifacts that share the model_name prefix.
model_name = "0221_1730_word"
textgen = textgenrnn(
    name=model_name,
    vocab_path=f"{model_name}_vocab.json",
    config_path=f"{model_name}_config.json",
    weights_path=f"{model_name}_weights.hdf5",
)

In [None]:
temperature = [1.0, 0.5, 0.2]

# Use a dedicated name for the output file: reusing `file_name` would overwrite
# the training-corpus path, so re-running the training cell afterwards would
# train on the generated text instead.
generated_text_path = model_name+"_generated_text.txt"

# Explicit UTF-8 so writing does not depend on the platform's default encoding.
with open(generated_text_path, "w", encoding="utf-8") as f:
    for temp in temperature:
        # Ten samples per temperature, each section introduced by a header line.
        samples = textgen.generate(10, temperature=temp, return_as_list=True)
        f.write(f"======= temperature: {temp} =======\n")
        f.write("\n".join(samples))
        f.write("\n")