<a href="https://colab.research.google.com/github/ll3091/ANLY-580-01-NLP-Project/blob/master/MedTextGenerationModels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP Project: Text Generation Model Training

In [1]:
# source https://github.com/minimaxir/textgenrnn
! pip install textgenrnn

Collecting textgenrnn
[?25l  Downloading https://files.pythonhosted.org/packages/ad/f8/f1968b2078a9076f481916fba5d98affa019943e4f5764224ffaeb57b7c7/textgenrnn-1.4.1.tar.gz (1.7MB)
[K    100% |████████████████████████████████| 1.7MB 8.0MB/s 
Building wheels for collected packages: textgenrnn
  Running setup.py bdist_wheel for textgenrnn ... [?25l- \ done
[?25h  Stored in directory: /root/.cache/pip/wheels/30/96/f7/bc7042ea671bc79455c244af21050a7a32d604fe2f7a44e322
Successfully built textgenrnn
Installing collected packages: textgenrnn
Successfully installed textgenrnn-1.4.1


In [2]:
from textgenrnn import textgenrnn
from google.colab import drive

Using TensorFlow backend.


In [0]:
# connect to Google drive
# Mounts the Drive at /content/gdrive/. Later cells reach the project folder
# through the relative path ./gdrive/'My Drive'/NLPProject/, which presumably
# relies on the working directory being /content -- TODO confirm.
drive.mount('/content/gdrive/')

In [3]:
# Sanity check: list the working directory to confirm the gdrive mount point
# is present.
! ls

gdrive	sample_data  songs1000.txt  songs100.txt  songs2500.txt  songs500.txt


In [6]:
# Sanity check: list the project folder on Drive to confirm the songs*.txt
# training files are reachable.
! ls gdrive/'My Drive'/NLPProject

DataExploration.ipynb		 songdata.csv
jokes.txt			 songs1000.txt
MedTextGenerationModels.ipynb	 songs100.txt
ModelEvaluation.ipynb		 songs2500.txt
ModelTrainingOutput		 songs500.txt
motivational_quotes.txt		 TrainedModels
ShortTextGenerationModels.ipynb  trump_tweets.txt


## Character-Level RNNs

In [0]:
# model configuration
# Architecture settings forwarded to textgenrnn by the training cell.
model_cfg = dict(
    rnn_size=128,            # cells per recurrent layer
    rnn_layers=4,            # number of stacked recurrent layers
    rnn_bidirectional=True,  # use bidirectional LSTMs
    max_length=40,           # presumably the context window in tokens -- see textgenrnn docs
    max_words=300,           # vocabulary cap
    dim_embeddings=100,      # embedding dimensionality
    word_level=False,        # False => character-level model
)

# training configuration
# Run settings forwarded to textgenrnn by the training cell.
train_cfg = dict(
    line_delimited=False,    # False => train with train_from_largetext_file
    num_epochs=10,           # total training epochs
    gen_epochs=2,            # sample text is printed every gen_epochs epochs
    batch_size=512,
    train_size=0.8,          # presumably the training fraction, rest held out -- confirm
    dropout=0.25,
    max_gen_length=150,      # cap on the length of each printed sample
    validation=True,
    is_csv=False,
)

### Song Lyrics

In [0]:
# Project folder on the mounted Google Drive. Named data_dir rather than dir
# so the builtin dir() is not shadowed for the rest of the kernel session.
data_dir = './gdrive/My Drive/NLPProject/'
# Corpus sizes (number of songs); there is one songs<N>.txt file per size.
nums = [100, 500, 1000, 2500]
# Training file paths, in the same order as nums.
files = ['{}songs{}.txt'.format(data_dir, n) for n in nums]

In [9]:
# Display the generated training-file paths as a quick check before training.
files

['./gdrive/My Drive/NLPProject/songs100.txt',
 './gdrive/My Drive/NLPProject/songs500.txt',
 './gdrive/My Drive/NLPProject/songs1000.txt',
 './gdrive/My Drive/NLPProject/songs2500.txt']

In [0]:
# Train one character-level model per corpus size. Each model is named
# 'char_songs<N>'; the copy cell that follows relies on that name as the
# prefix of the saved config/vocab/weights files.
for file, n in zip(files, nums):
  print('Training with', str(n), 'songs from', file)
  model_name = 'char_songs{}'.format(n)
  textgen = textgenrnn(name=model_name)

  # Pick the textgenrnn entry point that matches the corpus layout.
  if train_cfg['line_delimited']:
    train_function = textgen.train_from_file
  else:
    train_function = textgen.train_from_largetext_file

  train_function(
      file_path=file,
      new_model=True,
      # training settings
      num_epochs=train_cfg['num_epochs'],
      gen_epochs=train_cfg['gen_epochs'],
      batch_size=train_cfg['batch_size'],
      train_size=train_cfg['train_size'],
      dropout=train_cfg['dropout'],
      max_gen_length=train_cfg['max_gen_length'],
      validation=train_cfg['validation'],
      is_csv=train_cfg['is_csv'],
      # model settings
      rnn_layers=model_cfg['rnn_layers'],
      rnn_size=model_cfg['rnn_size'],
      rnn_bidirectional=model_cfg['rnn_bidirectional'],
      max_length=model_cfg['max_length'],
      # max_words is set in model_cfg but was never forwarded, so the
      # configured vocabulary cap was silently ignored.
      max_words=model_cfg['max_words'],
      dim_embeddings=model_cfg['dim_embeddings'],
      word_level=model_cfg['word_level'])

Training with 100 songs from ./gdrive/My Drive/NLPProject/songs100.txt
Training new model w/ 4-layer, 128-cell Bidirectional LSTMs
Training on 94,607 character sequences.
Epoch 1/10
Epoch 2/10
####################
Temperature: 0.2
####################
                                                                                                                                                      

 e                                                                                                                     
             o                

                                                                o     o                                                    eo  a                      

####################
Temperature: 0.5
####################
   a   a tol h       e   b er  o      e aom v
     h  i     ae  h
u   
 o  lr    or  
 o
 a  o t   ta  a   u  e  e ma   ty o  n 
 hty y  seo     e t h

 
hed
 
don t   
   h h  ttml y    
  h    t tT ooa d  e   ar  ttlety      a    l

In [0]:
! cp char_songs100_config.json ./gdrive/'My Drive'/NLPProject/
! cp char_songs100_vocab.json ./gdrive/'My Drive'/NLPProject/
! cp char_songs100_weights.hdf5 ./gdrive/'My Drive'/NLPProject/

! cp char_songs500_config.json ./gdrive/'My Drive'/NLPProject/
! cp char_songs500_vocab.json ./gdrive/'My Drive'/NLPProject/
! cp char_songs500_weights.hdf5 ./gdrive/'My Drive'/NLPProject/

! cp char_songs1000_config.json ./gdrive/'My Drive'/NLPProject/
! cp char_songs1000_vocab.json ./gdrive/'My Drive'/NLPProject/
! cp char_songs1000_weights.hdf5 ./gdrive/'My Drive'/NLPProject/

! cp char_songs2500_config.json ./gdrive/'My Drive'/NLPProject/
! cp char_songs2500_vocab.json ./gdrive/'My Drive'/NLPProject/
! cp char_songs2500_weights.hdf5 ./gdrive/'My Drive'/NLPProject/

## Word-Level RNNs

In [0]:
# model configuration
# Architecture settings for the word-level runs (replaces the earlier
# character-level model_cfg).
model_cfg = dict(
    rnn_size=128,            # cells per recurrent layer
    rnn_layers=4,            # number of stacked recurrent layers
    rnn_bidirectional=True,  # use bidirectional LSTMs
    max_length=10,           # presumably the context window in tokens -- see textgenrnn docs
    max_words=10000,         # vocabulary cap
    dim_embeddings=100,      # embedding dimensionality
    word_level=True,         # True => word-level model
)

# training configuration
# Run settings for the word-level runs (replaces the earlier train_cfg).
train_cfg = dict(
    line_delimited=False,    # False => train with train_from_largetext_file
    num_epochs=50,           # total training epochs
    gen_epochs=5,            # presumably: print sample text every 5 epochs -- confirm
    batch_size=512,
    train_size=0.8,          # presumably the training fraction, rest held out -- confirm
    dropout=0.25,
    max_gen_length=80,       # cap on the length of each printed sample
    validation=True,
    is_csv=False,
)

### Song Lyrics

In [0]:
# Train one word-level model per corpus size. Each model is named
# 'word_songs<N>'; the copy cell that follows relies on that name as the
# prefix of the saved config/vocab/weights files.
for file, n in zip(files, nums):
  print('Training with', str(n), 'songs from', file)
  model_name = 'word_songs{}'.format(n)
  textgen = textgenrnn(name=model_name)

  # Pick the textgenrnn entry point that matches the corpus layout.
  if train_cfg['line_delimited']:
    train_function = textgen.train_from_file
  else:
    train_function = textgen.train_from_largetext_file

  train_function(
      file_path=file,
      new_model=True,
      # training settings
      num_epochs=train_cfg['num_epochs'],
      gen_epochs=train_cfg['gen_epochs'],
      batch_size=train_cfg['batch_size'],
      train_size=train_cfg['train_size'],
      dropout=train_cfg['dropout'],
      max_gen_length=train_cfg['max_gen_length'],
      validation=train_cfg['validation'],
      is_csv=train_cfg['is_csv'],
      # model settings
      rnn_layers=model_cfg['rnn_layers'],
      rnn_size=model_cfg['rnn_size'],
      rnn_bidirectional=model_cfg['rnn_bidirectional'],
      max_length=model_cfg['max_length'],
      # max_words is set in model_cfg but was never forwarded, so the
      # configured vocabulary cap was silently ignored.
      max_words=model_cfg['max_words'],
      dim_embeddings=model_cfg['dim_embeddings'],
      word_level=model_cfg['word_level'])

In [0]:
! cp word_songs100_config.json ./gdrive/'My Drive'/NLPProject/
! cp word_songs100_vocab.json ./gdrive/'My Drive'/NLPProject/
! cp word_songs100_weights.hdf5 ./gdrive/'My Drive'/NLPProject/

! cp word_songs500_config.json ./gdrive/'My Drive'/NLPProject/
! cp word_songs500_vocab.json ./gdrive/'My Drive'/NLPProject/
! cp word_songs500_weights.hdf5 ./gdrive/'My Drive'/NLPProject/

! cp word_songs1000_config.json ./gdrive/'My Drive'/NLPProject/
! cp word_songs1000_vocab.json ./gdrive/'My Drive'/NLPProject/
! cp word_songs1000_weights.hdf5 ./gdrive/'My Drive'/NLPProject/

! cp word_songs2500_config.json ./gdrive/'My Drive'/NLPProject/
! cp word_songs2500_vocab.json ./gdrive/'My Drive'/NLPProject/
! cp word_songs2500_weights.hdf5 ./gdrive/'My Drive'/NLPProject/