In [1]:
# Mount Google Drive inside the Colab runtime so that the notebook can read
# the lyrics corpus stored there
from google.colab import drive

# Mounting prompts for an authorization code; on success the Drive contents
# are exposed under /content/gdrive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [1]:
# Sanity check: list the project folder on the mounted Drive to confirm that
# the corpus file (dylan_corpus.txt) is reachable before trying to load it
!ls '/content/gdrive/My Drive/Dylan Lyrics Generator'

 dylan_corpus.txt  'Dylan Lyrics Generator.ipynb'


In [2]:
# Install PyTorch and torchtext into the Colab runtime
# NOTE(review): versions are not pinned, so a re-run may install a different
# PyTorch than the one this notebook was executed with; torchtext is not
# imported by any of the visible cells -- confirm it is still needed
!pip3 install torch torchtext



In [3]:
# Environment check: report, in order, whether CUDA is usable, which cuDNN
# build PyTorch sees, and the installed PyTorch version
import torch

cuda_ok = torch.cuda.is_available()
print(cuda_ok)

cudnn_build = torch.backends.cudnn.version()
print(cudnn_build)

print(torch.__version__)

True
7603
1.4.0


In [0]:
# Importing packages
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from collections import Counter
import os
import string
from argparse import Namespace

# Central configuration for the notebook: where the corpus lives, how it is
# batched, the model dimensions, and a few sampling/checkpoint defaults
flags = Namespace()

# Plain-text lyrics corpus on the mounted Google Drive
flags.train_file = '/content/gdrive/My Drive/Dylan Lyrics Generator/dylan_corpus.txt'

# Batching: tokens per training window, and number of windows per mini-batch
flags.seq_size = 100
flags.batch_size = 128

# Width of the word embeddings and of the LSTM hidden/cell state
flags.embedding_size = 256
flags.lstm_size = 256

# Maximum gradient norm passed to gradient clipping during training
flags.gradients_norm = 5

# Sampling and checkpoint defaults
# NOTE(review): these three are not referenced by the visible cells
flags.initial_words = ['well', 'i']
flags.predict_top_k = 5
flags.checkpoint_path = 'checkpoint'

In [0]:
# Load and process text data
def get_data_from_file(train_file, batch_size, seq_size):
  """Read a text corpus and build the vocabulary plus input/target index matrices.

  The text is lowercased and split on whitespace. Tokens that consist only of
  punctuation are dropped, and the characters , . ! ? : are removed from the
  remaining tokens. Tokens that end up empty after that removal (for example
  "..." or "?!") are dropped as well, so the empty string never becomes a
  vocabulary entry.

  Args:
    train_file: Path of the plain-text corpus.
    batch_size: Number of rows of the returned matrices.
    seq_size: Number of tokens per training window. Together with batch_size
      it determines how many trailing tokens are discarded so that the data
      divides evenly into mini-batches.

  Returns:
    A tuple (int_to_vocab, vocab_to_int, n_vocab, in_text, out_text):
      int_to_vocab: dict mapping integer index -> word; index 0 is the most
        frequent word.
      vocab_to_int: dict mapping word -> integer index (inverse of the above).
      n_vocab: number of distinct words.
      in_text: integer array with batch_size rows holding the word indices.
      out_text: array of the same shape holding, for every position, the
        index of the following word; the very last position wraps around to
        the first word of the corpus.

  Raises:
    ValueError: if the corpus has fewer than batch_size * seq_size tokens, so
      not even one full mini-batch can be formed.
  """
  # Read data from file
  with open(train_file, 'r') as f:
    raw_text = f.read()

  # Lowercase, tokenize on whitespace, skip punctuation-only tokens, and
  # remove , . ! ? : from the rest
  strip_table = str.maketrans('', '', ',.!?:')
  stripped = (tok.translate(strip_table)
              for tok in raw_text.lower().split()
              if tok not in string.punctuation)
  # Multi-character punctuation such as "..." passes the filter above but is
  # empty once stripped; discard it instead of learning '' as a word
  tokens = [tok for tok in stripped if tok]

  # Create two dictionaries:
  # 1) int_to_vocab, which converts integer indices back to word tokens
  # 2) vocab_to_int, which converts word tokens into integer indices
  word_counts = Counter(tokens)
  sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
  int_to_vocab = dict(enumerate(sorted_vocab))
  vocab_to_int = {w: k for k, w in int_to_vocab.items()}
  n_vocab = len(int_to_vocab)

  # Convert word tokens into integer indices
  int_text = [vocab_to_int[w] for w in tokens]
  tokens_per_batch = seq_size * batch_size
  num_batches = len(int_text) // tokens_per_batch
  if num_batches == 0:
    raise ValueError(
        'Corpus has {} tokens but at least {} (batch_size * seq_size) are '
        'needed to form one mini-batch'.format(len(int_text), tokens_per_batch))

  # Drop trailing tokens so the data divides evenly into mini-batches
  in_text = np.asarray(int_text[:num_batches * tokens_per_batch])

  # Target of each input word is the word that follows it; rolling by -1
  # shifts everything one step left and wraps the first word to the end
  out_text = np.roll(in_text, -1)

  # Reshape into matrices with batch_size rows, suitable for LSTM training
  in_text = np.reshape(in_text, (batch_size, -1))
  out_text = np.reshape(out_text, (batch_size, -1))

  return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text


# Partition data into batches, yielding iterable generators
def get_batches(in_text, out_text, batch_size, seq_size):
  """Yield successive (input, target) mini-batches cut column-wise.

  Each yielded pair holds the same seq_size-wide column window taken from
  in_text and out_text. The number of windows is the total element count of
  in_text divided (rounding down) by seq_size * batch_size.
  """
  window_count = np.prod(in_text.shape) // (seq_size * batch_size)

  for start in range(0, window_count * seq_size, seq_size):
    columns = slice(start, start + seq_size)
    yield in_text[:, columns], out_text[:, columns]

In [6]:
# Build the vocabulary and the training matrices, then print a short summary
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(
    train_file=flags.train_file,
    batch_size=flags.batch_size,
    seq_size=flags.seq_size,
)

# Vocabulary size, followed by the shapes of the input and target matrices
for template, value in (('Vocabulary Size: {}', n_vocab),
                        ('in_text Shape: {}', in_text.shape),
                        ('out_text Shape: {}', out_text.shape)):
  print(template.format(value))

# Top-left 10x10 corner of the inputs, then of the targets
for matrix in (in_text, out_text):
  print(matrix[:10, :10])

# Spot-check ten randomly drawn vocabulary entries
random_ids = np.random.randint(0, n_vocab, 10)
print('10 Random Words In Vocab: {}'.format([int_to_vocab[i] for i in random_ids]))

Vocabulary Size: 8781
in_text Shape: (128, 1300)
out_text Shape: (128, 1300)
[[  31    3   26   32  211    3   59    1   21    3]
 [ 278   13  852 1842    2 5582    2  950   19   87]
 [  25 4186    9   61   19  142  179  111 2280 2017]
 [  17 3442   68    0 1856   40 1262   68  255   17]
 [   7    0   65   16   19    0  141   67   15   19]
 [  33   62    0 1717    4 4253    2   11    0  337]
 [ 108   12   65   77 5754  544   76 5755   41    1]
 [   3  214  127   55   17   78   15  302   22    1]
 [ 966    5  881 2078  329  143 1899   46   37 1735]
 [ 116 1480   25  680   74  231   44   14    9    4]]
[[   3   26   32  211    3   59    1   21    3   55]
 [  13  852 1842    2 5582    2  950   19   87 2548]
 [4186    9   61   19  142  179  111 2280 2017    6]
 [3442   68    0 1856   40 1262   68  255   17    0]
 [   0   65   16   19    0  141   67   15   19  156]
 [  62    0 1717    4 4253    2   11    0  337   11]
 [  12   65   77 5754  544   76 5755   41    1  371]
 [ 214  127   55   17

In [0]:
class RNNModule(nn.Module):
  """Word-level language model: embedding -> two-layer LSTM -> linear layer.

  The LSTM is batch-first and uses p=0.2 dropout between its two layers; the
  final linear layer produces one score per vocabulary entry.
  """

  def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
    """Create the layers.

    Args:
      n_vocab: Vocabulary size (rows of the embedding, width of the output).
      seq_size: Training window length; stored on the instance only.
      embedding_size: Width of each word embedding.
      lstm_size: Width of the LSTM hidden and cell states.
    """
    super(RNNModule, self).__init__()
    self.seq_size = seq_size
    self.embedding_size = embedding_size
    self.lstm_size = lstm_size

    self.embedding = nn.Embedding(n_vocab, embedding_size)
    self.lstm = nn.LSTM(embedding_size, lstm_size, batch_first=True, num_layers=2, dropout=0.2)
    self.dense = nn.Linear(lstm_size, n_vocab)

  def forward(self, x, prev_state):
    """Run one forward pass.

    Args:
      x: Integer word indices, batch dimension first.
      prev_state: Tuple (hidden, cell) carried over from the previous call.

    Returns:
      (output, state): per-position vocabulary scores and the new
      (hidden, cell) tuple.
    """
    embed = self.embedding(x)
    output, state = self.lstm(embed, prev_state)
    output = self.dense(output)

    return output, state

  def zero_state(self, batch_size):
    """Return fresh all-zero (hidden, cell) tensors for the given batch size.

    The tensors are created on the CPU; callers move them to the device the
    model lives on. The layer count is read from the LSTM itself so the shape
    always matches the architecture.
    """
    shape = (self.lstm.num_layers, batch_size, self.lstm_size)
    return (torch.zeros(shape), torch.zeros(shape))

  def predict(model, words, n_vocab, vocab_to_int, int_to_vocab, output_size, top_k=5):
    """Generate lyrics from seed words and print them as one line.

    Defined without `self`: the first parameter is the model, so it can be
    called as RNNModule.predict(model=model, ...).

    Args:
      model: Trained RNNModule; it is switched to evaluation mode and left
        in that mode.
      words: Non-empty sequence of seed words, each present in vocab_to_int.
        The sequence itself is not modified.
      n_vocab: Vocabulary size (kept for interface compatibility; unused).
      vocab_to_int: dict mapping word -> index.
      int_to_vocab: dict mapping index -> word.
      output_size: Target number of words in the printed text. At least one
        word is always generated after the seed.
      top_k: Each new word is drawn uniformly from the top_k highest-scoring
        candidates; values above the vocabulary size are clamped.

    Raises:
      ValueError: if words is empty.
      KeyError: if a seed word is not in vocab_to_int.
    """
    if not words:
      raise ValueError('words must contain at least one seed word')

    # Work on a copy so the caller's list (e.g. a shared config value) is
    # not extended with the generated words
    words = list(words)

    # Run on whatever device the model's parameters live on (GPU or CPU)
    device = next(model.parameters()).device

    # Setting to evaluation mode (disables dropout)
    model.eval()

    def pick_next(logits):
      # Indices of the k best-scoring words, one of which is chosen at random
      k = min(top_k, logits.shape[-1])
      _, top_ix = torch.topk(logits[0], k=k)
      return int(np.random.choice(top_ix.tolist()[0]))

    # Initialize hidden and cell state tensors and move them to the model's device
    state_h, state_c = model.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)

    # No gradients are needed for generation; this also keeps the carried
    # state from accumulating an autograd graph across steps
    with torch.no_grad():
      # Feed each seed word to the model to warm up the recurrent state
      for w in words:
        ix = torch.tensor([[vocab_to_int[w]]], device=device)
        output, (state_h, state_c) = model(ix, (state_h, state_c))

      # First generated word follows the last seed word
      choice = pick_next(output)
      words.append(int_to_vocab[choice])

      # Use previously chosen word as input for each subsequently generated word
      for _ in range(output_size - len(words)):
        ix = torch.tensor([[choice]], device=device)
        output, (state_h, state_c) = model(ix, (state_h, state_c))

        choice = pick_next(output)
        words.append(int_to_vocab[choice])

    print(' '.join(words))

In [0]:
# Load and process text data
int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(train_file=flags.train_file,
                                                                            batch_size=flags.batch_size,
                                                                            seq_size=flags.seq_size)

# Train on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Instantiate LSTM and transfer it to the training device
model = RNNModule(n_vocab=n_vocab, seq_size=flags.seq_size, embedding_size=flags.embedding_size, lstm_size=flags.lstm_size)
model = model.to(device)

# Hyperparameters
learning_rate = 1e-3
l2_reg_strength = 3e-4
num_epochs = 50

# Use cross entropy loss and Adam optimizer with L2 regularization (weight decay)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate, weight_decay=l2_reg_strength)

for epoch in range(num_epochs):
  # Set to training mode (enables dropout); once per epoch is enough because
  # nothing inside the batch loop switches the mode
  model.train()

  # Load data batches and start the epoch from zeroed hidden/cell states
  batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
  state_h, state_c = model.zero_state(flags.batch_size)
  state_h = state_h.to(device)
  state_c = state_c.to(device)

  # Running totals used to report the mean loss of this epoch
  epoch_loss = 0.0
  batch_count = 0

  for x, y in batches:
    # Reset all gradients
    optimizer.zero_grad()

    # Transfer data to the training device
    x = torch.tensor(x).to(device)
    y = torch.tensor(y).to(device)

    # Compute output and loss; CrossEntropyLoss expects the class dimension
    # second, hence the transpose of the model output
    output, (state_h, state_c) = model(x, (state_h, state_c))
    loss = loss_fn(output.transpose(1, 2), y)

    # Carry the state values into the next batch but cut them off from this
    # batch's graph, so the next backward pass does not reach back into it
    state_h = state_h.detach()
    state_c = state_c.detach()

    # Perform backpropagation
    loss.backward()

    # Gradient clipping guards against exploding gradients
    _ = torch.nn.utils.clip_grad_norm_(model.parameters(), flags.gradients_norm)

    # Update the network's parameters
    optimizer.step()

    epoch_loss += loss.item()
    batch_count += 1

  # Report progress so that divergence or stalled training is visible
  print('Epoch {}/{} - mean loss: {:.4f}'.format(epoch + 1, num_epochs, epoch_loss / max(batch_count, 1)))

In [11]:
# Generate 30 words of Dylan-like lyrics from the trained model, starting
# from a three-word seed and sampling each new word from the top 5 candidates
seed_words = ['tangled', 'up', 'in']
RNNModule.predict(model=model,
                  words=seed_words,
                  n_vocab=n_vocab,
                  vocab_to_int=vocab_to_int,
                  int_to_vocab=int_to_vocab,
                  output_size=30,
                  top_k=5)

tangled up in my man i don't do it and it don't know i ain't be to serve i know you have to be the way but you can't see
