This project uses recurrent neural networks (LSTMs and GRUs) and PyTorch's sequence-to-sequence capabilities for text prediction.

In [2]:
import unidecode
import string
import random
import re
import pdb
 
# Vocabulary: every character in string.printable. Its length is later used as
# both the input and output size of the model.
all_characters = string.printable
n_characters = len(all_characters)
 
# Read the corpus inside a context manager so the file handle is closed as soon
# as the text is in memory, then pass the text through unidecode.
with open('./text_files/tiny_shakespeare.txt') as corpus_file:
    file = unidecode.unidecode(corpus_file.read())
file_len = len(file)
print('file_len =', file_len)

file_len = 1115394


In [4]:
# Display the character set that all_characters is taken from.
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [90]:
# Number of (input, target) character pairs per training sequence.
chunk_len = 200

#Code for getting a random chunk from a specific text
def random_chunk():
    """Return a random slice of ``file`` that is ``chunk_len + 1`` characters long.

    The extra character lets the caller derive an input sequence
    (all but the last character) and a target sequence (all but the first),
    each exactly ``chunk_len`` long.

    Raises:
        ValueError: (from ``random.randint``) if the corpus is not longer
            than ``chunk_len`` characters.
    """
    # random.randint includes BOTH endpoints. The slice needs chunk_len + 1
    # characters, so the last valid start is file_len - chunk_len - 1. One
    # position further and the slice would run off the end of the corpus and
    # come back one character short.
    start_index = random.randint(0, file_len - chunk_len - 1)
    end_index = start_index + chunk_len + 1
    return file[start_index:end_index]
 
print(random_chunk())

 so she yields to me;
For I am rough and woo not like a babe.

BAPTISTA:
Well mayst thou woo, and happy be thy speed!
But be thou arm'd for some unhappy words.

PETRUCHIO:
Ay, to the proof; as mountain


In [91]:
import torch
from torch.autograd import Variable

# Encode a string as a 1-D long tensor of character indices
def char_tensor(string):
    """Map every character of ``string`` to its position in ``all_characters``.

    Returns a 1-D long tensor (wrapped in ``Variable``) with one index per
    character. ``str.index`` raises ValueError for a character that is not
    part of ``all_characters``.
    """
    encoded = torch.zeros(len(string)).long()
    for position, symbol in enumerate(string):
        encoded[position] = all_characters.index(symbol)
    return Variable(encoded)
 
print(char_tensor('abcDEF'))

tensor([10, 11, 12, 39, 40, 41])


In [92]:
# Draw one random (input, target) training pair from the corpus
def random_training_set():
    """Return ``(inp, target)`` index tensors built from one random chunk.

    ``inp`` encodes every character of the chunk except the last and
    ``target`` encodes every character except the first, so ``target[i]`` is
    the character that follows ``inp[i]`` in the text.
    """
    text = random_chunk()
    return char_tensor(text[:-1]), char_tensor(text[1:])

In [93]:
def evaluate(prime_str='A', predict_len=100, temperature=0.8):
    """Generate text from the global ``decoder``, seeded with ``prime_str``.

    Args:
        prime_str: Non-empty seed text. All but its last character warm up
            the hidden state; the last character is the first model input.
        predict_len: Number of characters to sample and append.
        temperature: Positive divisor applied to the model output before it
            is exponentiated into sampling weights. Smaller values sharpen
            the distribution, larger values flatten it.

    Returns:
        ``prime_str`` followed by ``predict_len`` sampled characters.

    Raises:
        ValueError: if ``prime_str`` is empty or ``temperature`` is not
            positive.
    """
    if not prime_str:
        raise ValueError('prime_str must contain at least one character')
    if temperature <= 0:
        raise ValueError('temperature must be positive')

    predicted = prime_str

    # Generation never calls backward(), so disable autograd: no graph is
    # accumulated through the ever-growing hidden-state chain.
    with torch.no_grad():
        hidden = decoder.init_hidden()
        prime_input = char_tensor(prime_str)

        # Use priming string to "build up" hidden state
        for p in range(len(prime_str) - 1):
            _, hidden = decoder(prime_input[p], hidden)
        inp = prime_input[-1]

        for _ in range(predict_len):
            output, hidden = decoder(inp, hidden)

            # Scale the model output by the temperature and exponentiate to
            # get (unnormalised) weights for multinomial sampling.
            output_dist = output.view(-1).div(temperature).exp()
            top_i = torch.multinomial(output_dist, 1)[0]

            # Append the sampled character and feed its index back in as the
            # next input.
            predicted += all_characters[top_i.item()]
            inp = top_i

    return predicted

In [94]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
 
#The RNN model 
class RNN(nn.Module):
    """Character-level GRU model: embedding -> ReLU -> GRU -> linear -> log-softmax.

    The model is stepped one symbol at a time: ``forward`` takes a single
    character index plus the previous hidden state and returns the
    log-probabilities for the next character together with the new hidden
    state.

    Args:
        input_size: Number of distinct input symbols (rows of the embedding).
        hidden_size: Width of the embedding and of the GRU hidden state.
        output_size: Number of output classes.
        n_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        # The embedding table is indexed by input symbols, so it must have
        # input_size rows (not output_size).
        self.embedding = nn.Embedding(input_size, hidden_size)
        # Pass n_layers through so the GRU depth matches the hidden state
        # created by init_hidden(); otherwise n_layers > 1 fails at run time.
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_char, hidden):
        """Advance the model by one character.

        Args:
            input_char: Tensor holding one character index.
            hidden: Hidden state of shape ``(n_layers, 1, hidden_size)``.

        Returns:
            ``(output, hidden)`` where ``output`` has shape
            ``(1, output_size)`` and holds log-probabilities, and ``hidden``
            is the updated hidden state.
        """
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size) for the GRU.
        output = self.embedding(input_char).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        # output[0] drops the length-1 sequence axis -> (1, hidden_size).
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape ``(n_layers, 1, hidden_size)``."""
        return torch.zeros(self.n_layers, 1, self.hidden_size)
    
        
def train(inp, target):
    """Run one optimisation step of the global ``decoder`` on one sequence.

    Args:
        inp: 1-D tensor of input character indices.
        target: 1-D tensor of the same length, where ``target[i]`` is the
            character the model should predict after seeing ``inp[i]``.

    Returns:
        The mean per-character loss for this sequence, as a Python float.

    Raises:
        ValueError: if ``inp`` is empty or ``inp`` and ``target`` differ in
            length.
    """
    # Use the real sequence length rather than the global chunk_len, so the
    # loop bound and the normaliser always match the data that was passed in.
    seq_len = len(inp)
    if seq_len == 0 or len(target) != seq_len:
        raise ValueError('inp and target must be non-empty and of equal length')

    decoder_optimizer.zero_grad()
    hidden = decoder.init_hidden()

    # Accumulate the loss over every character, carrying the hidden state
    # forward from one step to the next.
    loss = 0
    for input_char, target_char in zip(inp, target):
        output, hidden = decoder(input_char, hidden)
        loss += criterion(output, target_char.unsqueeze(0))

    # Backpropagate through the whole sequence, then update the (global) optimizer.
    loss.backward()
    decoder_optimizer.step()
    return loss.item() / seq_len
    
    
import time
# Training configuration.
# Note: an "epoch" below is one optimisation step on a single random chunk,
# not a full pass over the corpus.
n_epochs = 2000
print_every = 100  # print progress and a generated sample every this many steps
plot_every = 10    # record one averaged loss value every this many steps
hidden_size = 100
n_layers = 1
lr = 0.005
 
#Make the decoder, its optimizer, and loss function
# Input and output sizes are both n_characters: the model reads a character
# index and predicts a distribution over the same character set.
decoder = RNN(n_characters, hidden_size, n_characters, n_layers)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=lr)
# NOTE(review): RNN.forward already ends in LogSoftmax, and CrossEntropyLoss
# applies log-softmax again. nn.NLLLoss would be the matching criterion - confirm.
criterion = nn.CrossEntropyLoss()
 
start = time.time()
all_losses = []  # one entry per plot_every steps: mean loss over that window
loss_avg = 0     # running sum of losses since the last entry was recorded
 
#Run the training loop for n_epochs steps
for epoch in range(1, n_epochs + 1):
    # One step: sample a fresh random (input, target) pair and train on it.
    loss_ = train(*random_training_set())       
    loss_avg += loss_
     
    #Every print_every steps, report progress and show a generated sample
    # Printed fields: elapsed seconds, step number, percent complete, and the
    # loss of the most recent step (not the running average).
    if epoch % print_every == 0:
        print('[%s (%d %d%%) %.4f]' % (time.time() - start, epoch, epoch / n_epochs * 100, loss_))
        print(evaluate('Wh', 100), '\n')
     
    #Every plot_every steps, store the mean loss of the window and reset the sum
    if epoch % plot_every == 0:
        all_losses.append(loss_avg / plot_every)
        loss_avg = 0 


[12.037445306777954 (100 5%) 2.5389]
Whit in y oth
 thoud noad nour the heast men the thisal Hof dond Eifess, darthen is cbertheseseses you 

[22.40361452102661 (200 10%) 2.1421]
Wh four in wour hee nougler?
Foort I dint so; you put soucs as het net forpis the ame sing; ous my thi 

[34.800031423568726 (300 15%) 2.0377]
Whe the
To thnouss stened, to cil' ond and movere.
un Ment in susle dat the the leangese tued morires  

[49.21166801452637 (400 20%) 2.0454]
Whave to Cord well conos, is thou the thent
And thent ear my so there and shie call felld semut wich t 

[63.78067445755005 (500 25%) 1.9059]
Whe to so me to but the a neet?

ORIANIES:
Mace for lore my fards I dave spire sey!

LUMETSE:
Ke has s 

[79.25118589401245 (600 30%) 2.0035]
Whe in his ners is your me and that in as still he rearringardy.


CHAROLO:
Riot him recas in hour he  

[93.5791826248169 (700 35%) 1.8882]
What greand of the so ever haugrey
What wo dreas me degreast be there yee chood go not my goress, shmo 

[108.3