In [None]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
#List of all possible characters
CHARS = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" \
        + "!\"#$%&\'()*+,-./:;—<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"

#Raw characters of the corpus, one list element per character.
#Every line is stripped of surrounding whitespace and terminated by a single '\n'.
corpus = []
#Decode explicitly instead of relying on the platform default encoding: CHARS contains
#the non-ASCII em dash, which a non-UTF-8 default (e.g. cp1252) would decode differently.
#NOTE(review): assumes shakespeare.txt is UTF-8 encoded - confirm against the data file.
with open('shakespeare.txt', 'r', encoding='utf-8') as f:
    for line in f:
        #A str is an iterable of characters, so extend() adds them one by one
        corpus.extend(line.strip())
        corpus.append('\n')

print("Total number of characters:", len(corpus))
print("\n\n")
print("First 100 characters:\n")
print(corpus[:100])

In [None]:
#Forward lookup: character -> integer index (its position in CHARS)
char2idx = dict(zip(CHARS, range(len(CHARS))))
#Reverse lookup: integer index -> character
idx2char = dict(enumerate(CHARS))

#Vocabulary size, i.e. the number of distinct characters the model can emit
NUM_CHARS = len(char2idx)
print("Total number of distinct chars:", NUM_CHARS)

In [None]:
#Corpus but with indices as opposed to characters
corpus_with_indices = [char2idx[char] for char in corpus]

print("Corpus with indices:")
print(corpus_with_indices[:100])

#Length (in characters) of each random window cut from the corpus
SIZE_OF_SNIPPET = 250
#Number of random windows to draw
NUM_SNIPPETS = 2000

#Valid window starts are 0 .. len - SIZE_OF_SNIPPET inclusive
num_starts = len(corpus_with_indices) - SIZE_OF_SNIPPET + 1
if num_starts < 1:
    raise ValueError(
        "Corpus has {} characters but SIZE_OF_SNIPPET is {}".format(
            len(corpus_with_indices), SIZE_OF_SNIPPET
        )
    )

#Dataset will contain NUM_SNIPPETS (input, target) pairs. Each pair comes from one
#SIZE_OF_SNIPPET-character window: the input is the window without its last character,
#the target is the window without its first character (i.e. the input shifted by one),
#so both tensors hold SIZE_OF_SNIPPET - 1 indices.
dataset = []
for _ in range(NUM_SNIPPETS):

    #np.random.randint excludes its upper bound, so passing num_starts makes the
    #final window (the one ending on the last corpus character) reachable too
    snippet_start = np.random.randint(0, num_starts)
    snippet = corpus_with_indices[snippet_start:snippet_start + SIZE_OF_SNIPPET]

    dataset.append((
        torch.LongTensor(snippet[:-1]),
        torch.LongTensor(snippet[1:])
    ))

print("\nSize of dataset:", len(dataset))

#Stack into (NUM_SNIPPETS, SIZE_OF_SNIPPET - 1) batch-first tensors
X = torch.stack([xy[0] for xy in dataset])
Y = torch.stack([xy[1] for xy in dataset])

In [None]:
#Define model
class ShakespeareGenerator(nn.Module):
    """Character-level language model: embedding -> single-layer LSTM -> linear.

    The vocabulary size is read from the module-level ``NUM_CHARS``; ``generate``
    additionally uses the module-level ``char2idx`` / ``idx2char`` lookups.
    """

    def __init__(self, embedding_size, hidden_size):
        """Build the embedding, LSTM and output projection.

        Args:
            embedding_size: dimension of the dense vector representing each character.
            hidden_size: dimension of the LSTM hidden and cell states.
        """

        super().__init__()

        #Size of embedding used to represent characters
        self.embedding_size = embedding_size

        #Size of hidden and cell state within LSTM
        self.hidden_size = hidden_size

        #Embedding module: Maps character indices to dense vector representations
        self.embedding = nn.Embedding(
            num_embeddings=NUM_CHARS,
            embedding_dim=self.embedding_size
        )

        #LSTM module to be used for character generation
        self.lstm = nn.LSTM(
            input_size=self.embedding_size,
            hidden_size=self.hidden_size
        )

        #Linear mapping to be used to go from LSTM outputs to character predictions
        self.linear = nn.Linear(
            in_features=self.hidden_size,
            out_features=NUM_CHARS
        )


    def forward(self, batched_inputs):
        """Compute next-character logits for every position of every sequence.

        Args:
            batched_inputs: LongTensor of character indices laid out time-major,
                i.e. with shape (seq_len, batch_size).

        Returns:
            Tuple ``(outputs, (h, c))`` where ``outputs`` holds unnormalised scores
            of shape (seq_len, batch_size, NUM_CHARS) and ``h`` / ``c`` are the final
            LSTM states, each of shape (1, batch_size, hidden_size).
        """
        #Number of character blocks to be considered simultaneously
        batch_size = batched_inputs.shape[1]
        #Hidden and cell state initialized to all zeros
        h, c = self.get_initial_hc(batch_size)

        #Embeddings from raw character inputs: (seq_len, batch_size, embedding_size),
        #which is already the layout nn.LSTM consumes with batch_first=False
        embeddings = self.embedding(batched_inputs)

        #Outputs and final state of LSTM after processing embeddings
        outputs, (h, c) = self.lstm(embeddings, (h, c))

        #nn.Linear acts on the last dimension only, so the (seq_len, batch_size)
        #leading axes are kept as-is. No squeeze here: squeezing would silently drop
        #the batch or time axis whenever it has size 1 and change the output rank.
        outputs = self.linear(outputs)

        #Return outputs and final state
        return outputs, (h, c)


    def get_initial_hc(self, batch_size):
        """Return zero-filled initial ``(h, c)`` states for the LSTM.

        Both tensors have shape (1, batch_size, hidden_size) and are created with the
        dtype and device of the LSTM weights, so they stay compatible with the model
        after ``.to(device)`` / ``.double()`` style conversions.
        """
        reference = self.lstm.weight_hh_l0
        state_shape = (1, batch_size, self.hidden_size)

        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))


    def generate(self, initial_token=' ', num_tokens=100, temperature=1):
        """Sample text from the model one character at a time.

        Args:
            initial_token: single character (a key of ``char2idx``) fed to the model
                first. It seeds the generation and is not part of the returned text.
            num_tokens: number of characters to sample.
            temperature: positive scaling factor; logits are divided by it before
                being turned into probabilities.

        Returns:
            A string made of exactly ``num_tokens`` sampled characters.

        Raises:
            ValueError: if ``temperature`` is not strictly positive.
            KeyError: if ``initial_token`` is not a key of ``char2idx``.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive, got {}".format(temperature))

        with torch.no_grad():

            #Index of current character initialized to initial character
            token = torch.tensor(
                [char2idx[initial_token]],
                dtype=torch.long,
                device=self.embedding.weight.device
            )
            #State of LSTM initialized to all zeros
            h, c = self.get_initial_hc(1)
            #To contain sampled characters in a list
            chars = []

            for _ in range(num_tokens):

                #Use embedding of current character as input
                inp = self.embedding(token)

                #Pass current embedding through LSTM and get output and new state
                out, (h, c) = self.lstm(inp.reshape(1, 1, self.embedding_size), (h, c))

                #Unnormalised scores for every possible next character
                logits = self.linear(out.reshape(1, -1)).view(-1)

                #Temperature controls the spread of the distribution. A high temperature
                #flattens it, raising the chance of less likely characters. A low
                #temperature sharpens it, making likely characters even more likely.
                #softmax is used instead of a bare exp() so that large logits divided
                #by a small temperature cannot overflow to inf.
                probs = torch.softmax(logits / temperature, dim=0)

                #Sample the next character; the shape-(1,) result is fed back as input
                token = torch.multinomial(probs, 1)

                #Record every sampled character, including the last one
                chars.append(idx2char[token.item()])

            #Join elements of list into single string
            return ''.join(chars)


In [None]:
#Training this model from scratch takes a long time,
#so for the demo we load pretrained weights instead.
EPOCHS = 500
LR = 0.1
#Momentum coefficient passed to SGD
BETA = 0.8
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 64

USE_PRETRAINED = True

net = ShakespeareGenerator(EMBEDDING_SIZE, HIDDEN_SIZE).float()

#Cross-entropy between predicted next-character scores and the true next characters
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=BETA)

if not USE_PRETRAINED:
    #Full-batch training: every epoch runs the whole dataset through the network once
    for _ in range(EPOCHS):

        #X is batch-first; the network takes time-major input, hence the transpose
        logits, _ = net(X.transpose(0, 1))
        #Back to batch-first so the flattened predictions line up with flattened Y
        logits = logits.transpose(0, 1)

        loss = criterion(logits.reshape(-1, NUM_CHARS), Y.reshape(-1))

        print(loss.item())
        net.zero_grad()
        loss.backward()
        optimizer.step()

else:
    #map_location='cpu' keeps every tensor on the CPU regardless of where it was saved.
    #NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
    net.load_state_dict(torch.load('shakespeare.pt', map_location='cpu'))
    

In [None]:
#Temperature=1: characters are sampled from the predicted distribution as-is (no rescaling).
print(net.generate(num_tokens=1000, temperature=1))

In [None]:
#Temperature=1.5: scores are divided by 1.5, which flattens the distribution,
#so less likely characters are sampled more often (more varied, noisier text).
print(net.generate(num_tokens=1000, temperature=1.5))

In [None]:
#Temperature=0.25: scores are divided by 0.25, which sharpens the distribution,
#so the most likely characters are sampled even more often (more conservative text).
print(net.generate(num_tokens=1000, temperature=0.25))