In [1]:
import torch
import torch.nn as nn
import random

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
class RNN_base(nn.Module):
    """Single-layer recurrent network written with explicit weight matrices.

    At every time step the hidden state is updated as
    ``h_t = activation(x_t @ W_xh + h_{t-1} @ W_hh + b_h)`` and an output
    ``o_t = h_t @ W_hq + b_q`` is produced.
    """

    def __init__(self, input_size, hidden_size, output_size, activation):
        """Create the parameters of the network.

        Args:
            input_size: Number of features of each input vector.
            hidden_size: Number of units in the hidden state.
            output_size: Number of features of each output vector.
            activation: Callable applied to the hidden pre-activation
                (e.g. ``nn.Tanh()``).
        """
        super().__init__()
        # Every parameter is drawn from N(0, sigma^2) so the network starts
        # close to the linear regime of the activation.
        sigma = 0.01
        self.W_xh = nn.Parameter(torch.randn(input_size, hidden_size) * sigma)
        self.W_hh = nn.Parameter(torch.randn(hidden_size, hidden_size) * sigma)
        self.b_h = nn.Parameter(torch.randn(hidden_size) * sigma)
        self.activation = activation
        self.W_hq = nn.Parameter(torch.randn(hidden_size, output_size) * sigma)
        # The output bias is scaled like every other parameter; left unscaled
        # it would dominate the initial outputs.
        self.b_q = nn.Parameter(torch.randn(output_size) * sigma)

    def forward(self, X, state=None):
        """Run the network over a whole sequence.

        Args:
            X: Input of shape (seq_len, batch_size, input_size).
            state: Optional initial hidden state of shape
                (batch_size, hidden_size). Defaults to zeros.

        Returns:
            A tuple ``(outputs, states)`` of two lists with one entry per time
            step: outputs of shape (batch_size, output_size) and hidden states
            of shape (batch_size, hidden_size).
        """
        if state is None:
            # new_zeros inherits both device and dtype from X, so the default
            # state also works when the module has been cast (e.g. .double()).
            state = X.new_zeros(X.shape[1], self.W_hh.shape[0])
        outputs = []
        states = []
        for Xt in X:  # iterate over the time dimension
            state = self.activation(Xt @ self.W_xh + state @ self.W_hh + self.b_h)
            states.append(state)
            outputs.append(state @ self.W_hq + self.b_q)
        return outputs, states

In [43]:
# Smoke test: push a constant all-ones sequence through a freshly built RNN
# and confirm that one output is produced per time step.
batch_size = 2
num_inputs = 16
num_hiddens = 32
num_steps = 100
X = torch.ones(num_steps, batch_size, num_inputs)
RNN_base1 = RNN_base(
    input_size=X.shape[2],
    hidden_size=num_hiddens,
    output_size=1,
    activation=nn.Tanh(),
)
outputs, states = RNN_base1(X)
print(len(outputs))

100


In [None]:
# Load the whole corpus into memory as one string.
# The encoding is spelled out so the read does not depend on the platform's
# locale default.
with open('tinyshakespeare.txt', 'r', encoding='utf-8') as f:
    # Lowercasing shrinks the vocabulary, which makes training easier.
    ds_text = f.read().lower()
print("dataset size:", len(ds_text))


dataset size: 1115394


In [59]:
# Character-level vocabulary: every distinct character of the corpus, sorted so
# that the id assigned to each character is stable from run to run.
vocab = sorted(set(ds_text))
print(f"Vocabulary size: {len(vocab)}")
# Two lookup tables that are inverses of each other: id -> character and character -> id.
idx_to_char = dict(enumerate(vocab))
char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}

Vocabulary size: 39


In [66]:
# Encode the corpus as a 1-D tensor holding the vocabulary id of every character.
corpus_indices = torch.tensor(list(map(char_to_idx.__getitem__, ds_text)))
corpus_indices

1115394

In [None]:
# One-hot encode the corpus: row i is all zeros except for a 1 in the column
# given by the vocabulary id of character i.
# A single vectorized indexed assignment replaces one scalar tensor write per
# character, which is very slow in a Python loop for a corpus of this size.
char_ids = torch.tensor([char_to_idx[char] for char in ds_text], dtype=torch.long)
encoded_text = torch.zeros(len(ds_text), len(vocab))
encoded_text[torch.arange(len(ds_text)), char_ids] = 1
encoded_text

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])

In [76]:
class DataLoader():
    """Iterate over a character corpus in mini-batches of fixed-length sequences.

    The corpus is cut into non-overlapping windows of ``seq_length`` steps.
    For a window starting at position ``j`` the input is
    ``encoded_text[j : j + seq_length]`` and the target is
    ``corpus_indices[j + 1 : j + seq_length + 1]``, i.e. the same window shifted
    by one position.
    """

    def __init__(self, corpus_indices, encoded_text, seq_length, batch_size, shuffle=True):
        """Store the data and the batching configuration.

        Args:
            corpus_indices: Tensor of target ids, indexed along dim 0 with the
                same positions as ``encoded_text``.
            encoded_text: Tensor of input encodings, one row per position.
            seq_length: Number of time steps per example; must be >= 1.
            batch_size: Number of examples per batch; must be >= 1.
            shuffle: When True (the default) the order of the examples is
                reshuffled with ``random.shuffle`` at the start of every
                iteration. Pass False for a deterministic, in-order pass.

        Raises:
            ValueError: If ``seq_length`` or ``batch_size`` is smaller than 1.
        """
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.corpus_indices = corpus_indices
        self.encoded_text = encoded_text
        self.seq_length = seq_length
        self.batch_size = batch_size
        self.shuffle = shuffle

    def _num_examples(self):
        """Return how many full windows fit in the corpus."""
        # One position is held back so the last window still has a
        # next-character target.
        return max((self.encoded_text.shape[0] - 1) // self.seq_length, 0)

    def __len__(self):
        """Return the number of full batches produced by one iteration."""
        return self._num_examples() // self.batch_size

    def __iter__(self):
        """Yield ``(X, Y)`` batches.

        ``X`` has shape (seq_length, batch_size, ...) where the trailing
        dimensions are those of ``encoded_text`` rows, and ``Y`` has shape
        (seq_length, batch_size). A final batch with fewer than ``batch_size``
        examples is dropped.
        """
        num_examples = self._num_examples()
        # Starting position of every window.
        example_indices = list(range(0, num_examples * self.seq_length, self.seq_length))
        if self.shuffle:
            random.shuffle(example_indices)

        # Stop before the trailing incomplete batch instead of building and
        # discarding it.
        num_batched = len(self) * self.batch_size
        for i in range(0, num_batched, self.batch_size):
            batch_indices = example_indices[i : i + self.batch_size]

            X = torch.stack([self.encoded_text[j : j + self.seq_length] for j in batch_indices])
            Y = torch.stack([self.corpus_indices[j + 1 : j + self.seq_length + 1] for j in batch_indices])

            # Put the time dimension first: (batch, seq, ...) -> (seq, batch, ...)
            yield X.transpose(0, 1), Y.transpose(0, 1)