In [6]:
import math
from torch import nn, Tensor
import torch

In [2]:
words = ['the','and','have', 'that', 'for', 'you', 'with', 'say', 'this', 'they', 'but', 'his', 'from', 'not', 'she', 'as', 'what', 'their', 'can', 'who']
print('words',words)

vocab = [chr(i) for i in range(ord('a'), ord('z')+1)]
vocab.append('0')
print('vocab',vocab)

letter_to_index = {letter: index for index, letter in enumerate(vocab)}
index_to_letter = {index: letter for index, letter in enumerate(vocab)}

word_to_index = {word: index for index, word in enumerate(words)}
index_to_word = {index: word for index, word in enumerate(words)}



#print(letter_to_index)
#print(index_to_letter)

#print(word_to_index)
#print(index_to_word)

N = len(words)
# includes special end of word character
V = len(vocab)
# includes end of word character
L = max(len(word) for word in words)+1

words ['the', 'and', 'have', 'that', 'for', 'you', 'with', 'say', 'this', 'they', 'but', 'his', 'from', 'not', 'she', 'as', 'what', 'their', 'can', 'who']
vocab ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0']


In [3]:
# https://github.com/KasperGroesLudvigsen/influenza_transformer/blob/main/positional_encoder.py
class PositionalEncoder(nn.Module):
    """
    The authors of the original transformer paper describe very succinctly what
    the positional encoding layer does and why it is needed:

    "Since our model contains no recurrence and no convolution, in order for the
    model to make use of the order of the sequence, we must inject some
    information about the relative or absolute position of the tokens in the
    sequence." (Vaswani et al, 2017)
    Adapted from:
    https://pytorch.org/tutorials/beginner/transformer_tutorial.html
    """

    def __init__(
        self,
        dropout: float=0.1,
        max_seq_len: int=5000,
        d_model: int=512,
        batch_first: bool=True
        ):

        """
        Parameters:
            dropout: the dropout rate
            max_seq_len: the maximum length of the input sequences
            d_model: The dimension of the output of sub-layers in the model
                     (Vaswani et al, 2017)
        """

        super().__init__()

        self.d_model = d_model

        self.dropout = nn.Dropout(p=dropout)

        self.batch_first = batch_first

        # adapted from PyTorch tutorial
        position = torch.arange(max_seq_len).unsqueeze(1)

        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))

        if self.batch_first:
            pe = torch.zeros(1, max_seq_len, d_model)

            pe[0, :, 0::2] = torch.sin(position * div_term)

            pe[0, :, 1::2] = torch.cos(position * div_term)
        else:
            pe = torch.zeros(max_seq_len, 1, d_model)

            pe[:, 0, 0::2] = torch.sin(position * div_term)

            pe[:, 0, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, enc_seq_len, dim_val] or
               [enc_seq_len, batch_size, dim_val]
        """
        if self.batch_first:
            x = x + self.pe[:,:x.size(1)]
        else:
            x = x + self.pe[:x.size(0)]

        return self.dropout(x)

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes, num_layers=1, max_seq_len=L):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.pos_encoder = PositionalEncoder(d_model = input_size, dropout=0.1, max_seq_len=max_seq_len)
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, num_layers,batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Initialize hidden state with zeros
        # h0 = torch.zeros(1, x.size(0), self.hidden_size).to(x.device)

        #h0 can be improved (h0 = x*toen_embedding)

        # Forward pass through RNN
        #out, _ = self.rnn(x, h0)
        x = x * math.sqrt(self.input_size)
        x = self.pos_encoder(x)
        out, _ = self.rnn(x)

        # Concatenate the output of RNN with y
        #out = torch.cat((out[:, -1, :], y.unsqueeze(1)), dim=1)
        # out = out[:, -1, :]

        # Pass the concatenated output through the fully connected layer
        out = self.fc(out)

        return out

In [8]:
# Define the dimensions
input_size = N
hidden_size = V
num_classes = V
num_layers = 1

# Create an instance of the RNN model
model = RNN(input_size, hidden_size, num_layers=num_layers, num_classes=num_classes)

# Print the model architecture
print(model)

# Prepare training set
X_train = []
Y_train = []

for word in words:

    chars = list(word)
    x = torch.zeros(1, L, N)
    y = torch.zeros(1, L)

    n = len(chars)
    for i in range(L):
        x[0, i, word_to_index[word]] = 1
        if i < n:
            y[0,i] = letter_to_index[chars[i]]
        else:
            y[0,i] = V-1

    X_train.append(x)
    Y_train.append(y)

# Convert the training set to tensors
X_train = torch.cat(X_train, dim=0)
Y_train = torch.cat(Y_train, dim=0)

# Print the shape of the training set
print("X_train shape:", X_train.shape)
print("Y_train shape:", Y_train.shape)
# prepare training set

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


# Train the model
num_epochs = 1000
batch_size = 5

for epoch in range(num_epochs):
    # Shuffle the training data
    indices = torch.randperm(X_train.size(0))
    X_train_shuffled = X_train[indices]
    Y_train_shuffled = Y_train[indices]

    # Split the training data into batches
    num_batches = X_train.size(0) // batch_size
    for batch_idx in range(num_batches):
        # Get the batch inputs and targets
        start_idx = batch_idx * batch_size
        end_idx = start_idx + batch_size
        batch_inputs = X_train_shuffled[start_idx:end_idx]
        batch_targets = Y_train_shuffled[start_idx:end_idx]

        # Forward pass
        outputs = model(batch_inputs)

        # Compute the loss
        loss = criterion(outputs.view(-1, num_classes), batch_targets.view(-1).long())

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the loss for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}")

RNN(
  (pos_encoder): PositionalEncoder(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (rnn): RNN(20, 27, batch_first=True)
  (fc): Linear(in_features=27, out_features=27, bias=True)
)
X_train shape: torch.Size([20, 6, 20])
Y_train shape: torch.Size([20, 6])
Epoch [1/1000], Loss: 3.172258138656616
Epoch [2/1000], Loss: 3.123988151550293
Epoch [3/1000], Loss: 2.985718011856079
Epoch [4/1000], Loss: 2.8663907051086426
Epoch [5/1000], Loss: 2.693377733230591
Epoch [6/1000], Loss: 2.7134902477264404
Epoch [7/1000], Loss: 2.45755672454834
Epoch [8/1000], Loss: 2.428528308868408
Epoch [9/1000], Loss: 2.397883415222168
Epoch [10/1000], Loss: 2.4141294956207275
Epoch [11/1000], Loss: 2.3476438522338867
Epoch [12/1000], Loss: 2.208467721939087
Epoch [13/1000], Loss: 2.197744607925415
Epoch [14/1000], Loss: 2.2037367820739746
Epoch [15/1000], Loss: 2.093555212020874
Epoch [16/1000], Loss: 1.9138263463974
Epoch [17/1000], Loss: 2.0887386798858643
Epoch [18/1000], Loss: 1.9434293508529663
Epo

In [None]:
# test on some inputs -- is perfect memory achieved

In [18]:
test_word_ids= list(range(0, N))
X_test = X_train[test_word_ids]

model.eval()
with torch.no_grad():
    outputs = model(X_test)

# Convert predictions to words
predicted_words = []
for output in outputs:
    predicted_word = ''
    for i in range(L):
        index = torch.argmax(output[i])
        if index == V-1:
            break
        predicted_word += index_to_letter[index.item()]
    predicted_words.append(predicted_word)


for i in range(len(test_word_ids)):
  w = words[test_word_ids[i]]
  wh = predicted_words[i]
  print(w,wh)

the the
and and
have have
that that
for for
you you
with with
say say
this this
they they
but but
his his
from from
not not
she she
as as
what what
their their
can can
who who


In [19]:
torch.save(model.state_dict(), "./detok_toy_trained_model.pth")

In [22]:
# Get the weights of the RNN model
rnn_weights = model.rnn.state_dict()

# Visualize the weights
for name, weight in rnn_weights.items():
    print(f"Layer: {name}")
    print()

# Visualize the weights
for name, weight in rnn_weights.items():
    print(f"Layer: {name}")
    print(weight.shape)
    print()


Layer: weight_ih_l0

Layer: weight_hh_l0

Layer: bias_ih_l0

Layer: bias_hh_l0

Layer: weight_ih_l0
torch.Size([27, 20])

Layer: weight_hh_l0
torch.Size([27, 27])

Layer: bias_ih_l0
torch.Size([27])

Layer: bias_hh_l0
torch.Size([27])

