## Deep Learning (Andrew Jaeyong Choi Prof.)
### Lab. LSTM

- Gyuyeon Lim (202334734, Department of Computer Engineering at Gachon Univ.)
- lky473736@gmail.com

------

<br>

### Previously Provided Code

The original RNN model implements a simple recurrent neural network for next-character prediction in English words. The architecture consists of a single RNN layer with 26 hidden units, matching the number of alphabet characters, so the hidden state at each step is used directly as the prediction scores over the 26 letters. The model takes one-hot encoded characters as input and predicts the next character in the sequence. During training, it processes 50 five-letter English words for 900 epochs using the Adam optimizer with a learning rate of 0.001 and a step scheduler that reduces the learning rate by a factor of 0.1 every 300 epochs. The RNN maintains a hidden state that captures sequential dependencies, but due to the vanishing gradient problem inherent in vanilla RNNs, it struggles with longer-term dependencies. The model uses CrossEntropyLoss for training and evaluates performance by comparing predicted characters with ground truth at each position, calculating both final character accuracy and overall character-by-character accuracy across all test words.

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Vocabulary: the lowercase alphabet, one one-hot slot per letter.
chars = "abcdefghijklmnopqrstuvwxyz"
char_list = list(chars)
n_letters = len(char_list)

# Number of stacked recurrent layers handed to the model constructor.
n_layers = 1

# Corpus of five-letter English words used for both training and evaluation.
five_words = (
    "basic beach below black brown carry cream drink error event exist first "
    "funny guess human image large magic mouse night noise ocean often order "
    "peace phone print quiet reach rough round scene score sense skill sleep "
    "small storm table think touch twice until upset voice waste watch white "
    "woman young"
).split()
n_five_words = len(five_words)

# NOTE(review): not referenced by the code in this cell; presumably the number
# of input characters per word (word length minus one) - confirm before use.
sequence_length = 4

def word_to_onehot(string):
    """Encode every character of ``string`` as a one-hot row.

    Returns a float NumPy array with one row per character and ``n_letters``
    columns; each row holds a single 1 at the character's position in
    ``char_list``. A character missing from ``char_list`` raises the
    ``ValueError`` produced by ``list.index``.
    """
    # Resolve all column positions first so the matrix can be sized in one go.
    columns = [char_list.index(letter) for letter in string]
    one_hot = np.zeros((len(columns), n_letters))
    for row, col in enumerate(columns):
        one_hot[row, col] = 1
    return one_hot

def onehot_to_word(onehot_1):
    """Map a score/one-hot tensor to the character with the largest entry.

    Despite the name, a single character (not a whole word) is returned: the
    tensor is converted to NumPy and the arg-max position is looked up in
    ``char_list``.
    """
    scores = torch.Tensor.numpy(onehot_1)
    best_index = np.argmax(scores)
    return char_list[best_index]

# Use RNN Packages
class myRNN(nn.Module):
    """Thin wrapper around ``nn.RNN`` that also builds its initial hidden state.

    The recurrent layer's output is returned as-is (there is no projection
    layer), so callers interpret the hidden units directly.
    """

    def __init__(self, input_size, hidden_size, num_layer):
        """Store the layer sizes and create the recurrent layer."""
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layer = num_layer

        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layer,
        )

    def forward(self, x, hidden):
        """Run ``x`` through the RNN and return ``(output, new_hidden)``."""
        return self.rnn(x, hidden)

    def init_hidden(self):
        """Return an all-zero hidden state for a batch of one sequence."""
        state_shape = (self.num_layer, 1, self.hidden_size)
        return torch.zeros(*state_shape)

# Module-level history buffers for the RNN run: main() appends one average
# loss per epoch to the first; the second is meant for per-word accuracies.
rnn_losses, rnn_word_accuracies = [], []

def main():
    """Train the vanilla RNN on next-character prediction, then evaluate it.

    Training: for every word in ``five_words`` the first ``len(word) - 1``
    one-hot characters are fed to the model and the following characters are
    the cross-entropy targets. The average loss of each epoch is appended to
    the module-level ``rnn_losses``.

    Evaluation: the trained weights are saved to and reloaded from
    ``trained_rnn.pth``; each word is predicted character by character, and
    the per-word accuracies are stored in the module-level
    ``rnn_word_accuracies``. Last-character and overall character accuracy
    are printed.
    """
    n_hidden = 26
    lr = 0.001
    epochs = 900

    model = myRNN(n_letters, n_hidden, n_layers)

    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=300, gamma=0.1)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for string in five_words:
            one_hot = torch.from_numpy(word_to_onehot(string)).float()
            model.zero_grad()
            hidden = model.init_hidden()

            # nn.RNN defaults to (seq_len, batch, features); add a batch axis of 1.
            inputs = one_hot[:-1].unsqueeze(1)
            # Class index of every "next" character. torch.argmax keeps the
            # target a LongTensor instead of relying on NumPy dispatching to
            # a torch tensor.
            target = torch.argmax(one_hot[1:], dim=1)

            output, hidden = model(inputs, hidden)

            # The hidden units themselves act as the per-letter scores.
            loss = loss_func(output.squeeze(1), target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Store average loss for this epoch
        avg_loss = total_loss / n_five_words
        rnn_losses.append(avg_loss)

        if epoch % 10 == 0:
            print('epoch%d' % epoch)
            print(f'Loss: {avg_loss:.4f}')

        scheduler.step()

    torch.save(model.state_dict(), 'trained_rnn.pth')
    model.load_state_dict(torch.load('trained_rnn.pth'))

    model.eval()
    with torch.no_grad():
        total = 0
        positive = 0
        total_text = 0
        positive_text = 0
        word_acc_list = []

        for i, string in enumerate(five_words):
            one_hot = torch.from_numpy(word_to_onehot(string)).float()
            hidden = model.init_hidden()
            inputs = one_hot[:-1].unsqueeze(1)
            output, hidden = model(inputs, hidden)
            # Drop only the batch axis; a bare squeeze() would also collapse
            # the time axis for a one-step sequence.
            output = output.squeeze(1)

            # The first letter is given; every later letter is predicted.
            output_string = string[0]
            word_correct_chars = 0

            for j in range(output.size(0)):
                output_string += onehot_to_word(output[j])
                total_text += 1

                if string[j + 1] == output_string[-1]:
                    positive_text += 1
                    word_correct_chars += 1

            # Calculate accuracy for this word
            word_accuracy = word_correct_chars / len(string[1:])
            word_acc_list.append(word_accuracy)

            total += 1
            if string[-1] == output_string[-1]:
                positive += 1

            print('%d GT:%s OUT:%s ACC:%.3f' % (i + 1, string, output_string, word_accuracy))

        # Update the module-level list in place; a plain assignment here
        # would only create a function-local name and leave the global empty.
        rnn_word_accuracies[:] = word_acc_list

        print('final text accuracy %d/%d (%.4f)' % (positive, total, positive / total))
        print('whole text accuracy %d/%d (%.4f)' % (positive_text, total_text, positive_text / total_text))

# Start training/evaluation only when this code runs as the top-level script.
if __name__ == "__main__":
    main()

epoch0
Loss: 3.2184
epoch10
Loss: 2.7669
epoch20
Loss: 2.6014
epoch30
Loss: 2.4702
epoch40
Loss: 2.3670
epoch50
Loss: 2.2834
epoch60
Loss: 2.2147
epoch70
Loss: 2.1603
epoch80
Loss: 2.1166
epoch90
Loss: 2.0801
epoch100
Loss: 2.0490
epoch110
Loss: 2.0216
epoch120
Loss: 1.9966
epoch130
Loss: 1.9738
epoch140
Loss: 1.9533
epoch150
Loss: 1.9349
epoch160
Loss: 1.9184
epoch170
Loss: 1.9034
epoch180
Loss: 1.8899
epoch190
Loss: 1.8774
epoch200
Loss: 1.8660
epoch210
Loss: 1.8559
epoch220
Loss: 1.8462
epoch230
Loss: 1.8377
epoch240
Loss: 1.8301
epoch250
Loss: 1.8227
epoch260
Loss: 1.8160
epoch270
Loss: 1.8096
epoch280
Loss: 1.8039
epoch290
Loss: 1.7982
epoch300
Loss: 1.7926
epoch310
Loss: 1.7901
epoch320
Loss: 1.7895
epoch330
Loss: 1.7889
epoch340
Loss: 1.7883
epoch350
Loss: 1.7877
epoch360
Loss: 1.7871
epoch370
Loss: 1.7864
epoch380
Loss: 1.7858
epoch390
Loss: 1.7852
epoch400
Loss: 1.7845
epoch410
Loss: 1.7839
epoch420
Loss: 1.7833
epoch430
Loss: 1.7827
epoch440
Loss: 1.7820
epoch450
Loss: 1.7814

------

### Improvement using LSTM

The improved LSTM model addresses the limitations of the vanilla RNN by incorporating Long Short-Term Memory units, which are specifically designed to handle long-term dependencies through gating mechanisms. The architecture is significantly enhanced with three LSTM layers instead of one, and the hidden size is increased from 26 to 64 units, providing greater model capacity. A dropout rate of 0.2 is applied between LSTM layers to prevent overfitting, and an additional fully connected layer maps the LSTM output back to the vocabulary size. The training process is extended to 1500 epochs with gradient clipping to prevent gradient explosion, weight decay for regularization, and a modified learning rate scheduler that halves the learning rate every 500 epochs. When CUDA is available, the model and its tensors are placed on the GPU, which makes training faster than the CPU-only RNN run. The LSTM's cell state and hidden state allow it to selectively remember and forget information over longer sequences, making it more effective at capturing character-level patterns and dependencies. This results in improved accuracy for both individual character predictions and complete word reconstruction compared to the vanilla RNN approach.


In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Vocabulary: the lowercase alphabet, one one-hot slot per letter.
chars = "abcdefghijklmnopqrstuvwxyz"
char_list = list(chars)
n_letters = len(char_list)

# Number of stacked LSTM layers handed to the model constructor.
n_layers = 3

# Corpus of five-letter English words used for both training and evaluation.
five_words = (
    "basic beach below black brown carry cream drink error event "
    "exist first funny guess human image large magic mouse night "
    "noise ocean often order peace phone print quiet reach rough "
    "round scene score sense skill sleep small storm table think "
    "touch twice until upset voice waste watch white woman young"
).split()
n_five_words = len(five_words)

# NOTE(review): not referenced by the code in this cell; presumably the number
# of input characters per word (word length minus one) - confirm before use.
sequence_length = 4

def word_to_onehot(string):
    """Turn ``string`` into a matrix of one-hot rows, one row per character.

    The result is a float NumPy array with ``n_letters`` columns. Row ``k``
    contains a single 1 in the column given by the position of the k-th
    character in ``char_list``; an unknown character raises the
    ``ValueError`` coming from ``list.index``.
    """
    # Look up every column index up front, then fill a pre-sized matrix.
    hot_columns = [char_list.index(symbol) for symbol in string]
    encoded = np.zeros((len(hot_columns), n_letters))
    for position, column in enumerate(hot_columns):
        encoded[position, column] = 1
    return encoded

def onehot_to_word(onehot_1):
    """Return the character from ``char_list`` at the arg-max of ``onehot_1``.

    The tensor is converted to a NumPy array first. Note that the result is a
    single character, not a full word.
    """
    as_array = torch.Tensor.numpy(onehot_1)
    winner = np.argmax(as_array)
    return char_list[winner]

class myLSTM(nn.Module):
    """Stacked LSTM followed by a linear layer back to the input width.

    The LSTM is created with ``batch_first=True``, so inputs and outputs are
    laid out as ``(batch, seq_len, features)``. The linear layer maps every
    time step's hidden vector to ``input_size`` scores.
    """

    def __init__(self, input_size, hidden_size, num_layer, dropout=0.2):
        """Build the LSTM stack and the output projection.

        Args:
            input_size: Width of each input vector (and of each output score
                vector).
            hidden_size: Number of hidden units per LSTM layer.
            num_layer: Number of stacked LSTM layers.
            dropout: Dropout probability applied between LSTM layers.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layer = num_layer

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layer,
            # Inter-layer dropout is only meaningful with more than one layer.
            dropout=dropout if num_layer > 1 else 0,
            batch_first=True,
        )

        self.fc = nn.Linear(hidden_size, input_size)

    def forward(self, x, hidden):
        """Return ``(scores, (h_n, c_n))`` for input ``x`` and state ``hidden``."""
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size=1):
        """Return a zeroed ``(h0, c0)`` pair matching the model's parameters.

        The state tensors are created on the same device and with the same
        dtype as the model's own weights, so the method does not depend on
        any module-level device variable and keeps working after the model
        is moved with ``.to(...)``.

        Args:
            batch_size: Number of sequences processed in parallel
                (defaults to 1).
        """
        reference = next(self.parameters())
        shape = (self.num_layer, batch_size, self.hidden_size)
        h0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        c0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return (h0, c0)

# Module-level history buffers for the LSTM run: main() appends one average
# loss per epoch to the first; the second is meant for per-word accuracies.
lstm_losses, lstm_word_accuracies = [], []

def main():
    """Train the stacked LSTM on next-character prediction, then evaluate it.

    Training: for every word in ``five_words`` the first ``len(word) - 1``
    one-hot characters are the input and the following characters are the
    cross-entropy targets. Gradients are clipped to a norm of 1.0 before each
    optimizer step, and the average loss of each epoch is appended to the
    module-level ``lstm_losses``.

    Evaluation: the weights are saved to and reloaded from
    ``trained_lstm.pth``, the model is switched to eval mode, each word is
    predicted character by character, and the per-word accuracies are stored
    in the module-level ``lstm_word_accuracies``. Last-character and overall
    character accuracy are printed.
    """
    n_hidden = 64
    lr = 0.001
    epochs = 1500

    model = myLSTM(n_letters, n_hidden, n_layers).to(device)

    loss_func = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=500, gamma=0.5)

    print("Training started...")

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for string in five_words:
            one_hot = torch.from_numpy(word_to_onehot(string)).float().to(device)
            model.zero_grad()

            # The LSTM is batch_first, so prepend a batch axis of size 1.
            inputs = one_hot[:-1].unsqueeze(0)
            hidden = model.init_hidden()
            # Class index of every "next" character (already a LongTensor on
            # the same device as one_hot).
            target = torch.argmax(one_hot[1:], dim=1)

            output, _ = model(inputs, hidden)

            loss = loss_func(output.squeeze(0), target)
            loss.backward()

            # Clip after backward and before the step so the update is bounded.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            total_loss += loss.item()

        # Store average loss for this epoch
        avg_loss = total_loss / n_five_words
        lstm_losses.append(avg_loss)

        if epoch % 50 == 0:
            print('Epoch %d, Average Loss: %.4f' % (epoch, avg_loss))

        scheduler.step()

    print("Training completed!")
    torch.save(model.state_dict(), 'trained_lstm.pth')
    # map_location keeps the reload valid on whichever device is in use.
    model.load_state_dict(torch.load('trained_lstm.pth', map_location=device))

    print("\nEvaluation:")
    # Switch off the inter-layer dropout; without eval() the predictions
    # below would be computed with random units dropped.
    model.eval()
    with torch.no_grad():
        total = 0
        positive = 0
        total_text = 0
        positive_text = 0
        word_acc_list = []

        for i, string in enumerate(five_words):
            one_hot = torch.from_numpy(word_to_onehot(string)).float().to(device)

            inputs = one_hot[:-1].unsqueeze(0)
            hidden = model.init_hidden()

            output, _ = model(inputs, hidden)
            # Drop only the batch axis; a bare squeeze() would also collapse
            # the time axis for a one-step sequence.
            output = output.squeeze(0)

            # The first letter is given; every later letter is predicted.
            output_string = string[0]
            word_correct_chars = 0

            for j in range(output.size(0)):
                # onehot_to_word converts to NumPy, which needs a CPU tensor.
                output_string += onehot_to_word(output[j].cpu())
                total_text += 1

                if string[j + 1] == output_string[-1]:
                    positive_text += 1
                    word_correct_chars += 1

            # Calculate accuracy for this word
            word_accuracy = word_correct_chars / len(string[1:])
            word_acc_list.append(word_accuracy)

            total += 1
            if string[-1] == output_string[-1]:
                positive += 1

            print('%d GT:%s OUT:%s ACC:%.3f' % (i + 1, string, output_string, word_accuracy))

        # Update the module-level list in place; a plain assignment here
        # would only create a function-local name and leave the global empty.
        lstm_word_accuracies[:] = word_acc_list

        print('final text accuracy %d/%d (%.4f)' % (positive, total, positive / total))
        print('whole text accuracy %d/%d (%.4f)' % (positive_text, total_text, positive_text / total_text))

# Start training/evaluation only when this code runs as the top-level script.
if __name__ == "__main__":
    main()

Using device: cuda
Training started...
Epoch 0, Average Loss: 3.1648
Epoch 50, Average Loss: 1.3907
Epoch 100, Average Loss: 0.6308
Epoch 150, Average Loss: 0.4195
Epoch 200, Average Loss: 0.3245
Epoch 250, Average Loss: 0.3040
Epoch 300, Average Loss: 0.2878
Epoch 350, Average Loss: 0.2890
Epoch 400, Average Loss: 0.2932
Epoch 450, Average Loss: 0.2924
Epoch 500, Average Loss: 0.2900
Epoch 550, Average Loss: 0.2823
Epoch 600, Average Loss: 0.2753
Epoch 650, Average Loss: 0.2896
Epoch 700, Average Loss: 0.2795
Epoch 750, Average Loss: 0.2818
Epoch 800, Average Loss: 0.2758
Epoch 850, Average Loss: 0.2753
Epoch 900, Average Loss: 0.2926
Epoch 950, Average Loss: 0.2782
Epoch 1000, Average Loss: 0.2649
Epoch 1050, Average Loss: 0.2781
Epoch 1100, Average Loss: 0.2900
Epoch 1150, Average Loss: 0.2789
Epoch 1200, Average Loss: 0.2770
Epoch 1250, Average Loss: 0.2830
Epoch 1300, Average Loss: 0.2639
Epoch 1350, Average Loss: 0.2659
Epoch 1400, Average Loss: 0.2642
Epoch 1450, Average Loss: 0