Implementation of 1997 Neural Computation Paper on LSTM

Import dependencies

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch 
from torch.utils.data import DataLoader, TensorDataset
import math
from sklearn.model_selection import train_test_split

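Construct a forget gate: $f_t = \sigma(W_f \cdot [h_{t-1}, x_t] + b_f)$, where $W_f$ are the gate's weights, $b_f$ is its bias, $h_{t-1}$ is the previous hidden state and $x_t$ is the current input. In the code, the weight matrix acting on the concatenation is split into `W_x` (applied to $x_t$) and `W_h` (applied to $h_{t-1}$).
Expected shapes:
x_t: (batch_size, input_size)
h_prev: (batch_size, hidden_size)
f_t: (batch_size, hidden_size)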

In [3]:
class ForgetGate(nn.Module):
    """Forget gate of an LSTM cell: f_t = sigmoid(x_t W_x^T + h_prev W_h^T + b).

    Parameters
    ----------
    input_size : int
        Number of features in x_t.
    hidden_size : int
        Number of features in h_prev and in the returned gate activation.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size

        # Input-to-hidden weights: standard normal draws scaled by 1/sqrt(fan_in).
        w_input = torch.randn(hidden_size, input_size) / math.sqrt(input_size)
        self.W_x = nn.Parameter(w_input)
        # Hidden-to-hidden (recurrent) weights, scaled the same way.
        w_recurrent = torch.randn(hidden_size, hidden_size) / math.sqrt(hidden_size)
        self.W_h = nn.Parameter(w_recurrent)
        # Bias starts at zero.
        self.b = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x_t, h_prev):
        """Return f_t for input x_t (batch, input_size) and h_prev (batch, hidden_size)."""
        from_input = torch.matmul(x_t, self.W_x.T)
        from_hidden = torch.matmul(h_prev, self.W_h.T)
        # Sigmoid squashes the pre-activation into (0, 1).
        return torch.sigmoid(from_input + from_hidden + self.b)

Next we create an input gate: $i_t = \sigma(W_i \cdot [h_{t-1}, x_t] + b_i)$
Expected shapes:
x_t -> (batch_size, input_size)
h_prev -> (batch_size, hidden_size)
i_t -> (batch_size, hidden_size)

In [4]:
class InputGate(nn.Module):
    """Input gate of an LSTM cell: i_t = sigmoid(x_t W_x^T + h_prev W_h^T + b).

    ``input_size`` is the feature width of x_t; ``hidden_size`` is the feature
    width of h_prev and of the returned activation.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Standard normal draws scaled by 1/sqrt(fan_in); the input weights are
        # drawn before the recurrent weights.
        input_weights = torch.randn(hidden_size, input_size) / math.sqrt(input_size)
        recurrent_weights = torch.randn(hidden_size, hidden_size) / math.sqrt(hidden_size)

        self.W_x = nn.Parameter(input_weights)       # (hidden_size, input_size)
        self.W_h = nn.Parameter(recurrent_weights)   # (hidden_size, hidden_size)
        self.b = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x_t, h_prev):
        """Return i_t for input x_t (batch, input_size) and h_prev (batch, hidden_size)."""
        pre_activation = torch.matmul(x_t, self.W_x.T) + torch.matmul(h_prev, self.W_h.T) + self.b
        return torch.sigmoid(pre_activation)

New value (candidate) function: $g_t = \tanh(W_g \cdot [h_{t-1}, x_t] + b_g)$

In [5]:
class NewValueFunction(nn.Module):
    """Candidate ("new value") of an LSTM cell: g_t = tanh(x_t W_x^T + h_prev W_h^T + b).

    ``input_size`` is the feature width of x_t; ``hidden_size`` is the feature
    width of h_prev and of the returned candidate.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size

        # Input-to-hidden weights: standard normal draws scaled by 1/sqrt(fan_in).
        w_input = torch.randn(hidden_size, input_size) / math.sqrt(input_size)
        self.W_x = nn.Parameter(w_input)
        # Hidden-to-hidden (recurrent) weights, scaled the same way.
        w_recurrent = torch.randn(hidden_size, hidden_size) / math.sqrt(hidden_size)
        self.W_h = nn.Parameter(w_recurrent)
        # Bias starts at zero.
        self.b = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x_t, h_prev):
        """Return g_t for input x_t (batch, input_size) and h_prev (batch, hidden_size)."""
        input_term = torch.matmul(x_t, self.W_x.T)
        recurrent_term = torch.matmul(h_prev, self.W_h.T)
        # tanh keeps the candidate values in (-1, 1).
        return torch.tanh(input_term + recurrent_term + self.b)

Update the conveyor belt (cell state): this combines the three previous modules as $c_t = f_t \odot c_{t-1} + i_t \odot g_t$.

In [6]:
class ConveyorBeltUpdate(nn.Module):
    """Cell-state ("conveyor belt") update: c_t = f_t * c_prev + i_t * g_t.

    Owns the forget gate, the input gate and the candidate generator, all
    built with the same (input_size, hidden_size).
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        sizes = (input_size, hidden_size)
        self.forget_gate = ForgetGate(*sizes)
        self.input_gate = InputGate(*sizes)
        self.new_value = NewValueFunction(*sizes)

    def forward(self, x_t, h_prev, c_prev):
        """Return (c_t, f_t, i_t, g_t) for the current step."""
        gate_inputs = (x_t, h_prev)
        f_t = self.forget_gate(*gate_inputs)
        i_t = self.input_gate(*gate_inputs)
        g_t = self.new_value(*gate_inputs)

        retained = f_t * c_prev   # part of the old cell state that is kept
        written = i_t * g_t       # part of the candidate that is written
        return retained + written, f_t, i_t, g_t
    

Create output gate

In [7]:
class OutputGate(nn.Module):
    """Output gate of an LSTM cell: o_t = sigmoid(x_t W_x^T + h_prev W_h^T + b).

    ``input_size`` is the feature width of x_t; ``hidden_size`` is the feature
    width of h_prev and of the returned activation.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Standard normal draws scaled by 1/sqrt(fan_in); the input weights are
        # drawn before the recurrent weights.
        input_weights = torch.randn(hidden_size, input_size) / math.sqrt(input_size)
        recurrent_weights = torch.randn(hidden_size, hidden_size) / math.sqrt(hidden_size)

        self.W_x = nn.Parameter(input_weights)       # (hidden_size, input_size)
        self.W_h = nn.Parameter(recurrent_weights)   # (hidden_size, hidden_size)
        self.b = nn.Parameter(torch.zeros(hidden_size))

    def forward(self, x_t, h_prev):
        """Return o_t for input x_t (batch, input_size) and h_prev (batch, hidden_size)."""
        pre_activation = torch.matmul(x_t, self.W_x.T) + torch.matmul(h_prev, self.W_h.T) + self.b
        return torch.sigmoid(pre_activation)

Create update state class -> This will return the final current c_t and h_t states

In [8]:
class UpdateState(nn.Module):
    """One full LSTM step: update the cell state, then derive the hidden state.

    h_t = o_t * tanh(c_t), where c_t comes from ConveyorBeltUpdate and o_t
    from OutputGate.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.updateConveyorBelt = ConveyorBeltUpdate(input_size, hidden_size)
        self.output_gate = OutputGate(input_size, hidden_size)

    def forward(self, x_t, h_prev, c_prev, return_gates: bool=False):
        """Return (h_t, c_t), plus a dict of gate activations if ``return_gates``."""
        # New cell state together with the gate values that produced it.
        c_t, f_t, i_t, g_t = self.updateConveyorBelt(x_t, h_prev, c_prev)
        # The output gate decides how much of the squashed cell state is exposed.
        o_t = self.output_gate(x_t, h_prev)
        h_t = o_t * torch.tanh(c_t)

        if not return_gates:
            return h_t, c_t
        return h_t, c_t, dict(f_t=f_t, i_t=i_t, g_t=g_t, o_t=o_t)

To process a full sequence, we apply the previous update step once per time step, carrying $h_t$ and $c_t$ forward; the states after the last step are returned.

In [9]:
class LSTM(nn.Module):
    """Unrolls a single UpdateState cell over a batch-first sequence.

    Parameters
    ----------
    input_size : int
        Feature width of each time step of the input.
    hidden_size : int
        Width of the hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.update_state = UpdateState(input_size, hidden_size)
        self.hidden_size = hidden_size

    def forward(self, x):  # x shape: (batch_size, seq_len, input_size)
        """Return the hidden and cell states after the last time step.

        Both states start at zero and have shape (batch_size, hidden_size).
        If seq_len is 0 the zero states are returned unchanged.
        """
        batch_size, seq_len, _ = x.size()
        # Allocate the initial states on x's device and in x's dtype. A bare
        # torch.zeros(...) uses the global default device/dtype instead, which
        # fails as soon as the model and its inputs live elsewhere (e.g. GPU).
        h_t = x.new_zeros(batch_size, self.hidden_size)
        c_t = x.new_zeros(batch_size, self.hidden_size)

        for t in range(seq_len):
            h_t, c_t = self.update_state(x[:, t, :], h_t, c_t)
        return h_t, c_t

          ┌────────┐
x_t ----> │Forget  │--- W_f, U_f, b_f
h_t-1 --> │ Gate   │--- controls what to erase
          └────────┘

x_t ----> ┌────────┐
h_t-1 --> │Input   │--- W_i, U_i, b_i
          │ Gate   │--- controls what to write
          └────────┘

x_t ----> ┌────────────┐
h_t-1 --> │New Value   │--- W_c, U_c, b_c
          │(Candidate) │--- generates new memory
          └────────────┘

x_t ----> ┌────────┐
h_t-1 --> │Output  │--- W_o, U_o, b_o
          │ Gate   │--- controls exposure of memory
          └────────┘

Test if my implementation of LSTM works by making a dataset

In [10]:
import torch
import math

def make_sine_dataset(n_sequences=1000, seq_len=20):
    """Build a next-value prediction dataset from random sine waves.

    Each sequence is sin(freq * t + phase) sampled at t = 0..seq_len, with the
    phase drawn from [0, 2*pi) and freq from [0.1, 0.5). The first ``seq_len``
    samples are the input; the last sample is the target.

    Returns
    -------
    (X, Y) : X has shape (n_sequences, seq_len, 1), Y has shape (n_sequences,).
    """
    time_steps = torch.arange(seq_len + 1).float()
    inputs, targets = [], []
    for _ in range(n_sequences):
        # Draw order matters for reproducibility under a seed: phase first, then frequency.
        phase = torch.rand(1).item() * 2 * math.pi
        freq = 0.1 + torch.rand(1).item() * 0.4
        wave = torch.sin(freq * time_steps + phase)   # (seq_len + 1,)
        inputs.append(wave[:-1].unsqueeze(-1))        # (seq_len, 1)
        targets.append(wave[-1])                      # 0-dim target
    return torch.stack(inputs, dim=0), torch.stack(targets, dim=0)

Make the dataset and apply a test train split and dataloaders

In [12]:
# Seed torch's global RNG so the random phases and frequencies drawn by
# make_sine_dataset are the same on every run; the two splits below are
# already pinned with random_state=42.
torch.manual_seed(42)

seq_len = 20
X, Y = make_sine_dataset(n_sequences=1000, seq_len=seq_len)

# 70 % train, then the remaining 30 % is halved into validation and test.
X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, random_state=42)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, random_state=42)

print("Train set:", X_train.shape, Y_train.shape)
print("Validation set:", X_val.shape, Y_val.shape)
print("Test set:", X_test.shape, Y_test.shape)

Train set: torch.Size([700, 20, 1]) torch.Size([700])
Validation set: torch.Size([150, 20, 1]) torch.Size([150])
Test set: torch.Size([150, 20, 1]) torch.Size([150])


In [16]:
# Mini-batch loaders over the three sine splits. Only the training loader
# shuffles; test and validation batches are served in dataset order.
batchsize = 64
train_loader = DataLoader(TensorDataset(X_train, Y_train), batch_size=batchsize, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, Y_test), batch_size=batchsize)
val_loader = DataLoader(TensorDataset(X_val, Y_val), batch_size=batchsize)

Create an instance of the model and create an optimizer + loss function
Create a training loop of 20 epochs to test the strength of the model

In [17]:
# Sequence encoder: one scalar feature per time step, 32-dimensional state.
model = LSTM(input_size=1, hidden_size=32)
# Regression head mapping the final hidden state to one predicted value. Its
# input width is read from the model so the two sizes cannot drift apart.
predictor = nn.Linear(model.hidden_size, 1)

In [18]:
# A single Adam instance updates both the LSTM and the linear head.
optimizer = torch.optim.Adam(
    params=[*model.parameters(), *predictor.parameters()],
    lr=1e-3,
)
# Mean squared error between the predicted and the true next sample.
loss_fn = nn.MSELoss()

In [None]:
epochs = 20

for epoch in range(epochs):
    # --- Training pass: one optimiser step per mini-batch ---
    model.train()
    train_loss = 0
    for X_batch, Y_batch in train_loader:
        optimizer.zero_grad()                   # drop gradients from the previous step
        h_T, _ = model(X_batch)                 # final hidden state of every sequence
        preds = predictor(h_T).squeeze(-1)
        loss = loss_fn(preds, Y_batch)
        loss.backward()
        optimizer.step()
        # loss is a batch mean; weight it by the batch size so the epoch
        # figure below is a per-sample mean.
        train_loss += loss.item() * len(X_batch)
    train_loss /= len(train_loader.dataset)

    # --- Validation pass: no gradients, no parameter updates ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, Y_batch in val_loader:
            h_T, _ = model(X_batch)
            preds = predictor(h_T).squeeze(-1)
            loss = loss_fn(preds, Y_batch)
            val_loss += loss.item() * len(X_batch)
    val_loss /= len(val_loader.dataset)

    print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")

Epoch 1/20 - Train Loss: 0.4120 - Val Loss: 0.3591
Epoch 2/20 - Train Loss: 0.3109 - Val Loss: 0.2749
Epoch 3/20 - Train Loss: 0.2360 - Val Loss: 0.2104
Epoch 4/20 - Train Loss: 0.1751 - Val Loss: 0.1569
Epoch 5/20 - Train Loss: 0.1264 - Val Loss: 0.1135
Epoch 6/20 - Train Loss: 0.0879 - Val Loss: 0.0824
Epoch 7/20 - Train Loss: 0.0628 - Val Loss: 0.0646
Epoch 8/20 - Train Loss: 0.0480 - Val Loss: 0.0505
Epoch 9/20 - Train Loss: 0.0383 - Val Loss: 0.0405
Epoch 10/20 - Train Loss: 0.0306 - Val Loss: 0.0331
Epoch 11/20 - Train Loss: 0.0248 - Val Loss: 0.0270
Epoch 12/20 - Train Loss: 0.0200 - Val Loss: 0.0219
Epoch 13/20 - Train Loss: 0.0161 - Val Loss: 0.0179
Epoch 14/20 - Train Loss: 0.0131 - Val Loss: 0.0150
Epoch 15/20 - Train Loss: 0.0109 - Val Loss: 0.0126
Epoch 16/20 - Train Loss: 0.0092 - Val Loss: 0.0107
Epoch 17/20 - Train Loss: 0.0078 - Val Loss: 0.0095
Epoch 18/20 - Train Loss: 0.0068 - Val Loss: 0.0079
Epoch 19/20 - Train Loss: 0.0056 - Val Loss: 0.0068
Epoch 20/20 - Train L

Evaluate on the test set

In [20]:
# Score the trained sine model on the held-out test split.
model.eval()

with torch.no_grad():
    test_loss = 0
    for X_batch, Y_batch in test_loader:
        h_T, _ = model(X_batch)
        preds = predictor(h_T).squeeze(-1)
        loss = loss_fn(preds, Y_batch)
        # Weight each batch mean by its size to get a per-sample mean below.
        test_loss += loss.item() * len(X_batch)

test_loss /= len(test_loader.dataset)
print("Final Test MSE:", test_loss)

Final Test MSE: 0.004802806728209059


Compare my LSTM model vs the torch LSTM model using the IMDB sentiment analysis dataset

In [24]:
from datasets import load_dataset

# Load the IMDB dataset and pull the raw review texts and their labels out of
# the "train" and "test" splits.
imdb = load_dataset("imdb")
train_texts = imdb["train"]["text"]
train_labels = imdb["train"]["label"]
test_texts = imdb["test"]["text"]
test_labels = imdb["test"]["label"]

Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 717136.86 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 1125455.89 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 1202847.16 examples/s]


Create a tokenizer

In [25]:
import re
from collections import Counter
import torch

def basic_tokenize(text):
    """Lower-case ``text`` and split it into runs of the characters a-z and 0-9.

    Every run of other characters (punctuation, whitespace, non-ASCII letters)
    acts as a separator.
    """
    return re.sub(r"[^a-z0-9]+", " ", text.lower()).split()

# Build vocab from training texts
# Count every token of every training review.
counter = Counter(tok for txt in train_texts for tok in basic_tokenize(txt))

max_vocab_size = 20000
# Two vocabulary slots are reserved for the special tokens; the rest go to
# the most frequent words.
most_common = counter.most_common(max_vocab_size - 2)
itos = ["<pad>", "<unk>"]
itos.extend(word for word, _ in most_common)
# Inverse mapping: token string -> integer id.
stoi = dict(zip(itos, range(len(itos))))

pad_idx, unk_idx = stoi["<pad>"], stoi["<unk>"]

def encode_text(text, max_len=200):
    """Turn ``text`` into a LongTensor of ``max_len`` vocabulary ids.

    Tokens missing from ``stoi`` map to ``unk_idx``. Longer texts are cut to
    ``max_len`` tokens; shorter ones are right-padded with ``pad_idx``.
    """
    kept_tokens = basic_tokenize(text)[:max_len]
    ids = [stoi.get(tok, unk_idx) for tok in kept_tokens]
    # A non-positive multiplier yields an empty list, so full-length inputs get no padding.
    padding = [pad_idx] * (max_len - len(ids))
    return torch.tensor(ids + padding, dtype=torch.long)

def encode_label(label):
    """Wrap an integer class label in a 0-dim LongTensor."""
    label_tensor = torch.tensor(label, dtype=torch.long)  # 0 or 1 in HF IMDB
    return label_tensor

Build datasets and dataloader for IMDB dataset

In [26]:
max_len = 200

# Encode every review as a fixed-length id tensor and every label as a scalar tensor.
X_train_ids = torch.stack([encode_text(text, max_len) for text in train_texts])
y_train_ids = torch.stack([encode_label(label) for label in train_labels])

X_test_ids  = torch.stack([encode_text(text, max_len) for text in test_texts])
y_test_ids  = torch.stack([encode_label(label) for label in test_labels])

# train / val split. random_state pins the shuffle so both models compared
# later see the same split on every run, as the sine-dataset splits already do.
# NOTE: X_train / X_val are rebound here and no longer refer to the sine data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_ids, y_train_ids, test_size=0.2, shuffle=True, random_state=42
)

# train_loader / val_loader / test_loader are rebound to the IMDB data as well.
batch_size = 64
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(TensorDataset(X_val,   y_val),   batch_size=batch_size)
test_loader  = DataLoader(TensorDataset(X_test_ids, y_test_ids), batch_size=batch_size)

Make a custom LSTM sentiment wrapper

In [27]:
class CustomLSTMSentiment(nn.Module):
    """Binary sentiment classifier: embedding -> hand-written LSTM -> linear head.

    ``pad_idx`` is passed to the embedding as ``padding_idx``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = LSTM(input_size=embed_dim, hidden_size=hidden_size)
        # One output unit: a single logit for binary classification.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x_ids):  # x_ids: (batch, seq_len) of token IDs
        """Return one logit per sequence, shape (batch,)."""
        embedded = self.embedding(x_ids)         # (batch, seq_len, embed_dim)
        # Only the final hidden state is used; the final cell state is discarded.
        final_hidden, _ = self.lstm(embedded)    # (batch, hidden_size)
        return self.fc(final_hidden).squeeze(-1)

In [41]:
# Hyper-parameters shared by both sentiment models in this notebook.
vocab_size = len(itos)     # one embedding row per entry of the vocabulary list
embed_dim  = 100
hidden_size = 128

pad_idx = stoi["<pad>"]  # same value as the pad_idx set when the vocabulary was built

# NOTE: this rebinds `model`, which previously held the sine-regression LSTM.
model = CustomLSTMSentiment(vocab_size, embed_dim, hidden_size, pad_idx)

In [42]:
# Adam over all parameters of the custom sentiment model.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Binary cross-entropy on raw logits. train_one_epoch / eval_one_epoch read
# this module-level name rather than taking the loss as an argument.
loss_fn = nn.BCEWithLogitsLoss()

In [43]:
# Everything in this notebook runs on the CPU.
device = torch.device("cpu")
# Last expression of the cell, so the notebook displays the module structure.
model.to(device)

CustomLSTMSentiment(
  (embedding): Embedding(20000, 100, padding_idx=0)
  (lstm): LSTM(
    (update_state): UpdateState(
      (updateConveyorBelt): ConveyorBeltUpdate(
        (forget_gate): ForgetGate()
        (input_gate): InputGate()
        (new_value): NewValueFunction()
      )
      (output_gate): OutputGate()
    )
  )
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [44]:
def train_one_epoch(model, loader, optimizer, device, loss_fn=None):
    """Run one optimisation pass over ``loader``.

    Args:
        model: module mapping a (batch, seq_len) id tensor to (batch,) logits.
        loader: iterable of ``(X_batch, y_batch)`` pairs with 0/1 labels.
        optimizer: optimiser holding ``model``'s parameters.
        device: device each batch is moved to before the forward pass.
        loss_fn: callable ``(logits, float_targets) -> scalar loss``. Defaults
            to ``nn.BCEWithLogitsLoss()``, so the function no longer depends
            on whichever module-level ``loss_fn`` was assigned last.

    Returns:
        ``(avg_loss, avg_acc)``: per-sample mean loss and accuracy over the epoch.
    """
    if loss_fn is None:
        loss_fn = nn.BCEWithLogitsLoss()

    model.train()
    total_loss = 0.0
    total_correct = 0
    total_count = 0

    for X_batch, y_batch in loader:
        X_batch = X_batch.to(device)             # (batch, seq_len)
        y_batch = y_batch.to(device).float()     # (batch,)

        logits = model(X_batch)                  # (batch,)
        loss = loss_fn(logits, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # loss is a batch mean; weight by batch size for a per-sample average.
        total_loss += loss.item() * X_batch.size(0)

        # A probability above 0.5 counts as a positive prediction.
        probs = torch.sigmoid(logits)
        preds = (probs > 0.5).long()
        total_correct += (preds == y_batch.long()).sum().item()
        total_count += X_batch.size(0)

    avg_loss = total_loss / total_count
    avg_acc  = total_correct / total_count
    return avg_loss, avg_acc


def eval_one_epoch(model, loader, device, loss_fn=None):
    """Evaluate ``model`` over ``loader`` without updating any parameters.

    Args:
        model: module mapping a (batch, seq_len) id tensor to (batch,) logits.
        loader: iterable of ``(X_batch, y_batch)`` pairs with 0/1 labels.
        device: device each batch is moved to before the forward pass.
        loss_fn: callable ``(logits, float_targets) -> scalar loss``. Defaults
            to ``nn.BCEWithLogitsLoss()``, so the function no longer depends
            on whichever module-level ``loss_fn`` was assigned last.

    Returns:
        ``(avg_loss, avg_acc)``: per-sample mean loss and accuracy.
    """
    if loss_fn is None:
        loss_fn = nn.BCEWithLogitsLoss()

    model.eval()
    total_loss = 0.0
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device).float()

            logits = model(X_batch)
            loss = loss_fn(logits, y_batch)

            # loss is a batch mean; weight by batch size for a per-sample average.
            total_loss += loss.item() * X_batch.size(0)

            # A probability above 0.5 counts as a positive prediction.
            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long()
            total_correct += (preds == y_batch.long()).sum().item()
            total_count += X_batch.size(0)

    avg_loss = total_loss / total_count
    avg_acc  = total_correct / total_count
    return avg_loss, avg_acc

In [45]:
num_epochs = 15 

# Train the custom-LSTM sentiment model: one training pass followed by one
# validation pass per epoch, logging per-sample loss and accuracy for both.
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, device)
    val_loss, val_acc     = eval_one_epoch(model, val_loader,   device)

    print(
        f"Epoch {epoch}/{num_epochs} | "
        f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.3f} | "
        f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.3f}"
    )

Epoch 1/15 | Train Loss: 0.6922, Acc: 0.515 | Val Loss: 0.6997, Acc: 0.565
Epoch 2/15 | Train Loss: 0.6725, Acc: 0.563 | Val Loss: 0.7193, Acc: 0.607
Epoch 3/15 | Train Loss: 0.6482, Acc: 0.623 | Val Loss: 0.6799, Acc: 0.589
Epoch 4/15 | Train Loss: 0.5938, Acc: 0.687 | Val Loss: 0.6666, Acc: 0.642
Epoch 5/15 | Train Loss: 0.5651, Acc: 0.716 | Val Loss: 0.6571, Acc: 0.657
Epoch 6/15 | Train Loss: 0.5205, Acc: 0.750 | Val Loss: 0.7354, Acc: 0.619
Epoch 7/15 | Train Loss: 0.4972, Acc: 0.761 | Val Loss: 0.7145, Acc: 0.655
Epoch 8/15 | Train Loss: 0.4744, Acc: 0.769 | Val Loss: 0.7176, Acc: 0.661
Epoch 9/15 | Train Loss: 0.4488, Acc: 0.787 | Val Loss: 0.7333, Acc: 0.672
Epoch 10/15 | Train Loss: 0.4242, Acc: 0.788 | Val Loss: 0.8043, Acc: 0.554
Epoch 11/15 | Train Loss: 0.5055, Acc: 0.727 | Val Loss: 0.7352, Acc: 0.673
Epoch 12/15 | Train Loss: 0.4158, Acc: 0.806 | Val Loss: 0.7392, Acc: 0.690
Epoch 13/15 | Train Loss: 0.4501, Acc: 0.782 | Val Loss: 0.7259, Acc: 0.672
Epoch 14/15 | Train L

Compute test accuracy 

In [46]:
# Final score of the custom-LSTM sentiment model on the IMDB test loader.
test_loss, test_acc = eval_one_epoch(model, test_loader, device)
print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.3f}")

Test Loss: 0.6423, Test Acc: 0.694


Create an instance of the PyTorch LSTM and use the same dataloaders and training loop

In [47]:
import torch
import torch.nn as nn

class TorchLSTMSentiment(nn.Module):
    """Binary sentiment classifier: embedding -> ``nn.LSTM`` -> linear head.

    Takes the same constructor arguments as CustomLSTMSentiment; only the
    recurrent layer differs.
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        # batch_first=True makes the layer take and return (batch, seq, feature) tensors.
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_size, batch_first=True)
        # One output unit: a single logit for binary classification.
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x_ids):  # x_ids: (batch, seq_len)
        """Return one logit per sequence, shape (batch,)."""
        embedded = self.embedding(x_ids)                 # (batch, seq_len, embed_dim)

        # The per-step outputs and the final cell state are not needed; only
        # the final hidden states, (num_layers * num_directions, batch, hidden_size).
        _, (final_hidden, _) = self.lstm(embedded)

        last_layer_hidden = final_hidden[-1]             # (batch, hidden_size)
        return self.fc(last_layer_hidden).squeeze(-1)    # (batch,)

In [48]:
# Baseline built with the same hyper-parameters as the custom model.
torch_model = TorchLSTMSentiment(vocab_size, embed_dim, hidden_size, pad_idx).to(device)
# Rebinds the module-level loss_fn that train_one_epoch / eval_one_epoch read.
loss_fn = nn.BCEWithLogitsLoss()
# Separate optimiser instance, so the custom model's optimiser is left untouched.
optimizer_torch = torch.optim.Adam(torch_model.parameters(), lr=1e-3)

In [49]:
num_epochs = 15

# Same schedule as the custom model: one training pass followed by one
# validation pass per epoch, on the same loaders.
for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_one_epoch(torch_model, train_loader, optimizer_torch, device)
    val_loss, val_acc     = eval_one_epoch(torch_model, val_loader, device)

    print(
        f"[Torch LSTM] Epoch {epoch}/{num_epochs} | "
        f"Train Loss: {train_loss:.4f}, Acc: {train_acc:.3f} | "
        f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.3f}"
    )

[Torch LSTM] Epoch 1/15 | Train Loss: 0.6924, Acc: 0.511 | Val Loss: 0.6934, Acc: 0.503
[Torch LSTM] Epoch 2/15 | Train Loss: 0.6883, Acc: 0.538 | Val Loss: 0.6929, Acc: 0.517
[Torch LSTM] Epoch 3/15 | Train Loss: 0.6757, Acc: 0.571 | Val Loss: 0.7547, Acc: 0.521
[Torch LSTM] Epoch 4/15 | Train Loss: 0.6502, Acc: 0.610 | Val Loss: 0.6751, Acc: 0.586
[Torch LSTM] Epoch 5/15 | Train Loss: 0.5692, Acc: 0.705 | Val Loss: 0.7141, Acc: 0.538
[Torch LSTM] Epoch 6/15 | Train Loss: 0.5905, Acc: 0.657 | Val Loss: 0.5923, Acc: 0.706
[Torch LSTM] Epoch 7/15 | Train Loss: 0.4713, Acc: 0.788 | Val Loss: 0.5365, Acc: 0.754
[Torch LSTM] Epoch 8/15 | Train Loss: 0.4341, Acc: 0.800 | Val Loss: 0.4951, Acc: 0.790
[Torch LSTM] Epoch 9/15 | Train Loss: 0.3935, Acc: 0.834 | Val Loss: 0.4913, Acc: 0.796
[Torch LSTM] Epoch 10/15 | Train Loss: 0.3080, Acc: 0.881 | Val Loss: 0.4459, Acc: 0.816
[Torch LSTM] Epoch 11/15 | Train Loss: 0.2454, Acc: 0.909 | Val Loss: 0.4391, Acc: 0.825
[Torch LSTM] Epoch 12/15 | Tra

Compare accuracy after training

In [50]:
# Final score of the nn.LSTM baseline on the same IMDB test loader.
test_loss, test_acc = eval_one_epoch(torch_model, test_loader, device)
print(f"[Torch LSTM] Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.3f}")

[Torch LSTM] Test Loss: 0.5411, Test Acc: 0.823
