In [1]:
from data_rnn import load_ndfa, load_brackets
from data_prep import pad_and_convert
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [2]:
x_train_ndfa, (i2w_ndfa, w2i_ndfa) = load_ndfa(n=150_000)
x_train_brackets, (i2w_brackets, w2i_brackets) = load_brackets(n=150_000)


In [3]:
print(''.join([i2w_ndfa[i] for i in x_train_ndfa[50]]))


ss


In [4]:
print(''.join([i2w_brackets[i] for i in x_train_brackets[10_000]]))


()


In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [8]:
class LSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, h, num_char, n_layers=1):
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(input_size=emb_size, hidden_size=h, num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(h, num_char)
    
    def forward(self, input_seq, h):
        embedded = self.embedding(input_seq)
        lstm_out, hidden = self.lstm(embedded, h)
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        return output, hidden

In [9]:
def pad_and_convert2(batch, w2i):
    start_token = w2i['.start']
    end_token = w2i['.end']
    
    batch = [[start_token] + x + [end_token] for x in batch]
    
    max_len = max([len(x) for x in batch])
    
    batch = [x + [w2i['.pad']] * (max_len - len(x)) for x in batch]
    
    batch = torch.tensor(batch, dtype=torch.long)
    
    return batch

In [10]:
len(x_train_ndfa[-1])

158

In [12]:
len(x_train_ndfa[0])

2

In [13]:
x_train_ndfa_padded = pad_and_convert(x_train_ndfa)
x_train_brackets_padded = pad_and_convert(x_train_brackets)

In [14]:
len(x_train_brackets_padded[-1])

1022

pad_and_convert maakt erg lange sequences, moet 158 + .start + .end niet de langst zijn?

Heb een andere functie maar hij doet nog steeds dezelfde loss (0)

In [15]:
x_train_ndfa_padded2 = pad_and_convert2(x_train_ndfa, w2i_ndfa)
x_train_brackets_padded2 = pad_and_convert2(x_train_brackets, w2i_brackets)

In [16]:
len(x_train_ndfa_padded2[-1])

160

In [17]:
len(x_train_ndfa_padded2[0])


160

In [18]:
x_train_ndfa_padded[:10]

tensor([[4, 4, 0,  ..., 0, 0, 0],
        [4, 4, 0,  ..., 0, 0, 0],
        [4, 4, 0,  ..., 0, 0, 0],
        ...,
        [4, 4, 0,  ..., 0, 0, 0],
        [4, 4, 0,  ..., 0, 0, 0],
        [4, 4, 0,  ..., 0, 0, 0]])

In [19]:
max_len = max([len(x) for x in x_train_ndfa])
max_len

158

In [20]:
x_train = x_train_ndfa_padded[:, :-1]
y_train = x_train_ndfa_padded[:, -1]

In [21]:
print(type(x_train_ndfa))

<class 'list'>


In [22]:
model = LSTM(vocab_size=len(set(w2i_ndfa)), emb_size=300, h=300, num_char=len(set(w2i_ndfa)), n_layers=1)

In [23]:
vocab_size = len(set(w2i_ndfa))
emb_size = 300
h = 300
num_char = vocab_size
n_layers = 1

In [24]:
num_epochs = 1
learning_rate = 0.001

In [25]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [26]:
dataset_ndfa2 = TensorDataset(x_train_ndfa_padded2)
dataloader_ndfa2 = DataLoader(dataset_ndfa2, batch_size=10, shuffle=True)

In [27]:
import torch.distributions as dist
def sample(lnprobs, temperature=1.0): 
    """
    Sample an element from a categorical distribution
    :param lnprobs: Outcome logits
    :param temperature: Sampling temperature. 1.0 follows the given
        distribution, 0.0 returns the maximum probability element. :return: The index of the sampled element.
    """
    if temperature == 0.0:
        return lnprobs.argmax()
    p = F.softmax(lnprobs / temperature, dim=0)
    cd = dist.Categorical(p)
    return cd.sample()

In [28]:
max_length = 50
for epoch in range(num_epochs):
    total_loss = 0.0

    for batch_idx, (inputs,) in enumerate(dataloader_ndfa2):
        print(f'Batch Index: {batch_idx}, Batch Size: {inputs.size(0)}')
        model.train()        
        optimizer.zero_grad()

        # Autoregressive training
        h = None
        for t in range(inputs.size(1) - 1):
            input_seq = inputs[:, :t+1]
            target_word = inputs[:, t+1]

            output, _ = model(input_seq, h)

            loss = criterion(output, target_word)  

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch [{epoch+1}/{num_epochs}], Iteration [{batch_idx+1}/{len(dataloader_ndfa2)}], Loss: {loss.item():.4f}')

        model.eval()
        seed_seq = [w2i_ndfa['.start'], w2i_ndfa['('], w2i_ndfa['('], w2i_ndfa[')']]
        seed_input = torch.tensor([seed_seq], dtype=torch.long)
        with torch.no_grad():
            for t in range(max_length - 1):
                output, _ = model(seed_input, h)
                next_token = sample(output[:, -1, :])
                seed_seq.append(next_token.item())

                if next_token == w2i_ndfa['.end']:
                    break

                seed_input = torch.tensor([[next_token]], dtype=torch.long)
            
            generated_sequence = [''.join(i2w_ndfa[i] for i in seed_seq)]
            print(f'Generated Sequence after epoch {epoch+1}: {generated_sequence}')

    average_loss = total_loss / len(dataloader_ndfa2.dataset)
    print(f'Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}')

torch.save(model.state_dict(), 'lstm_model.pth')

Batch Index: 0, Batch Size: 10
Epoch [1/1], Iteration [1/15000], Loss: 0.0000


KeyError: '('