In [16]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import math
from nltk.tokenize import RegexpTokenizer
import wandb

## Define PTBText Dataset

In [2]:
class PTBText(Dataset):
    """Next-token prediction dataset built from a line-oriented text file.

    Every line of the file is tokenized, wrapped in ``<sos>`` / ``<eos>`` and cut
    into sliding windows of ``seq_len`` tokens; the token that follows a window
    is that window's label. Lines with no more than ``seq_len`` tokens
    (markers included) produce no windows.

    Attributes:
        tokenized_text: one token list per input line, including the markers.
        encoded_text: ``tokenized_text`` as ids, right-padded with 0 to ``max_len``.
        data, labels: raw token windows and the token following each window.
        encoded_data, encoded_labels: ``data`` / ``labels`` as vocabulary ids.
        vocab: token -> id mapping (shared by reference when ``prior_vocab`` is given).
        seq_len: current window length.
        max_len: length of the longest line, counting ``<sos>`` and ``<eos>``.
    """

    def __init__(self, filename, sequence_len, prior_vocab=None) -> None:
        """Read ``filename``, build (or reuse) the vocabulary and create the windows.

        Args:
            filename: path of the text file, one sentence per line.
            sequence_len: number of tokens in each input window.
            prior_vocab: existing token -> id mapping to reuse as-is. When
                ``None`` a new vocabulary is built from this file.
        """
        super().__init__()
        self.tokenized_text = []
        self.data = []
        self.labels = []
        self.tokenizer = RegexpTokenizer(r'<unk>|<pad>|<oov>|<sos>|<eos>|\w+').tokenize
        self.seq_len = sequence_len
        self.max_len = 0

        # A single flag decides both "which vocab to start from" and "whether to
        # extend it", so an empty-but-supplied vocab is treated consistently.
        build_vocab = prior_vocab is None
        if build_vocab:
            self.vocab = {'<pad>':0,'<oov>':1,'<sos>':2,'<eos>':3,'<unk>':4}
        else:
            self.vocab = prior_vocab

        # Explicit encoding so the result does not depend on the platform default.
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                tokens = self.tokenizer(line)

                if build_vocab:
                    for word in tokens:
                        # len() is evaluated before insertion, so an unseen word
                        # receives the next free id; known words are untouched.
                        self.vocab.setdefault(word, len(self.vocab))

                self.tokenized_text.append(['<sos>'] + tokens + ['<eos>'])
                self.max_len = max(self.max_len, len(tokens) + 2)

        self.encoded_text = [self.encode_text(x, pad=True) for x in self.tokenized_text]

        # Window construction is shared with resequence_data instead of duplicated.
        self.resequence_data(sequence_len)

    def encode_text(self, tokens: list[str], pad=False):
        """Map ``tokens`` to vocabulary ids and right-pad with 0.

        Unknown tokens are encoded as 1 (the ``<oov>`` id of the default vocab).

        Args:
            tokens: tokens to encode.
            pad: when true, pad to ``max_len``; otherwise pad to ``seq_len``.

        Returns:
            List of ids; never truncated, only padded up to the target length.
        """
        encoded = [self.vocab.get(word, 1) for word in tokens]

        # Exactly one target length applies. Previously pad=True fell through to
        # the seq_len branch for lines that were already max_len long.
        target_len = self.max_len if pad else self.seq_len
        if len(encoded) < target_len:
            encoded.extend([0] * (target_len - len(encoded)))

        return encoded

    def resequence_data(self, seqence_len):
        """Rebuild all windows and labels for a new window length.

        Args:
            seqence_len: new number of tokens per input window (parameter name
                kept as-is for keyword-argument compatibility).
        """
        self.seq_len = seqence_len
        self.data = []
        self.labels = []
        for tokens in self.tokenized_text:
            for i in range(len(tokens) - self.seq_len):
                self.data.append(tokens[i:i+self.seq_len])
                self.labels.append(tokens[i+self.seq_len])

        self.encoded_labels = [self.vocab.get(x, 1) for x in self.labels]
        self.encoded_data = [self.encode_text(x) for x in self.data]

    def __len__(self):
        """Number of (window, label) pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded window and its encoded label as tensors."""
        return torch.tensor(self.encoded_data[index]), torch.tensor(self.encoded_labels[index])

    def get_tokens(self, index):
        """Return the token list (with markers) of line ``index``."""
        return self.tokenized_text[index]

    def get_encoded_tokens(self, index):
        """Return the padded id list of line ``index``."""
        return self.encoded_text[index]

    def get_sequence(self, index):
        """Return the raw token window and raw label of sample ``index``."""
        return self.data[index], self.labels[index]

    def get_encoded_sequence(self, index):
        """Return the same tensors as ``self[index]``."""
        return self[index]

### Load Data & Create DataLoaders

In [81]:
# Number of preceding tokens the model sees when predicting the next token.
SEQUENCE_LEN = 5

train = PTBText('./data/ptb.train.txt', SEQUENCE_LEN)
# Validation and test reuse the training vocabulary so token ids agree across
# splits; tokens missing from it are encoded with the fallback id 1.
val = PTBText('./data/ptb.valid.txt', SEQUENCE_LEN, prior_vocab=train.vocab)
test = PTBText('./data/ptb.test.txt', SEQUENCE_LEN, prior_vocab=train.vocab)

# Seeded generator so the training shuffle order is reproducible.
gen = torch.Generator().manual_seed(123)
batch_size = 128
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, generator=gen)
# Evaluation loaders keep a fixed order: the reported loss is an average of
# per-batch means, so which samples land in the final partial batch affects the
# number. No shuffling makes the metrics deterministic and leaves the training
# generator's state untouched by evaluation passes.
val_loader = DataLoader(val, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)

dataloaders = {
    'train':train_loader,
    'val':val_loader,
    'test':test_loader
}

print('Training vocab size:', len(train.vocab))
print('Training sample raw: ', train.get_sequence(100))
print('Training sample encoded:',train[100])

Training vocab size: 9648
Training sample raw:  (['is', 'unusually', '<unk>', 'once', 'it'], 'enters')
Training sample encoded: (tensor([45, 91,  4, 68, 84]), tensor(92))


## Define LSTM Model

In [72]:
class ZarembaRNN(nn.Module):
    """Embedding -> stacked LSTM -> linear projection onto the vocabulary.

    Only the LSTM output at the last time step is projected, so the model emits
    one row of vocabulary logits per input sequence.
    """

    def __init__(self, vocab_size, embedding_size, hidden_units=200, num_lstm_layers=2, dropout_rate= 0) -> None:
        """Create the layers.

        Args:
            vocab_size: number of embedding rows and of output logits.
            embedding_size: width of each token embedding.
            hidden_units: LSTM hidden size.
            num_lstm_layers: number of stacked LSTM layers.
            dropout_rate: dropout applied to the last LSTM output before the
                projection; a dropout layer is only created when this is > 0.
        """
        super().__init__()
        # Plain hyper-parameter bookkeeping.
        self.vocab_size = vocab_size
        self.embed_size = embedding_size
        self.hidden_units = hidden_units
        self.dropout_rate = dropout_rate

        # Layers are registered in the order embedding, lstm, fc (then dropout).
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_units, num_lstm_layers, batch_first=True)
        self.fc = nn.Linear(hidden_units, vocab_size)
        if dropout_rate > 0:
            self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return vocabulary logits for the token following each sequence in ``x``.

        ``x`` holds token ids laid out batch-first (the LSTM is built with
        ``batch_first=True``); the result has one row of logits per sequence.
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)

        # Keep only the final time step of every sequence.
        last_step = lstm_out[:, -1, :]
        if self.dropout_rate > 0:
            last_step = self.dropout(last_step)

        return self.fc(last_step)
            
    

In [93]:
def train_epoch(network, dataloader, loss_fn, optimizer, device, epoch, verbosity: int):
    """Train ``network`` for one pass over ``dataloader``.

    Args:
        network: model to optimise; switched to training mode.
        dataloader: iterable yielding ``(inputs, labels)`` batches.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        optimizer: optimizer holding ``network``'s parameters.
        device: device each batch is moved to before the forward pass. ``None``
            leaves batches where they are. The network itself is not moved and
            is expected to already be on this device.
        epoch: index of the current epoch (not used inside this function).
        verbosity: print the running mean loss every ``verbosity`` batches;
            values below 1 disable progress printing.

    Returns:
        Mean loss over all batches of the epoch, or NaN if ``dataloader``
        yielded no batches.
    """
    network.train()
    window_loss = 0.0   # loss accumulated since the last progress print
    epoch_loss = 0.0    # loss accumulated over the whole epoch
    num_batches = 0

    for i, (inputs, labels) in enumerate(dataloader):
        # Honour the device argument (it was previously accepted but ignored).
        if device is not None:
            inputs = inputs.to(device)
            labels = labels.to(device)

        optimizer.zero_grad()
        outputs = network(inputs)

        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        epoch_loss += loss_value
        window_loss += loss_value
        num_batches += 1

        # The verbosity guard avoids a modulo-by-zero when printing is disabled.
        if verbosity > 0 and i % verbosity == verbosity - 1:
            print(f'Batch #{i + 1} Loss: {window_loss / verbosity}')
            window_loss = 0.0

    return epoch_loss / num_batches if num_batches else float('nan')
    
def perplexity(loss, batches):
    """Return ``exp(loss / batches)``: perplexity from a summed per-batch loss.

    Args:
        loss: sum of the per-batch mean losses.
        batches: number of batches that were summed.

    Returns:
        The perplexity as a float, or ``math.inf`` when the exponential is too
        large for a float (a diverged model), so the caller can keep logging
        instead of crashing.

    Raises:
        ZeroDivisionError: if ``batches`` is 0.
    """
    try:
        return math.exp(loss / batches)
    except OverflowError:
        # math.exp raises once the argument exceeds roughly 709.
        return math.inf

def eval_network(title, network, dataloader, loss_fn, epoch):
    """Evaluate ``network`` on ``dataloader``, log the metrics to wandb and print them.

    Args:
        title: prefix for the logged metric names (``{title}-loss`` and
            ``{title}-perplexity``) and for the printed line.
        network: model to evaluate; switched to evaluation mode.
        dataloader: sized iterable yielding ``(inputs, labels)`` batches.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        epoch: value passed to wandb as the logging step.

    Returns:
        The perplexity, i.e. the exponential of the mean per-batch loss.

    Raises:
        ZeroDivisionError: if ``dataloader`` has no batches.
    """
    network.eval()
    num_batches = len(dataloader)
    total_loss = 0.0

    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = network(inputs)
            # .item() keeps the accumulator a plain float, so wandb receives
            # numbers rather than 0-dim tensors.
            total_loss += loss_fn(outputs, labels).item()

    mean_loss = total_loss / num_batches
    perp = perplexity(total_loss, num_batches)
    wandb.log({
        f'{title}-loss': mean_loss,
        f'{title}-perplexity': perp
    }, step=epoch)

    print(f'\033[92m{title} perplexity: {perp:.6f} ||| loss {mean_loss:.6f}\033[0m')
    return perp

def train_network(network, dataloaders, loss_fn, optimizer, schedule, device, epochs: int, verbosity: int):
    """Train for ``epochs`` epochs, evaluating on every split after each one.

    Args:
        network: model to train.
        dataloaders: mapping with the keys ``'train'``, ``'val'`` and ``'test'``.
        loss_fn: loss callable shared by training and evaluation.
        optimizer: optimizer holding ``network``'s parameters.
        schedule: learning-rate scheduler, stepped once at the end of each epoch.
        device: forwarded to ``train_epoch``.
        epochs: number of epochs to run.
        verbosity: forwarded to ``train_epoch`` (batches between progress prints).

    Returns:
        Dict with the keys ``'train'``, ``'val'`` and ``'test'`` holding the
        perplexities of the last epoch; each value is ``None`` when ``epochs``
        is 0.
    """
    # Pre-initialised so that epochs == 0 returns cleanly instead of raising
    # UnboundLocalError at the return statement.
    train_perplexity = val_perplexity = test_perplexity = None

    for epoch in range(epochs):
        # Read into a local first: reusing the f-string's own quote character
        # inside a replacement field is a SyntaxError before Python 3.12.
        current_lr = optimizer.param_groups[0]['lr']
        print(f'----------- Epoch #{epoch + 1}, LR: {current_lr} ------------')
        train_epoch(network, dataloaders['train'], loss_fn, optimizer, device, epoch, verbosity)
        train_perplexity = eval_network('Train', network, dataloaders['train'], loss_fn, epoch)
        val_perplexity = eval_network('Validation', network, dataloaders['val'], loss_fn, epoch)
        test_perplexity = eval_network('Test', network, dataloaders['test'], loss_fn, epoch)
        print('------------------------------------\n')

        schedule.step()
    print('----------- Train Complete! ------------')
    return {
        'train':train_perplexity,
        'val':val_perplexity,
        'test':test_perplexity
    }
        

In [97]:
def lr_lambda(epoch, base_factor=0.1, hold_epochs=4, decay=0.5):
    """Learning-rate multiplier: constant at first, then geometric decay.

    The multiplier is ``base_factor`` for the first ``hold_epochs`` epochs and
    is then multiplied by ``decay`` once per epoch. The decay starts from
    ``base_factor``; previously it restarted from 1.0, which made the factor
    jump from 0.1 up to 0.5 at epoch 4 before it began to fall.

    Args:
        epoch: zero-based epoch index (the only argument a scheduler passes).
        base_factor: multiplier used during the hold phase.
        hold_epochs: number of initial epochs with a constant multiplier.
        decay: per-epoch factor applied after the hold phase.

    Returns:
        The multiplier for ``epoch``.
    """
    if epoch < hold_epochs:
        return base_factor
    return base_factor * decay ** (epoch - hold_epochs + 1)

# Training runs on the CPU.
device = torch.device('cpu')

# Language model over the training vocabulary with 10-dimensional embeddings;
# all other hyper-parameters keep ZarembaRNN's defaults.
model = ZarembaRNN(vocab_size=len(train.vocab), embedding_size=10)
cross_entropy = nn.CrossEntropyLoss()

# The optimizer's base rate is 1 and LambdaLR multiplies it by lr_lambda(epoch),
# so lr_lambda's return value is the effective learning rate.
adam = optim.Adam(params=model.parameters(), lr=1)
schedule = optim.lr_scheduler.LambdaLR(optimizer=adam, lr_lambda=lr_lambda)

In [96]:
# Run length and progress-print interval (in batches) for this experiment.
EPOCHS = 13
LOG_EVERY_N_BATCHES = 1000

run = wandb.init(project="dl-assignment2")
try:
    results = train_network(
        model, dataloaders, cross_entropy, adam, schedule, device,
        epochs=EPOCHS, verbosity=LOG_EVERY_N_BATCHES,
    )
finally:
    # Close the wandb run even when training raises or the cell is interrupted,
    # so no run is left open.
    run.finish()

0,1
Test-loss,▁▇█
Train-loss,▁██
Validation-loss,▁▇█

0,1
Test-loss,257.97849
Test-perplexity,1.0930311322468985e+112
Train-loss,256.31937
Train-perplexity,2.080110740121543e+111
Validation-loss,263.00842
Validation-perplexity,1.671501376252282e+114


----------- Epoch #1, LR: 0.1 ------------
Batch #1000 Loss: 8.146729045391083
Batch #2000 Loss: 6.6877510733604435
Batch #3000 Loss: 6.6735848937034605
Batch #4000 Loss: 6.673708264350891
Batch #5000 Loss: 6.655115474700928
[92mTrain perplexity: 709.837322 ||| loss 6.565036[0m
[92mValidation perplexity: 724.363467 ||| loss 6.585293[0m
[92mTest perplexity: 672.669305 ||| loss 6.511254[0m
------------------------------------

----------- Epoch #2, LR: 0.1 ------------
Batch #1000 Loss: 6.576430155754089
Batch #2000 Loss: 6.560266708374024


In [47]:
# Scratch check: reducing a (32, 5, 9648) tensor over dim=1 yields a
# (values, indices) pair, each of shape (32, 9648).
torch.rand([32, 5, 9648]).max(dim=1)

torch.return_types.max(
values=tensor([[0.7332, 0.9473, 0.8291,  ..., 0.6799, 0.8324, 0.8936],
        [0.9341, 0.6996, 0.9489,  ..., 0.7854, 0.8351, 0.8760],
        [0.8873, 0.9887, 0.8474,  ..., 0.7111, 0.5906, 0.6415],
        ...,
        [0.7961, 0.7227, 0.9180,  ..., 0.9373, 0.6811, 0.9479],
        [0.7929, 0.6888, 0.9234,  ..., 0.8842, 0.8649, 0.9375],
        [0.8669, 0.9894, 0.7400,  ..., 0.9723, 0.9387, 0.8379]]),
indices=tensor([[1, 4, 0,  ..., 2, 1, 3],
        [4, 1, 4,  ..., 0, 0, 2],
        [4, 1, 2,  ..., 2, 2, 1],
        ...,
        [4, 3, 2,  ..., 4, 1, 1],
        [1, 0, 2,  ..., 1, 1, 0],
        [3, 1, 4,  ..., 4, 3, 1]]))

In [77]:
# Scratch check of the ' .6' format spec: space sign, six significant digits.
'test {: .6}'.format(.325434365478)

'test  0.325434'

In [90]:
# Sanity check on a single training batch: print the shape of the model output
# and of the labels, and confirm the loss can be computed on them.
for i, data in enumerate(train_loader):
    inputs, labels = data

    outputs = model(inputs)
    print(outputs.shape)
    # Computed only to verify that outputs and labels are compatible with the
    # loss; the value is not used afterwards in this cell.
    loss = cross_entropy(outputs, labels)
    print(labels.shape)
    # Only the first batch is inspected.
    break

torch.Size([128, 9648])
torch.Size([128])


In [21]:
# Scratch check: softmax over a 1-D tensor along dim 0 (entries sum to 1).
torch.softmax(torch.tensor([.45, .436, .85, .76, .31]), dim=0)

tensor([0.1751, 0.1727, 0.2612, 0.2387, 0.1522])

In [26]:
# Scratch check: a max over a dimension unpacks into (max value, its index).
a, b = torch.tensor([1, 2, 3, 4, 5]).max(dim=0)
print(a, b, sep='\n')

tensor(5)
tensor(4)


In [27]:
# Scratch check: display the (values, indices) result of a max over dim 0.
torch.tensor([1, 2, 3, 4, 5]).max(dim=0)

torch.return_types.max(
values=tensor(5),
indices=tensor(4))