<a href="https://colab.research.google.com/github/mitkrieg/dl-assignment-2/blob/main/assignment2_practical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [34]:
!pip install wandb
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [41]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import math
from nltk.tokenize import RegexpTokenizer
import wandb

# Report which accelerators PyTorch can see, then choose the device used by
# the rest of the notebook.
print("------ ACCELERATION INFO -----")
print('CUDA GPU Available:', torch.cuda.is_available())
print('MPS GPU Available:', torch.backends.mps.is_available())

# Only CUDA and CPU are ever selected: MPS availability is reported above but
# no MPS branch is active in this notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print('Using CPU')
else:
    # Details for the first visible GPU (index 0).
    print('GPU Name:', torch.cuda.get_device_name(0))
    print('GPU Count:', torch.cuda.device_count())
    print('GPU Memory Allocated:', torch.cuda.memory_allocated(0))
    print('GPU Memory Cached:', torch.cuda.memory_reserved(0))

------ ACCELERATION INFO -----
CUDA GPU Available: True
MPS GPU Available: False
GPU Name: Tesla T4
GPU Count: 1
GPU Memory Allocated: 74159616
GPU Memory Cached: 236978176


## Define PTBText Dataset

In [36]:
class PTBText(Dataset):
    """Next-token prediction dataset over a line-oriented text file.

    Every line of the file is tokenised, wrapped in ``<sos>`` / ``<eos>`` and
    cut into sliding windows of ``sequence_len`` tokens. Each window is one
    sample; its label is the token that immediately follows the window. Lines
    with ``sequence_len`` tokens or fewer (markers included) yield no samples.

    Args:
        filename: Path of the text file to read, one sentence per line.
        sequence_len: Number of context tokens in each sample.
        prior_vocab: Existing token -> index mapping to reuse (stored by
            reference, not copied). When ``None`` or empty, a vocabulary is
            built from this file on top of the five reserved tokens.
        device: Device that tensors returned by ``__getitem__`` are moved to.
    """

    # Indices of the reserved padding / out-of-vocabulary tokens in the
    # default vocabulary created below.
    PAD_IDX = 0
    OOV_IDX = 1

    def __init__(self, filename, sequence_len, prior_vocab=None,device=torch.device('cpu')) -> None:
        super().__init__()
        self.tokenized_text = []
        self.data = []
        self.labels = []
        self.device = device
        self.tokenizer = RegexpTokenizer(r'<unk>|<pad>|<oov>|<sos>|<eos>|\w+').tokenize
        self.seq_len = sequence_len
        self.max_len = 0

        # One flag drives both "which vocab do we start from" and "do we grow
        # it", so an empty prior vocabulary cannot leave us with a frozen,
        # specials-only mapping.
        build_vocab = not prior_vocab
        if build_vocab:
            self.vocab = {'<pad>':0,'<oov>':1,'<sos>':2,'<eos>':3,'<unk>':4}
        else:
            self.vocab = prior_vocab

        with open(filename, 'r') as f:
            for line in f:
                tokens = self.tokenizer(line)

                if build_vocab:
                    for word in tokens:
                        if word not in self.vocab:
                            # Next free index == current vocabulary size.
                            self.vocab[word] = len(self.vocab)

                self.tokenized_text.append(['<sos>'] + tokens + ['<eos>'])
                # +2 accounts for the <sos>/<eos> markers.
                self.max_len = max(self.max_len, len(tokens) + 2)

        # Whole sentences, right-padded to the longest sentence in the file.
        self.encoded_text = [self.encode_text(x, pad=True) for x in self.tokenized_text]

        self._build_sequences()

    def _build_sequences(self):
        """Rebuild ``data``/``labels`` (and their encodings) for the current ``seq_len``."""
        self.data = []
        self.labels = []
        for tokens in self.tokenized_text:
            for i in range(len(tokens) - self.seq_len):
                self.data.append(tokens[i:i+self.seq_len])
                self.labels.append(tokens[i+self.seq_len])

        self.encoded_labels = [self.vocab.get(x, self.OOV_IDX) for x in self.labels]
        self.encoded_data = [self.encode_text(x) for x in self.data]

    def encode_text(self, tokens: list[str], pad=False):
        """Map tokens to vocabulary indices, right-padding with ``PAD_IDX``.

        Args:
            tokens: Tokens to encode; unknown tokens map to ``OOV_IDX``.
            pad: When true, pad up to ``max_len`` (longest sentence); otherwise
                pad up to ``seq_len``. Inputs already at or beyond the target
                length are returned unpadded and untruncated.

        Returns:
            A new list of integer indices.
        """
        encoded = [self.vocab.get(word, self.OOV_IDX) for word in tokens]

        # Exactly one target length applies; previously pad=True could fall
        # through to the seq_len branch and over-pad when seq_len > max_len.
        target_len = self.max_len if pad else self.seq_len
        if len(encoded) < target_len:
            encoded.extend([self.PAD_IDX] * (target_len - len(encoded)))

        return encoded

    def resequence_data(self, seqence_len):
        """Change the context-window length and regenerate all samples."""
        self.seq_len = seqence_len
        self._build_sequences()

    def __len__(self):
        """Number of (window, label) samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(encoded window, encoded label)`` as tensors on ``self.device``."""
        window = torch.tensor(self.encoded_data[index]).to(self.device)
        label = torch.tensor(self.encoded_labels[index]).to(self.device)
        return window, label

    def get_tokens(self, index):
        """Tokens of sentence ``index``, including ``<sos>``/``<eos>``."""
        return self.tokenized_text[index]

    def get_encoded_tokens(self, index):
        """Padded index encoding of sentence ``index``."""
        return self.encoded_text[index]

    def get_sequence(self, index):
        """Raw (untokenised-to-index) sample: ``(window tokens, label token)``."""
        return self.data[index], self.labels[index]

    def get_encoded_sequence(self, index):
        """Alias for ``self[index]``."""
        return self.__getitem__(index)

### Load Data & Create Data loaders

In [38]:
# Where the PTB text files live (the paths used here are under /content;
# change DATA_DIR when running elsewhere).
DATA_DIR = '/content'
# Number of context tokens fed to the model for each next-word prediction.
SEQ_LEN = 5

# The validation and test sets reuse the training vocabulary so that indices
# agree across splits; words unseen in training map to <oov>.
train = PTBText(f'{DATA_DIR}/ptb.train.txt', SEQ_LEN)
val = PTBText(f'{DATA_DIR}/ptb.valid.txt', SEQ_LEN, prior_vocab=train.vocab)
test = PTBText(f'{DATA_DIR}/ptb.test.txt', SEQ_LEN, prior_vocab=train.vocab)

gen = torch.Generator().manual_seed(123)
batch_size = 128
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, generator=gen)
# Evaluation does not benefit from shuffling; a fixed order keeps the reported
# metrics deterministic and stops evaluation passes from consuming the
# generator state that drives the training shuffle.
val_loader = DataLoader(val, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)

dataloaders = {
    'train':train_loader,
    'val':val_loader,
    'test':test_loader
}

print('Training vocab size:', len(train.vocab))
print('Training sample raw: ', train.get_sequence(100))
print('Training sample encoded:',train[100])

Training vocab size: 9648
Training sample raw:  (['is', 'unusually', '<unk>', 'once', 'it'], 'enters')
Training sample encoded: (tensor([45, 91,  4, 68, 84]), tensor(92))


## Define LSTM Model

In [39]:
class ZarembaRNN(nn.Module):
    """Embedding -> stacked LSTM -> linear layer producing next-token logits.

    Only the LSTM output at the last time step is decoded, so one forward pass
    yields one row of vocabulary logits per input sequence.

    Args:
        vocab_size: Number of distinct token indices (embedding rows and
            output logits).
        embedding_size: Width of each token embedding.
        hidden_units: Hidden size of every LSTM layer.
        num_lstm_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability applied to the last-step LSTM output
            before the linear layer; no dropout module is created when it is
            not positive.
    """

    def __init__(self, vocab_size, embedding_size, hidden_units=200, num_lstm_layers=2, dropout_rate= 0) -> None:
        super().__init__()

        # Hyper-parameters kept on the instance for later inspection.
        self.vocab_size = vocab_size
        self.embed_size = embedding_size
        self.hidden_units = hidden_units
        self.dropout_rate = dropout_rate

        # Layers are created in a fixed order (embedding, LSTM, linear) so a
        # seeded initialisation stays reproducible.
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_units, num_lstm_layers, batch_first=True)
        self.fc = nn.Linear(hidden_units, vocab_size)
        if dropout_rate > 0:
            self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Return logits over the vocabulary for the token following each sequence.

        ``x`` holds token indices with the batch dimension first (the LSTM is
        built with ``batch_first=True``).
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)

        # Keep only the final time step of every sequence in the batch.
        last_step = lstm_out[:, -1, :]
        if self.dropout_rate > 0:
            last_step = self.dropout(last_step)

        return self.fc(last_step)



In [42]:
def train_epoch(network, dataloader, loss_fn, optimizer, device, epoch, verbosity: int):
    """Train ``network`` for one full pass over ``dataloader``.

    Args:
        network: Model to optimise; switched to training mode.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        optimizer: Optimiser updating ``network``'s parameters.
        device: Device each batch is moved to before the forward pass.
        epoch: Current epoch index. Not used inside this function; kept so the
            signature stays stable for existing callers.
        verbosity: Print the mean loss every ``verbosity`` batches. A value of
            ``0`` (or any non-positive value / ``None``) disables progress
            printing instead of raising ``ZeroDivisionError``.
    """

    network.train()
    log_progress = verbosity is not None and verbosity > 0
    running_loss = 0.0

    # iterate over all batches
    for i, (inputs, labels) in enumerate(dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = network(inputs)

        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        if not log_progress:
            continue

        running_loss += loss.item()
        # Report (and reset) the running mean once per `verbosity` batches.
        if (i + 1) % verbosity == 0:
            print(f'Batch #{i + 1} Loss: {running_loss / verbosity}')
            running_loss = 0.0

def perplexity(loss, batches):
    """Return ``exp(loss / batches)``: the perplexity for a summed loss.

    Args:
        loss: Loss accumulated over ``batches`` batches.
        batches: Number of batches the loss was accumulated over.
    """
    mean_loss = loss / batches
    return math.exp(mean_loss)

def eval_network(title, network, dataloader, loss_fn, epoch, device=None):
    """Evaluate model and log metrics to wandb.

    Args:
        title: Prefix for the logged metric names and the printed summary.
        network: Model to evaluate; switched to evaluation mode.
        dataloader: Sized iterable yielding ``(inputs, labels)`` batches.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        epoch: Step index passed to ``wandb.log``.
        device: Device each batch is moved to. Defaults to the device of the
            network's first parameter (CPU for a parameter-free network), so
            the function no longer depends on a notebook-global ``device``.

    Returns:
        The perplexity computed from the per-batch losses.

    Raises:
        ValueError: If ``dataloader`` contains no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError('eval_network requires a dataloader with at least one batch')

    if device is None:
        first_param = next(network.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    network.eval()
    total_loss = 0.0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            # .item() keeps the accumulator a plain float, so wandb is handed
            # numbers rather than tensors.
            total_loss += loss_fn(network(inputs), labels).item()

    # NOTE: this averages one loss value per batch; if loss_fn returns a
    # per-batch mean, a smaller final batch is weighted like a full one.
    mean_loss = total_loss / num_batches
    perp = perplexity(total_loss, num_batches)
    wandb.log({
        f'{title}-loss': mean_loss,
        f'{title}-perplexity': perp
    }, step=epoch)

    print(f'\033[92m{title} perplexity: {perp:.6f} ||| loss {mean_loss:.6f}\033[0m')
    return perp

def train_network(network, dataloaders, loss_fn, optimizer, schedule, device, epochs: int, verbosity: int):
    """Train for ``epochs`` epochs, evaluating every split after each epoch.

    Args:
        network: Model to train.
        dataloaders: Mapping with ``'train'``, ``'val'`` and ``'test'`` loaders.
        loss_fn: Loss callable forwarded to training and evaluation.
        optimizer: Optimiser; its first param group's ``lr`` is printed each epoch.
        schedule: Learning-rate scheduler, stepped once at the end of each epoch.
        device: Device used for the training batches.
        epochs: Number of epochs to run.
        verbosity: Progress-print interval (in batches) forwarded to ``train_epoch``.

    Returns:
        Dict with the last epoch's ``'train'``, ``'val'`` and ``'test'``
        perplexities; every value is ``None`` when ``epochs`` is 0.
    """
    # Initialised up front so a zero-epoch call returns a well-formed result
    # instead of raising UnboundLocalError at the return statement.
    train_perplexity = val_perplexity = test_perplexity = None

    for epoch in range(epochs):
        lr = optimizer.param_groups[0]['lr']
        print(f'----------- Epoch #{epoch + 1}, LR: {lr} ------------')
        train_epoch(network, dataloaders['train'], loss_fn, optimizer, device, epoch, verbosity)
        train_perplexity = eval_network('Train', network, dataloaders['train'], loss_fn, epoch)
        val_perplexity = eval_network('Validation', network, dataloaders['val'], loss_fn, epoch)
        test_perplexity = eval_network('Test', network, dataloaders['test'], loss_fn, epoch)
        print('------------------------------------\n')

        schedule.step()
    print('----------- Train Complete! ------------')
    return {
        'train':train_perplexity,
        'val':val_perplexity,
        'test':test_perplexity
    }


In [49]:
def lr_lambda(epoch):
    """Learning-rate multiplier for a given epoch index.

    Returns 1 while ``epoch`` is below 7, then ``0.5 ** (epoch - 6)``: the
    multiplier halves with every further epoch (0.5 at epoch 7, 0.25 at 8, ...).
    """
    return 1 if epoch < 7 else 0.5 ** (epoch - 6)

# Assemble everything the training run needs: the model (moved to the selected
# device), the loss, the optimiser and the per-epoch learning-rate schedule.
model = ZarembaRNN(vocab_size=len(train.vocab), embedding_size=10).to(device)
cross_entropy = nn.CrossEntropyLoss()
adam = optim.Adam(model.parameters(), lr=1e-2)
schedule = optim.lr_scheduler.LambdaLR(adam, lr_lambda=lr_lambda)

In [50]:
# Use the W&B run as a context manager so it is always closed, even when
# training raises or the cell is interrupted; otherwise the run stays open and
# leaks into the next wandb.init() call in this kernel.
with wandb.init(project="dl-assignment2") as run:
    results = train_network(
        model, dataloaders, cross_entropy, adam, schedule, device,
        epochs=14, verbosity=1000,
    )

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Test-loss,█▅▄▃▂▂▂▁▁▁▁
Test-perplexity,█▅▃▂▂▂▁▁▁▁▁
Train-loss,█▆▅▄▃▂▂▁▁▁▁
Train-perplexity,█▅▄▃▂▂▂▁▁▁▁
Validation-loss,█▅▃▂▂▂▂▁▁▁▁
Validation-perplexity,█▄▃▂▂▁▂▁▁▁▁

0,1
Test-loss,5.2165
Test-perplexity,184.28748
Train-loss,4.77569
Train-perplexity,118.5919
Validation-loss,5.34356
Validation-perplexity,209.25638


----------- Epoch #1, LR: 0.01 ------------
Batch #1000 Loss: 6.403616511821747
Batch #2000 Loss: 5.946282475471497
Batch #3000 Loss: 5.719841088294983
Batch #4000 Loss: 5.598230762481689
Batch #5000 Loss: 5.515763603687287
[92mTrain perplexity: 195.687745 ||| loss 5.276520[0m
[92mValidation perplexity: 228.820556 ||| loss 5.432938[0m
[92mTest perplexity: 215.637667 ||| loss 5.373600[0m
------------------------------------

----------- Epoch #2, LR: 0.01 ------------
Batch #1000 Loss: 5.2413937554359435
Batch #2000 Loss: 5.222617111682892
Batch #3000 Loss: 5.204514781475067
Batch #4000 Loss: 5.176063044071197
Batch #5000 Loss: 5.147251231193542
[92mTrain perplexity: 140.384227 ||| loss 4.944383[0m
[92mValidation perplexity: 190.958044 ||| loss 5.252054[0m
[92mTest perplexity: 178.278382 ||| loss 5.183346[0m
------------------------------------

----------- Epoch #3, LR: 0.01 ------------
Batch #1000 Loss: 4.9371884202957155
Batch #2000 Loss: 4.933050584793091
Batch #3000 Lo

VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Test-loss,▆▂▁▁▁▂▃▃▆▆▇███
Test-perplexity,▆▂▁▁▁▂▃▃▅▆▇███
Train-loss,█▆▅▄▄▃▃▂▂▁▁▁▁▁
Train-perplexity,█▅▄▃▃▃▂▂▁▁▁▁▁▁
Validation-loss,▅▁▁▁▂▂▃▄▆▆▇███
Validation-perplexity,▅▁▁▁▂▂▃▃▅▆▇███

0,1
Test-loss,5.43118
Test-perplexity,228.41797
Train-loss,3.98669
Train-perplexity,53.87611
Validation-loss,5.57927
Validation-perplexity,264.87726
