<a href="https://colab.research.google.com/github/mitkrieg/dl-assignment-2/blob/main/assignment2_practical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wandb
!wandb login

Collecting wandb
  Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.14.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.18.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_

In [2]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torch import optim
import torch.nn.functional as F
import math
# import wandb

print("------ ACCELERATION INFO -----")
print('CUDA GPU Available:',torch.cuda.is_available())
print('MPS GPU Available:', torch.backends.mps.is_available())
if torch.cuda.is_available():
  device = torch.device('cuda')
  print('GPU Name:',torch.cuda.get_device_name(0))
  print('GPU Count:',torch.cuda.device_count())
  print('GPU Memory Allocated:',torch.cuda.memory_allocated(0))
  print('GPU Memory Cached:',torch.cuda.memory_reserved(0))
# elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
#   device = torch.device('mps')
#   print('Pytorch GPU Build:',torch.backends.mps.is_built())
else:
  device = torch.device('cpu')
  print('Using CPU')

------ ACCELERATION INFO -----
CUDA GPU Available: True
MPS GPU Available: False
GPU Name: Tesla T4
GPU Count: 1
GPU Memory Allocated: 0
GPU Memory Cached: 0


In [3]:
class Vocab:
    def __init__(self, pre_built_dict: dict=None):
        if pre_built_dict:
            self.vocab = pre_built_dict
        else:
            self.vocab = {'<pad>': 0, '<oov>': 1, '<sos>': 2, '<eos>': 3, '<unk>': 4}
        self.idx = len(self.vocab)

    def add_word(self, word: str) -> None:
        if word not in self.vocab:
            self.vocab[word] = self.idx
            self.idx += 1

    def encode(self, tokens: list[str]) -> list[int]:
        return [self.vocab.get(word, self.vocab['<unk>']) for word in tokens]

    def decode(self, indicies: list[int]) -> list[str]:
        return [list(self.vocab.keys())[list(self.vocab.values()).index(idx)] for idx in indicies]

    def __len__(self):
        return len(self.vocab)


class PTBText(Dataset):
    def __init__(self, path: str, vocab: Vocab=Vocab(), build_vocab=True, batch_size=20, seqence_length=20, device=torch.device('cpu')):
        self.path = path
        self.device = device
        self.vocab = vocab
        self.data = self.load_data(build_vocab)
        self.batch_size = batch_size
        self.chunk_size = len(self.data) // batch_size
        self.seq_len = seqence_length
        self.minibatches = self.create_batches()

    def load_data(self, build_vocab):
        data = []
        with open(self.path, 'r') as f:
            count = 0
            for line in f:
                count += 1
                tokens = line.strip().split() + ['<eos>']
                if build_vocab:
                    for token in tokens:
                        self.vocab.add_word(token)

                encoded_line = self.vocab.encode(tokens)
                data.extend(encoded_line)
        return data

    def create_batches(self):
        return [self.data[i*self.chunk_size: (i+1)*self.chunk_size] for i in range(self.batch_size)]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, j):
        inputs = torch.stack([
            torch.LongTensor(self.minibatches[i][j * self.seq_len : (j + 1) * self.seq_len])
            for i in range(self.batch_size)], dim=0)
        labels = torch.stack([
            torch.LongTensor(self.minibatches[i][j * self.seq_len + 1 : (j + 1) * self.seq_len + 1])
            for i in range(self.batch_size)], dim=0)

        return inputs.to(self.device), labels.to(self.device)

    def get_tokens(self, idx):
        return self.data[idx]

    def get_decoded_tokens(self, idx):
        return self.vocab.decode(self.data[idx])


train = PTBText('/content/ptb.train.txt', device=device)
val = PTBText('/content/ptb.valid.txt', vocab=train.vocab, build_vocab=False, device=device)
test = PTBText('/content/ptb.test.txt', vocab=train.vocab, build_vocab=False, device=device)

datasets = {
    'train': train,
    'val': val,
    'test': test
}

print("Vocab size:", len(train.vocab))
print("Train data size:", len(train))
print("Val data size:", len(val))
print("Test data size:", len(test))

Vocab size: 10003
Train data size: 929589
Val data size: 73760
Test data size: 82430


In [4]:
class ZamrembaRNN(nn.Module):
    def __init__(self, rnn_type, vocab_size, batch_size=20, embedding_dim=200, hidden_dim=200, num_layers=2, dropout=0):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn_type = rnn_type
        self.batch_size = batch_size
        if rnn_type == 'lstm':
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        elif rnn_type == 'gru':
            self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        else:
            raise ValueError("Invalid RNN type: must be 'lstm' or 'gru'")
        self.fc = nn.Linear(hidden_dim, vocab_size)
        if dropout > 0:
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None

        self.init_weights()

    def forward(self, input, hidden):
        output = self.embedding(input)
        if self.dropout is not None:
            output = self.dropout(output)
        output, hidden = self.rnn(output, hidden)
        if self.dropout is not None:
            output = self.dropout(output)
        output = self.fc(output)
        output = F.relu(output)
        return output, hidden

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        nn.init.uniform_(self.rnn.weight_ih_l0, -initrange, initrange)
        nn.init.uniform_(self.rnn.weight_hh_l0, -initrange, initrange)
        nn.init.uniform_(self.fc.weight, -initrange, initrange)

In [28]:
def train_epoch(model, dataset, loss_fn, optimizer, device, epoch, verbosity):
    """Train one epoch of a network"""
    model.train()
    batch_loss = 0

    hidden = (torch.zeros(model.num_layers, model.batch_size, model.hidden_dim).to(device),
              torch.zeros(model.num_layers, model.batch_size, model.hidden_dim).to(device))

    for j in range(dataset.chunk_size // dataset.seq_len):

        inputs, labels = dataset[j]

        optimizer.zero_grad()
        hidden = (hidden[0].detach(), hidden[1].detach())
        outputs, hidden = model(inputs, hidden)
        loss = loss_fn(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
        loss.backward()
        optimizer.step()

        batch_loss += loss.item()
        if (j + 1) % verbosity == 0:
            print(f'Batch #{j + 1} Loss: {batch_loss / verbosity}')
            batch_loss = 0

def perplexity(loss, batches):
    return math.exp(loss / batches)

def evaluate_model(title, model, dataset, loss_fn, seq_len, batch_size, epoch):
    model.eval()
    total_loss = 0
    num_batches = len(dataset) // (batch_size * seq_len)

    hidden = (torch.zeros(model.num_layers, model.batch_size, model.hidden_dim).to(device),
              torch.zeros(model.num_layers, model.batch_size, model.hidden_dim).to(device))

    with torch.no_grad():
        for j in range(num_batches):

            inputs, labels = dataset[j]

            outputs, hidden = model(inputs, hidden)
            loss = loss_fn(outputs.view(-1, outputs.shape[-1]), labels.view(-1))
            total_loss += loss.item()

    perp = perplexity(total_loss, num_batches)
    # wandb.log({
    #         f'{title}-loss': total_loss / num_batches,
    #         f'{title}-perplexity': perp
    #     }, step=epoch)

    print(f'\033[92m{title} perplexity: {perp:.6f} ||| loss {total_loss / num_batches:.6f}\033[0m')

    return perp

def train_network(model, datasets, loss_fn, optimizer, schedule, device, epochs: int, verbosity: int):
    for epoch in range(epochs):
        lr = optimizer.param_groups[0]['lr']

        print(f'----------- Epoch #{epoch + 1}, LR: {lr} ------------')
        train_epoch(model, datasets['train'], loss_fn, optimizer, device, epoch, verbosity)
        train_perplexity = evaluate_model('Train', model, datasets['train'], loss_fn, datasets['train'].seq_len, datasets['train'].batch_size, epoch)
        val_perplexity = evaluate_model('Validation', model, datasets['val'], loss_fn, datasets['train'].seq_len, datasets['train'].batch_size, epoch)
        test_perplexity = evaluate_model('Test', model, datasets['test'], loss_fn, datasets['train'].seq_len, datasets['train'].batch_size, epoch)
        print('------------------------------------\n')

        schedule.step()
    print('----------- Train Complete! ------------')
    return {
        'train':train_perplexity,
        'val':val_perplexity,
        'test':test_perplexity
    }

In [31]:
decay_start = 4
learning_rate_decay = 0.5
momentum=0
lr = 4
dropout_rate = 0

def lr_lambda(epoch):
    if epoch < decay_start:
        return 1
    else:
        return learning_rate_decay ** (epoch - (decay_start-1))

model = ZamrembaRNN('lstm', len(train.vocab)).to(device)
sgd = optim.SGD(model.parameters(), lr=lr)
cross_entropy = nn.CrossEntropyLoss()
schedule = optim.lr_scheduler.LambdaLR(sgd, lr_lambda)

final_metrics = train_network(model, datasets, cross_entropy, sgd, schedule, device, 14, 500)

----------- Epoch #1, LR: 4 ------------
Batch #500 Loss: 7.8949539031982425
Batch #1000 Loss: 7.33743768119812
Batch #1500 Loss: 6.342725848197937
Batch #2000 Loss: 6.069165286064148
[92mTrain perplexity: 356.934628 ||| loss 5.877553[0m
[92mValidation perplexity: 355.698837 ||| loss 5.874084[0m
[92mTest perplexity: 355.935089 ||| loss 5.874748[0m
------------------------------------

----------- Epoch #2, LR: 4 ------------
Batch #500 Loss: 5.798190748214721
Batch #1000 Loss: 5.693178926467896
Batch #1500 Loss: 5.584246485710144
Batch #2000 Loss: 5.454984982490539
[92mTrain perplexity: 189.530829 ||| loss 5.244552[0m
[92mValidation perplexity: 206.213264 ||| loss 5.328911[0m
[92mTest perplexity: 201.679779 ||| loss 5.306681[0m
------------------------------------

----------- Epoch #3, LR: 4 ------------
Batch #500 Loss: 5.210991724967957
Batch #1000 Loss: 5.142019505500794
Batch #1500 Loss: 5.076135538101196
Batch #2000 Loss: 5.027217918395996
[92mTrain perplexity: 144.3

In [30]:
hidden = (torch.zeros(model.num_layers, model.batch_size, model.hidden_dim).to(device),
              torch.zeros(model.num_layers, model.batch_size, model.hidden_dim).to(device))

inputs, labels = train[0]
outputs, hidden = model(inputs, hidden)
loss = cross_entropy(outputs, labels.view(-1))

ValueError: Expected input batch_size (20) to match target batch_size (400).

In [12]:
.shape

torch.Size([400, 10003])

In [11]:
len(train.vocab)

10003