<a href="https://colab.research.google.com/github/mitkrieg/dl-assignment-2/blob/main/assignment2_practical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
!pip install wandb
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mmitkrieger[0m ([33mmitkrieger-cornell-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [28]:
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import math
from nltk.tokenize import RegexpTokenizer
import wandb

# Report which accelerators PyTorch can see.
print("------ ACCELERATION INFO -----")
print('CUDA GPU Available:', torch.cuda.is_available())
print('MPS GPU Available:', torch.backends.mps.is_available())

# Select the device used by the rest of the notebook: CUDA when present,
# otherwise CPU. MPS availability is only reported above; it is never selected.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type != 'cuda':
    print('Using CPU')
else:
    print('GPU Name:', torch.cuda.get_device_name(0))
    print('GPU Count:', torch.cuda.device_count())
    print('GPU Memory Allocated:', torch.cuda.memory_allocated(0))
    print('GPU Memory Cached:', torch.cuda.memory_reserved(0))

------ ACCELERATION INFO -----
CUDA GPU Available: True
MPS GPU Available: False
GPU Name: Tesla T4
GPU Count: 1
GPU Memory Allocated: 247080448
GPU Memory Cached: 339738624


## Define PTBText Dataset

In [29]:
class PTBText(Dataset):
    """Text file exposed as fixed-length next-token prediction samples.

    Every line of ``filename`` is tokenized and terminated with ``<eos>``.
    Sliding windows of ``sequence_len`` tokens become the inputs; the labels
    are the same windows shifted one token to the right. Lines holding at most
    ``sequence_len`` tokens (``<eos>`` included) produce no samples.

    Args:
        filename: Path of the text file, one sentence per line.
        sequence_len: Number of tokens in each input window.
        prior_vocab: Optional token -> id mapping to reuse. When it is not
            ``None`` it is used as-is and never extended; words missing from
            it encode to the ``<oov>`` id.
        device: Device that ``__getitem__`` moves the sample tensors to.
    """

    # Ids of the padding and out-of-vocabulary tokens in the default vocabulary.
    PAD_ID = 0
    OOV_ID = 1

    def __init__(self, filename, sequence_len, prior_vocab=None,device=torch.device('cpu')) -> None:
        super().__init__()
        self.tokenized_text = []
        self.data = []
        self.labels = []
        self.device = device
        self.tokenizer = RegexpTokenizer(r'<unk>|<pad>|<oov>|<sos>|<eos>|\w+').tokenize
        self.seq_len = sequence_len
        self.max_len = 0

        # A single `is None` test decides both "which vocab to use" and
        # "whether to grow it", so the two decisions can never disagree.
        build_vocab = prior_vocab is None
        if build_vocab:
            self.vocab = {'<pad>':0,'<oov>':1,'<sos>':2,'<eos>':3,'<unk>':4}
        else:
            self.vocab = prior_vocab

        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                tokens = self.tokenizer(line)

                # Only build a new vocab if no prior vocab was given.
                if build_vocab:
                    for word in tokens:
                        if word not in self.vocab:
                            self.vocab[word] = len(self.vocab)

                self.tokenized_text.append(tokens + ['<eos>'])
                # One slot more than the stored line (tokens + <eos>).
                # NOTE(review): presumably reserved for a <sos> token that is
                # never added -- confirm before changing.
                self.max_len = max(self.max_len, len(tokens) + 2)

        # Whole lines, padded to the longest line length.
        self.encoded_text = [self.encode_text(x, pad=True) for x in self.tokenized_text]

        # Build (window, window shifted by one) training pairs.
        for tokens in self.tokenized_text:
            for i in range(len(tokens) - self.seq_len):
                self.data.append(tokens[i:i+self.seq_len])
                self.labels.append(tokens[i+1:i+self.seq_len+1])
        self.encoded_labels = [self.encode_text(x) for x in self.labels]
        self.encoded_data = [self.encode_text(x) for x in self.data]

    def encode_text(self, tokens: list[str], pad=False):
        """Map tokens to vocabulary ids, padding short sequences with PAD_ID.

        Unknown tokens map to OOV_ID. With ``pad=True`` a sequence shorter
        than ``self.max_len`` is padded to ``self.max_len``; otherwise a
        sequence shorter than ``self.seq_len`` is padded to ``self.seq_len``.
        Sequences are never truncated.
        """
        encoded = [self.vocab.get(word, self.OOV_ID) for word in tokens]

        if pad and len(encoded) < self.max_len:
            encoded.extend([self.PAD_ID] * (self.max_len - len(encoded)))
        elif len(encoded) < self.seq_len:
            encoded.extend([self.PAD_ID] * (self.seq_len - len(encoded)))

        return encoded

    def resequence_data(self, seqence_len):
        """Rebuild the samples with a new window length.

        Unlike ``__init__``, each label here is the single token that follows
        its window, so ``__getitem__`` afterwards returns a scalar label
        tensor instead of a shifted sequence.
        """
        self.seq_len = seqence_len
        self.data = []
        self.labels = []
        for tokens in self.tokenized_text:
            for i in range(len(tokens) - self.seq_len):
                self.data.append(tokens[i:i+self.seq_len])
                self.labels.append(tokens[i+self.seq_len])

        self.encoded_labels = [self.vocab.get(x, self.OOV_ID) for x in self.labels]
        self.encoded_data = [self.encode_text(x) for x in self.data]

    def __len__(self):
        """Number of samples (windows) in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(encoded_input, encoded_label)`` tensors on ``self.device``."""
        return torch.tensor(self.encoded_data[index]).to(self.device), torch.tensor(self.encoded_labels[index]).to(self.device)

    def get_tokens(self, index):
        """Return the tokens (with trailing ``<eos>``) of line ``index``."""
        return self.tokenized_text[index]

    def get_encoded_tokens(self, index):
        """Return the padded id sequence of line ``index``."""
        return self.encoded_text[index]

    def get_sequence(self, index):
        """Return the raw ``(input_tokens, label)`` pair of sample ``index``."""
        return self.data[index], self.labels[index]

    def get_encoded_sequence(self, index):
        """Alias of ``__getitem__``."""
        return self.__getitem__(index)

### Load Data & Create DataLoaders

In [75]:
# Window length (in tokens) shared by all three splits.
seq_len = 20

# Validation and test reuse the training vocabulary so token ids agree across splits.
train = PTBText('/content/ptb.train.txt', seq_len)
val, test = (
    PTBText(path, seq_len, prior_vocab=train.vocab)
    for path in ('/content/ptb.valid.txt', '/content/ptb.test.txt')
)

batch_size = 128
# One seeded generator drives the shuffling of all three loaders.
gen = torch.Generator().manual_seed(123)

dataloaders = {
    split: DataLoader(dataset, batch_size=batch_size, shuffle=True, generator=gen)
    for split, dataset in (('train', train), ('val', val), ('test', test))
}
train_loader = dataloaders['train']
val_loader = dataloaders['val']
test_loader = dataloaders['test']

# Quick look at the vocabulary size and one sample, raw and encoded.
print('Training vocab size:', len(train.vocab))
print('Training sample raw: ', train.get_sequence(100))
print('Training sample encoded:',train[100])

Training vocab size: 9648
Training sample raw:  (['of', 'asbestos', 'including', '<unk>', 'more', '<unk>', 'than', 'the', 'common', 'kind', 'of', 'asbestos', '<unk>', 'found', 'in', 'most', 'schools', 'and', 'other', 'buildings'], ['asbestos', 'including', '<unk>', 'more', '<unk>', 'than', 'the', 'common', 'kind', 'of', 'asbestos', '<unk>', 'found', 'in', 'most', 'schools', 'and', 'other', 'buildings', 'dr'])
Training sample encoded: (tensor([ 47,  67, 209,   4,  85,   4,  86,  37, 246, 247,  47,  67,   4, 248,
        115, 249, 178,  54, 250, 251]), tensor([ 67, 209,   4,  85,   4,  86,  37, 246, 247,  47,  67,   4, 248, 115,
        249, 178,  54, 250, 251, 172]))


## Define LSTM Model

In [82]:
class ZarembaRNN(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_units=200, num_lstm_layers=2, dropout_rate= 0) -> None:
        super().__init__()
        self.hidden_units = hidden_units
        self.vocab_size = vocab_size
        self.embed_size = embedding_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        nn.init.xavier_uniform_(self.embedding.weight, generator=torch.Generator().manual_seed(1))
        self.lstm = nn.LSTM(embedding_size,hidden_units, num_lstm_layers, batch_first=True)
        nn.init.xavier_uniform_(self.lstm.weight_ih_l0, generator=torch.Generator().manual_seed(2))
        nn.init.xavier_uniform_(self.lstm.weight_hh_l0, generator=torch.Generator().manual_seed(3))
        self.fc = nn.Linear(hidden_units, vocab_size)
        nn.init.xavier_uniform_(self.fc.weight, generator=torch.Generator().manual_seed(4))
        self.dropout_rate = dropout_rate
        if self.dropout_rate > 0:
            self.dropout = nn.Dropout(self.dropout_rate)

    def forward(self, x, hidden):

        hidden = (hidden[0].contiguous(), hidden[1].contiguous())
        x = self.embedding(x)
        x, hidden = self.lstm(x, hidden)
        if self.dropout_rate > 0:
            x = self.dropout(x)
            x = self.fc(x)
        else:
            x = self.fc(x)


        return x, hidden



In [83]:
def train_epoch(network, dataloader, loss_fn, optimizer, device, epoch, verbosity: int):
    """Train ``network`` for one pass over ``dataloader``.

    The LSTM state is reset to zeros for every batch. Its shape is taken from
    ``network.lstm`` and the actual batch size, so the function does not rely
    on notebook globals and handles a short final batch.

    Args:
        network: Model called as ``network(inputs, hidden)`` and returning
            ``(logits, hidden)``.
        dataloader: Iterable of ``(inputs, labels)`` batches.
        loss_fn: Loss applied to flattened logits and flattened labels.
        optimizer: Optimizer updating ``network``'s parameters.
        device: Device the batches and the state are placed on.
        epoch: Current epoch index (unused; kept for interface compatibility).
        verbosity: Print the running mean loss every ``verbosity`` batches;
            values <= 0 disable printing.

    Returns:
        Mean loss over all batches of the epoch (``nan`` for an empty loader).
    """
    network.train()

    # Fall back to the historical 2 x 200 state when the model exposes no `lstm`.
    lstm = getattr(network, 'lstm', None)
    num_layers = lstm.num_layers if lstm is not None else 2
    hidden_size = lstm.hidden_size if lstm is not None else 200

    window_loss = 0.0
    epoch_loss = 0.0
    num_batches = 0

    for i, (inputs, labels) in enumerate(dataloader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Fresh zero state for each batch, sized to this batch.
        state_shape = (num_layers, inputs.size(0), hidden_size)
        hidden = (torch.zeros(state_shape, device=device), torch.zeros(state_shape, device=device))

        optimizer.zero_grad()
        outputs, _ = network(inputs, hidden)

        # Flatten to (batch * seq, vocab) vs (batch * seq,).
        loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))
        loss.backward()

        # Gradient clipping.
        torch.nn.utils.clip_grad_norm_(network.parameters(), max_norm=2)

        optimizer.step()

        loss_value = loss.item()
        epoch_loss += loss_value
        window_loss += loss_value
        num_batches += 1
        if verbosity > 0 and i % verbosity == verbosity - 1:
            print(f'Batch #{i + 1} Loss: {window_loss / verbosity}')
            window_loss = 0.0

    return epoch_loss / num_batches if num_batches else float('nan')

def perplexity(loss, batches):
    """Return ``exp(loss / batches)``: the perplexity for a loss summed over ``batches`` batches."""
    mean_loss = loss / batches
    return math.exp(mean_loss)

def eval_network(title, network, dataloader, loss_fn, epoch, device=None):
    """Evaluate ``network`` on ``dataloader`` and log loss and perplexity to wandb.

    Args:
        title: Prefix of the logged metric names (``{title}-loss``,
            ``{title}-perplexity``) and of the printed summary.
        network: Model called as ``network(inputs, hidden)`` and returning
            ``(logits, hidden)``.
        dataloader: Sized iterable of ``(inputs, labels)`` batches.
        loss_fn: Loss applied to flattened logits and flattened labels.
        epoch: Step value passed to ``wandb.log``.
        device: Device to evaluate on. Defaults to the device of the
            network's parameters.

    Returns:
        Perplexity, i.e. ``exp`` of the mean per-batch loss.
    """
    network.eval()
    if device is None:
        device = next(network.parameters()).device

    # Fall back to the historical 2 x 200 state when the model exposes no `lstm`.
    lstm = getattr(network, 'lstm', None)
    num_layers = lstm.num_layers if lstm is not None else 2
    hidden_size = lstm.hidden_size if lstm is not None else 200

    # Accumulate as a Python float so plain numbers, not tensors, reach wandb.
    total_loss = 0.0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Fresh zero state sized to this batch (the last one may be short).
            state_shape = (num_layers, inputs.size(0), hidden_size)
            hidden = (torch.zeros(state_shape, device=device), torch.zeros(state_shape, device=device))

            outputs, _ = network(inputs, hidden)

            batch_loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))
            total_loss += batch_loss.item()

    num_batches = len(dataloader)
    mean_loss = total_loss / num_batches
    perp = perplexity(total_loss, num_batches)

    wandb.log({
        f'{title}-loss': mean_loss,
        f'{title}-perplexity': perp
    }, step=epoch)

    print(f'\033[92m{title} perplexity: {perp:.6f} ||| loss {mean_loss:.6f}\033[0m')

    return perp

def train_network(network, dataloaders, loss_fn, optimizer, schedule, device, epochs: int, verbosity: int):
    """Train for ``epochs`` epochs, evaluating on all three splits after each.

    Args:
        network: Model to train.
        dataloaders: Mapping with ``'train'``, ``'val'`` and ``'test'`` loaders.
        loss_fn: Loss used for both training and evaluation.
        optimizer: Optimizer updating ``network``'s parameters.
        schedule: Learning-rate scheduler, stepped once per epoch.
        device: Device passed to ``train_epoch``.
        epochs: Number of epochs to run.
        verbosity: Batch interval for the training-loss printout.

    Returns:
        Dict with the last epoch's ``'train'``, ``'val'`` and ``'test'``
        perplexities; every value is ``None`` when ``epochs`` is 0.
    """
    # Pre-bind the results so a zero-epoch call returns instead of raising.
    train_perplexity = val_perplexity = test_perplexity = None

    for epoch in range(epochs):
        lr = optimizer.param_groups[0]['lr']

        print(f'----------- Epoch #{epoch + 1}, LR: {lr} ------------')
        train_epoch(network, dataloaders['train'], loss_fn, optimizer, device, epoch, verbosity)
        # Each evaluation is a full pass over its split, training set included.
        train_perplexity = eval_network('Train', network, dataloaders['train'], loss_fn, epoch)
        val_perplexity = eval_network('Validation', network, dataloaders['val'], loss_fn, epoch)
        test_perplexity = eval_network('Test', network, dataloaders['test'], loss_fn, epoch)
        print('------------------------------------\n')

        schedule.step()
    print('----------- Train Complete! ------------')
    return {
        'train':train_perplexity,
        'val':val_perplexity,
        'test':test_perplexity
    }


In [92]:
# Model / optimisation hyperparameters for the SGD run.
embedding_size = 200
dropout_rate = 0
lr_start = 4
momentum=0.5

# Learning-rate schedule: constant for the first `decay_start` epochs, then
# multiplied by `learning_rate_decay` once per epoch.
decay_start = 4
learning_rate_decay = 0.5

def lr_lambda(epoch):
    """Multiplicative LR factor for 0-based ``epoch``: 1 before ``decay_start``, then decaying geometrically."""
    if epoch >= decay_start:
        return learning_rate_decay ** (epoch - (decay_start-1))
    return 1

# Pass the configured dropout rate to the model so the value recorded in the
# wandb run config is the one the network actually uses.
model = ZarembaRNN(len(train.vocab), embedding_size, dropout_rate=dropout_rate)
model = model.to(device)
cross_entropy = nn.CrossEntropyLoss()
# SGD with momentum; `schedule` scales the learning rate through lr_lambda.
sgd = optim.SGD(model.parameters(), lr=lr_start, momentum=momentum)
schedule = optim.lr_scheduler.LambdaLR(sgd, lr_lambda)
# device = torch.device('cpu')

In [93]:
# The run is used as a context manager so it is always closed, including when
# training is interrupted or raises.
with wandb.init(project="dl-assignment2-redux", config={
    'batch_size':batch_size,
    'embedding_size':embedding_size,
    # Read the architecture from the model so the logged config cannot drift from it.
    'hidden_units':model.hidden_units,
    'num_lstm_layers':model.lstm.num_layers,
    'dropout_rate':dropout_rate,
    'epoch_decay_at':decay_start,
    'learning_rate_decay':learning_rate_decay,
    'learning_rate_start':lr_start,
    'momentum':momentum,
    'optimizer':'SGD',
    'seq_len':seq_len
}) as run:
    results = train_network(model, dataloaders, cross_entropy, sgd, schedule, device, 14, 500)

VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Test-loss,▁█
Test-perplexity,▁█
Train-loss,█▁
Train-perplexity,█▁
Validation-loss,▁█
Validation-perplexity,▁█

0,1
Test-loss,5.29157
Test-perplexity,198.65473
Train-loss,3.91129
Train-perplexity,49.96361
Validation-loss,5.34337
Validation-perplexity,209.21648


----------- Epoch #1, LR: 4 ------------
Batch #500 Loss: 6.585322054862976
Batch #1000 Loss: 5.951709095954895
Batch #1500 Loss: 5.580817513465881
[92mTrain perplexity: 214.617297 ||| loss 5.368856[0m
[92mValidation perplexity: 254.992106 ||| loss 5.541233[0m
[92mTest perplexity: 246.627186 ||| loss 5.507878[0m
------------------------------------

----------- Epoch #2, LR: 4 ------------
Batch #500 Loss: 5.205941417694092
Batch #1000 Loss: 5.029384421348571
Batch #1500 Loss: 4.871192534446716
[92mTrain perplexity: 112.797862 ||| loss 4.725597[0m
[92mValidation perplexity: 191.102786 ||| loss 5.252811[0m
[92mTest perplexity: 184.418023 ||| loss 5.217205[0m
------------------------------------

----------- Epoch #3, LR: 4 ------------
Batch #500 Loss: 4.658250547409057
Batch #1000 Loss: 4.544675675392151
Batch #1500 Loss: 4.438771874427795
[92mTrain perplexity: 75.808061 ||| loss 4.328205[0m
[92mValidation perplexity: 192.898587 ||| loss 5.262165[0m
[92mTest perplexity

KeyboardInterrupt: 

In [74]:
# Explicitly close the current wandb run (e.g. after a training cell was
# interrupted before the run was finished).
run.finish()

VBox(children=(Label(value='0.014 MB of 0.014 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Test-loss,█▃▂▁▁▂▃
Test-perplexity,█▃▁▁▁▂▃
Train-loss,█▆▅▄▃▂▁
Train-perplexity,█▅▃▃▂▁▁
Validation-loss,█▃▂▁▁▃▄
Validation-perplexity,█▃▁▁▁▂▃

0,1
Test-loss,5.31243
Test-perplexity,202.84242
Train-loss,3.79466
Train-perplexity,44.46323
Validation-loss,5.35984
Validation-perplexity,212.69039


In [None]:
# Peek at the first training batch to confirm the shape of the input tensor.
for data in train_loader:
    inputs, labels = data
    print(inputs.size())
    break

torch.Size([20, 20])


In [None]:
def lr_lambda(epoch):
    """Multiplicative LR factor: 1 for epochs 0-6, then halved once per epoch.

    Note: this redefinition shadows the earlier ``lr_lambda`` in the notebook.
    """
    return 1 if epoch < 7 else 0.5 ** (epoch - 6)

# Experimental setup: a model with embedding size 10 (all other ZarembaRNN
# arguments at their defaults), trained with Adam instead of SGD.
# This rebinds the `model`, `cross_entropy` and `schedule` globals.
model = ZarembaRNN(len(train.vocab), 10)
model = model.to(device)
cross_entropy = nn.CrossEntropyLoss()
adam = optim.Adam(model.parameters(), lr=1e-2)
# The schedule scales Adam's learning rate through the lr_lambda defined just above.
schedule = optim.lr_scheduler.LambdaLR(adam, lr_lambda)
# device = torch.device('cpu')


In [None]:
# Sanity check: push a single training batch through the model and print its loss.
for data in train_loader:
    inputs, labels = (tensor.to(device) for tensor in data)

    # Zero initial LSTM state: (layers=2, batch_size, hidden=200).
    h0 = torch.zeros(2, batch_size, 200).to(device)
    c0 = torch.zeros(2, batch_size, 200).to(device)
    outputs, hidden = model(inputs, (h0, c0))
    hidden = tuple(state.detach() for state in hidden)

    # Flatten to (batch * seq, vocab) logits against (batch * seq,) targets.
    output_reshaped = outputs.view(-1, outputs.size(-1))
    labels_reshaped = labels.view(-1)

    criterion = nn.CrossEntropyLoss()
    loss = criterion(output_reshaped, labels_reshaped)
    print(loss)
    break

tensor(9.1814, device='cuda:0', grad_fn=<NllLossBackward0>)
