In [1]:
!pip install torch datasets



In [2]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import math
import time
from datasets import load_dataset
from io import open
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:1024'


In [4]:
# Hyperparameters
nhead = 4
ninp = 128
em_size = 200
nhid = 200
nlayers = 4
dropout = 0.2
bptt = 150
batch_size = 32
eval_batch_size = 16
learning_rate = 20
clip = 0.25
epochs = 40
torch.manual_seed(1332)
log_interval = 250
data_dir = './data/wikitext/'
models_dir = './models/'
out_dir = './output/'


In [5]:
if torch.cuda.is_available():
    cuda = torch.device('cuda:0')
    # cuda2 = torch.device('cuda:1')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
cpu = torch.device('cpu')

cuda, cpu

(device(type='cuda', index=0), device(type='cpu'))

In [6]:
## Taken from https://github.com/pytorch/examples/blob/main/word_language_model/data.py on 01/11/2023
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []
        self.counter = 0

    def add_word(self, word):
        if word not in self.idx2word:
            self.idx2word.append(word)
            self.word2idx[word] = self.counter
            self.counter += 1

    def __len__(self):
        return len(self.idx2word)

In [7]:
def tokenise_prompt(input, corpus):
    words = input.split() + ['<eos>']
    ids = torch.LongTensor(len(words))
    for i, word in enumerate(words):
        ids[i] = corpus.dictionary.word2idx[word]
    return ids

In [8]:
## Taken from https://github.com/pytorch/examples/blob/main/word_language_model/data.py on 01/11/2023
class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path, 'train.txt'))
        self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
        self.test = self.tokenize(os.path.join(path, 'test.txt'))

    def tokenize(self, path):
        assert os.path.exists(path)
        with open(path, 'r', encoding='utf-8') as f:
            tokens = 0
            for line in f:
                words = line.split() + ['<eos>']
                tokens += len(words)
                for word in words:
                    self.dictionary.add_word(word)

            ids = torch.LongTensor(tokens)
            token = 0
            with open(path, 'r', encoding='utf-8') as f:
                for line in f:
                    words = line.split() + ['<eos>']
                    for word in words:
                        ids[token] = self.dictionary.word2idx[word]
                        token += 1

            return ids.to(cpu)

In [9]:
if not os.path.exists(data_dir):
    # os.mkdir(data_dir)
    data = load_dataset('wikitext', 'wikitext-2-v1')
    data.save_to_disk(os.path.join(data_dir, 'wikitext-2'))
data = load_dataset('wikitext', 'wikitext-2-v1', data_dir=data_dir)



In [10]:
data

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [11]:
## Converting from HF dataset to plain text files can probably improve this
train_data = '\n'.join(data['train']['text'])
valid_data = '\n'.join(data['validation']['text'])
test_data = '\n'.join(data['test']['text'])
with open(os.path.join(data_dir, 'train.txt'), 'w') as f:
    f.write(train_data)
with open(os.path.join(data_dir, 'valid.txt'), 'w') as f:
    f.write(valid_data)
with open(os.path.join(data_dir, 'test.txt'), 'w') as f:
    f.write(test_data)

In [12]:
torch.cuda.empty_cache()

In [13]:
corpus = Corpus(data_dir)

In [14]:
def batchify(data, bsz):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch*bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.to(cuda)


In [15]:

train_data = batchify(corpus.train, batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [16]:
class LSTMModel(nn.Module):
    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(LSTMModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.lstm = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers
        self.ntokens = ntoken
        self.model_type = 'LSTM'

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        emb = self.drop(self.encoder(input))
        output, hidden = self.lstm(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output)
        decoded = decoded.view(-1, self.ntokens)
        return F.log_softmax(decoded, dim=1), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters())
        return (weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid))

In [17]:
class PoisitionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PoisitionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2)*(-math.log(10000.0)/d_model))
        pe[:, 0::2] = torch.sin(position*div_term)
        pe[:, 1::2] = torch.cos(position*div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [18]:
class TransformerModel(nn.Transformer):
    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__(d_model=ninp, nhead=nhead, dim_feedforward=nhid, num_decoder_layers=nlayers)
        # self.encoder = nn.Embedding(ntoken, ninp)
        self.src_mask = None
        self.pos_encoder = PoisitionalEncoding(ninp, dropout)

        self.input_emb = nn.Embedding(ntoken, ninp)
        self.decoder = nn.Linear(ninp, ntoken)
        self.ninp = ninp
        self.nhid = nhid
        self.nlayers = nlayers
        self.ntokens = ntoken
        self.model_type = 'Transformer'

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz) == 1).transpose(0, 1))
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.input_emb.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        src = self.input_emb(src)*math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.encoder(src, mask=self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)

In [19]:

ntokens = len(corpus.dictionary)
transformer_model = TransformerModel(ntoken=ntokens, ninp=ninp, nhead=nhead, nhid=nhid, nlayers=nlayers, dropout=dropout).to(cuda)
lstm_model = LSTMModel(ntoken=ntokens, ninp=ninp, nhid=nhid, nlayers=nlayers,dropout=dropout).to(cuda)

criterion = nn.NLLLoss()




In [20]:
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)


In [21]:
def get_batch(source, i):
    seq_len = min(bptt, len(source)-1-i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [22]:
def evaluate(model, data_source):
    model.eval()
    total_loss = 0
    model_type = model.model_type
    ntokens = len(corpus.dictionary)
    if model_type != 'Transformer':
        hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0)-1, bptt):
            data, targets = get_batch(data_source, i)
            if model_type == 'Transformer':
                output = model(data)
                output = output.view(-1, ntokens)
            else:
                output, hidden = model(data, hidden)
                hidden = repackage_hidden(hidden)
            total_loss += len(data)*criterion(output, targets).item()
    return total_loss/(len(data_source)-1)

In [23]:
def train(model, epoch, lr):
    model.train()
    total_loss = 0
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if model.model_type != 'Transformer':
        hidden = model.init_hidden(batch_size)
    for batch, i in enumerate(range(0, train_data.size(0)-1, bptt)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()
        if model.model_type == 'Transformer':
            output = model(data)
            output = output.view(-1, ntokens)
        else:
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
        loss = criterion(output, targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch%log_interval == 0 and batch > 0:
            cur_loss = total_loss/log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                        epoch, batch, len(train_data)//bptt, elapsed*1000/log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()



In [24]:
def export_onnx(path, model, batch_size, seq_len):
    model.eval()
    x = torch.rand(seq_len, batch_size).to(device)
    if model.model_type != "Transformer":
        hidden = model.init_hidden(batch_size)
        torch.onnx.export(model, (x, hidden), path)

In [25]:
def run_training(model, epochs, learning_rate):
    torch.cuda.empty_cache()
    lr = learning_rate
    best_val_loss = None
    model_type = model.model_type
    model_path = os.path.join(models_dir, (model_type + '.pt'))
    for epoch in range(1, epochs+1):
        torch.cuda.empty_cache()
        epoch_start_time = time.time()
        train(model, epoch, lr)
        val_loss = evaluate(model, val_data)
        print('-'*89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time()-epoch_start_time), val_loss, math.exp(val_loss)))
        print('-'*89)
        if not best_val_loss or val_loss < best_val_loss:
            with open(model_path, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            lr /= 4.0

    test_loss = evaluate(model, test_data)
    print('='*89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(test_loss, math.exp(test_loss)))
    print('='*89)
    # export_onnx(os.path.join(models_dir, model_type), model, batch_size=1, seq_len=bptt)


In [26]:
torch.cuda.empty_cache()
mem_rep_cuda = torch.cuda.memory_summary(device=cuda, abbreviated=False)
print(mem_rep_cuda)

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 107244 KiB | 107244 KiB | 112047 KiB |   4803 KiB |
|       from large pool | 104214 KiB | 104214 KiB | 104214 KiB |      0 KiB |
|       from small pool |   3030 KiB |   7703 KiB |   7833 KiB |   4803 KiB |
|---------------------------------------------------------------------------|
| Active memory         | 107244 KiB | 107244 KiB | 112047 KiB |   4803 KiB |
|       from large pool | 104214 KiB | 104214 KiB | 104214 KiB |      0 KiB |
|       from small pool |   3030 KiB |   7703 KiB |   7833 KiB |   4803 KiB |
|---------------------------------------------------------------

In [27]:
print(torch.cuda.memory_reserved(device=cuda))
print(torch.cuda.max_memory_reserved(device=cuda))

132120576
136314880


In [28]:
run_training(transformer_model, epochs, learning_rate)

| epoch   1 |   250/  440 batches | ms/batch 31.08 | loss 59.33 | ppl 58202891542373762657681408.00
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 14.33s | valid loss 28.51 | valid ppl 2404605230324.68
-----------------------------------------------------------------------------------------
| epoch   2 |   250/  440 batches | ms/batch 31.12 | loss 58.52 | ppl 25944479316779617092632576.00
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 14.30s | valid loss 50.11 | valid ppl 5779969503360030081024.00
-----------------------------------------------------------------------------------------
| epoch   3 |   250/  440 batches | ms/batch 31.07 | loss 11.14 | ppl 68597.66
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 14.35s | valid loss 11.79 | valid ppl 131612.68
-----------------------

In [29]:
run_training(lstm_model, epochs, learning_rate)

| epoch   1 |   250/  440 batches | ms/batch 57.45 | loss  7.68 | ppl  2164.50
-----------------------------------------------------------------------------------------
| end of epoch   1 | time: 25.86s | valid loss  7.27 | valid ppl  1434.01
-----------------------------------------------------------------------------------------
| epoch   2 |   250/  440 batches | ms/batch 56.15 | loss  7.22 | ppl  1360.10
-----------------------------------------------------------------------------------------
| end of epoch   2 | time: 25.51s | valid loss  7.12 | valid ppl  1234.18
-----------------------------------------------------------------------------------------
| epoch   3 |   250/  440 batches | ms/batch 56.14 | loss  7.04 | ppl  1146.76
-----------------------------------------------------------------------------------------
| end of epoch   3 | time: 25.53s | valid loss  6.36 | valid ppl   578.23
-----------------------------------------------------------------------------------------
|

In [30]:
input = tokenise_prompt('The meaning of life is', corpus).to(cuda)
input

tensor([  83,  427,   16, 1127,   26,    0], device='cuda:0')

In [31]:
input2 = torch.randint(ntokens, (1, 1), dtype=torch.long).to(cuda)
input2

tensor([[2308]], device='cuda:0')

In [32]:
def generate_text(model, prompt, output_file, corpus=corpus, temp=1, device=cuda):
    model.eval()
    ntokens = len(corpus.dictionary)
    if model.model_type != 'Transformer':
        hidden = model.init_hidden(1)
        # lstm_model.flatten_parameters()
    model.to(cuda)
    # input = tokenise_prompt(prompt, corpus).to(device)
    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(cuda)
    with open(output_file, 'w') as outf:
        with torch.no_grad():
            for i in range(1000):
                if model.model_type == 'Transformer':
                    output = model(input, False)
                    word_weights = output[-1].squeeze().div(temp).exp().cpu()
                    word_idx = torch.multinomial(word_weights, 1)[0]
                    word_tensor = torch.Tensor([[word_idx]]).long().to(device)
                    input = torch.cat([input, word_tensor], 0)
                else:
                    output, hidden = model(input, hidden)
                    word_weights = output.squeeze().div(temp).exp().cpu()
                    word_idx = torch.multinomial(word_weights, 1)[0]
                    input.fill_(word_idx)

                word = corpus.dictionary.idx2word[word_idx]

                outf.write(word + ('\n' if i % 20 == 19 else ' '))
                if i % 100 == 0:
                    print('| Generated {}/{} words'.format(i, 1000))
            print('Done')

In [33]:
prompt = "the best place to buy"

In [34]:
if transformer_model == None:
    transformer_model = torch.load(os.path.join(models_dir, 'Transformer.pt'), map_location=device)
if lstm_model == None:
    lstm_model = torch.load(os.path.join(models_dir, 'Transformer.pt'), map_location=device)

In [35]:
generate_text(transformer_model, prompt, './output/transformer_gen.txt')

| Generated 0/1000 words
| Generated 100/1000 words
| Generated 200/1000 words
| Generated 300/1000 words
| Generated 400/1000 words
| Generated 500/1000 words
| Generated 600/1000 words
| Generated 700/1000 words
| Generated 800/1000 words
| Generated 900/1000 words
Done


In [36]:
generate_text(lstm_model, prompt, './output/lstm_gen.txt')

| Generated 0/1000 words
| Generated 100/1000 words
| Generated 200/1000 words
| Generated 300/1000 words
| Generated 400/1000 words
| Generated 500/1000 words
| Generated 600/1000 words
| Generated 700/1000 words
| Generated 800/1000 words
| Generated 900/1000 words
Done
